auto &CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()];
if (!CudaTC) {
CudaTC = llvm::make_unique<toolchains::CudaToolChain>(
- *this, CudaTriple, *HostTC, C.getInputArgs());
+ *this, CudaTriple, *HostTC, C.getInputArgs(), Action::OFK_Cuda);
}
C.addOffloadDeviceToolChain(CudaTC.get(), Action::OFK_Cuda);
}
ToolChains[TT.str() + "/" + HostTC->getTriple().str()];
if (!CudaTC)
CudaTC = llvm::make_unique<toolchains::CudaToolChain>(
- *this, TT, *HostTC, C.getInputArgs());
+ *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP);
TC = CudaTC.get();
} else
TC = &getToolChain(C.getInputArgs(), TT);
for (unsigned I = 0; I < Outputs.size(); ++I) {
if (I)
UB += ',';
- UB += Outputs[I].getFilename();
+ SmallString<256> OutputFileName(Outputs[I].getFilename());
+ // Change extension of target files for OpenMP offloading
+ // to NVIDIA GPUs.
+ if (DepInfo[I].DependentToolChain->getTriple().isNVPTX() &&
+ JA.isOffloading(Action::OFK_OpenMP))
+ llvm::sys::path::replace_extension(OutputFileName, "cubin");
+ UB += OutputFileName;
}
CmdArgs.push_back(TCArgs.MakeArgString(UB));
CmdArgs.push_back("-unbundle");
break;
}
}
+
+/// Add OpenMP linker script arguments at the end of the argument list so that
+/// the fat binary is built by embedding each of the device images into the
+/// host. The linker script also defines a few symbols required by the code
+/// generation so that the images can be easily retrieved at runtime by the
+/// offloading library. This should be used only in tool chains that support
+/// linker scripts.
+void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args, ArgStringList &CmdArgs,
+ const JobAction &JA) {
+
+ // If this is not an OpenMP host toolchain, we don't need to do anything.
+ if (!JA.isHostOffloading(Action::OFK_OpenMP))
+ return;
+
+ // Create temporary linker script. Keep it if save-temps is enabled.
+ const char *LKS;
+ SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+ if (C.getDriver().isSaveTempsEnabled()) {
+ llvm::sys::path::replace_extension(Name, "lk");
+ LKS = C.getArgs().MakeArgString(Name.c_str());
+ } else {
+ llvm::sys::path::replace_extension(Name, "");
+ Name = C.getDriver().GetTemporaryPath(Name, "lk");
+ LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+ }
+
+ // Add linker script option to the command.
+ CmdArgs.push_back("-T");
+ CmdArgs.push_back(LKS);
+
+ // Create a buffer to write the contents of the linker script.
+ std::string LksBuffer;
+ llvm::raw_string_ostream LksStream(LksBuffer);
+
+ // Get the OpenMP offload tool chains so that we can extract the triple
+ // associated with each device input.
+ auto OpenMPToolChains = C.getOffloadToolChains<Action::OFK_OpenMP>();
+ assert(OpenMPToolChains.first != OpenMPToolChains.second &&
+ "No OpenMP toolchains??");
+
+ // Track the input file name and device triple in order to build the script,
+ // inserting binaries in the designated sections.
+ SmallVector<std::pair<std::string, const char *>, 8> InputBinaryInfo;
+
+ // Add commands to embed target binaries. We ensure that each section and
+ // image is 16-byte aligned. This is not mandatory, but increases the
+ // likelihood of data to be aligned with a cache block in several main host
+ // machines.
+ LksStream << "/*\n";
+ LksStream << " OpenMP Offload Linker Script\n";
+ LksStream << " *** Automatically generated by Clang ***\n";
+ LksStream << "*/\n";
+ LksStream << "TARGET(binary)\n";
+ auto DTC = OpenMPToolChains.first;
+ for (auto &II : Inputs) {
+ const Action *A = II.getAction();
+ // Is this a device linking action?
+ if (A && isa<LinkJobAction>(A) &&
+ A->isDeviceOffloading(Action::OFK_OpenMP)) {
+ assert(DTC != OpenMPToolChains.second &&
+ "More device inputs than device toolchains??");
+ InputBinaryInfo.push_back(std::make_pair(
+ DTC->second->getTriple().normalize(), II.getFilename()));
+ ++DTC;
+ LksStream << "INPUT(" << II.getFilename() << ")\n";
+ }
+ }
+
+ assert(DTC == OpenMPToolChains.second &&
+ "Less device inputs than device toolchains??");
+
+ LksStream << "SECTIONS\n";
+ LksStream << "{\n";
+
+ // Put each target binary into a separate section.
+ for (const auto &BI : InputBinaryInfo) {
+ LksStream << " .omp_offloading." << BI.first << " :\n";
+ LksStream << " ALIGN(0x10)\n";
+ LksStream << " {\n";
+ LksStream << " PROVIDE_HIDDEN(.omp_offloading.img_start." << BI.first
+ << " = .);\n";
+ LksStream << " " << BI.second << "\n";
+ LksStream << " PROVIDE_HIDDEN(.omp_offloading.img_end." << BI.first
+ << " = .);\n";
+ LksStream << " }\n";
+ }
+
+ // Add commands to define host entries begin and end. We use 1-byte subalign
+ // so that the linker does not add any padding and the elements in this
+ // section form an array.
+ LksStream << " .omp_offloading.entries :\n";
+ LksStream << " ALIGN(0x10)\n";
+ LksStream << " SUBALIGN(0x01)\n";
+ LksStream << " {\n";
+ LksStream << " PROVIDE_HIDDEN(.omp_offloading.entries_begin = .);\n";
+ LksStream << " *(.omp_offloading.entries)\n";
+ LksStream << " PROVIDE_HIDDEN(.omp_offloading.entries_end = .);\n";
+ LksStream << " }\n";
+ LksStream << "}\n";
+ LksStream << "INSERT BEFORE .data\n";
+ LksStream.flush();
+
+ // Dump the contents of the linker script if the user requested that. We
+ // support this option to enable testing of behavior with -###.
+ if (C.getArgs().hasArg(options::OPT_fopenmp_dump_offload_linker_script))
+ llvm::errs() << LksBuffer;
+
+ // If this is a dry run, do not create the linker script file.
+ if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+ return;
+
+ // Open script file and write the contents.
+ std::error_code EC;
+ llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+ if (EC) {
+ C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+ return;
+ }
+
+ Lksf << LksBuffer;
+}
llvm::opt::ArgStringList &CmdArgs,
const llvm::opt::ArgList &Args);
+void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs,
+ const JobAction &JA);
+
const char *SplitDebugName(const llvm::opt::ArgList &Args,
const InputInfo &Input);
#include "Cuda.h"
#include "InputInfo.h"
+#include "CommonArgs.h"
#include "clang/Basic/Cuda.h"
+#include "clang/Config/config.h"
#include "clang/Basic/VirtualFileSystem.h"
#include "clang/Driver/Compilation.h"
#include "clang/Driver/Driver.h"
CmdArgs.push_back("--gpu-name");
CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
CmdArgs.push_back("--output-file");
- CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+ SmallString<256> OutputFileName(Output.getFilename());
+ if (JA.isOffloading(Action::OFK_OpenMP))
+ llvm::sys::path::replace_extension(OutputFileName, "cubin");
+ CmdArgs.push_back(Args.MakeArgString(OutputFileName));
for (const auto& II : Inputs)
CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
}
+void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args,
+ const char *LinkingOutput) const {
+ const auto &TC =
+ static_cast<const toolchains::CudaToolChain &>(getToolChain());
+ assert(TC.getTriple().isNVPTX() && "Wrong platform");
+
+ ArgStringList CmdArgs;
+
+ // OpenMP uses nvlink to link cubin files. The result will be embedded in the
+ // host binary by the host linker.
+ assert(!JA.isHostOffloading(Action::OFK_OpenMP) &&
+ "CUDA toolchain not expected for an OpenMP host device.");
+
+ if (Output.isFilename()) {
+ CmdArgs.push_back("-o");
+ CmdArgs.push_back(Output.getFilename());
+ } else
+ assert(Output.isNothing() && "Invalid output.");
+ if (Args.hasArg(options::OPT_g_Flag))
+ CmdArgs.push_back("-g");
+
+ if (Args.hasArg(options::OPT_v))
+ CmdArgs.push_back("-v");
+
+ StringRef GPUArch =
+ Args.getLastArgValue(options::OPT_march_EQ);
+ assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");
+
+ CmdArgs.push_back("-arch");
+ CmdArgs.push_back(Args.MakeArgString(GPUArch));
+
+ // Add paths specified in LIBRARY_PATH environment variable as -L options.
+ addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+
+ // Add paths for the default clang library path.
+ SmallString<256> DefaultLibPath =
+ llvm::sys::path::parent_path(TC.getDriver().Dir);
+ llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
+ CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
+
+ // Add linking against library implementing OpenMP calls on NVPTX target.
+ CmdArgs.push_back("-lomptarget-nvptx");
+
+ for (const auto &II : Inputs) {
+ if (II.getType() == types::TY_LLVM_IR ||
+ II.getType() == types::TY_LTO_IR ||
+ II.getType() == types::TY_LTO_BC ||
+ II.getType() == types::TY_LLVM_BC) {
+ C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+ << getToolChain().getTripleString();
+ continue;
+ }
+
+ // Currently, we only pass the input files to the linker, we do not pass
+ // any libraries that may be valid only for the host.
+ if (!II.isFilename())
+ continue;
+
+ SmallString<256> Name = llvm::sys::path::filename(II.getFilename());
+ llvm::sys::path::replace_extension(Name, "cubin");
+
+ const char *CubinF =
+ C.addTempFile(C.getArgs().MakeArgString(Name));
+
+ CmdArgs.push_back(CubinF);
+ }
+
+ AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
+
+ const char *Exec =
+ Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
+ C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
/// which isn't properly a linker but nonetheless performs the step of stitching
/// together object files from the assembler into a single blob.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
- const ToolChain &HostTC, const ArgList &Args)
+ const ToolChain &HostTC, const ArgList &Args,
+ const Action::OffloadKind OK)
: ToolChain(D, Triple, Args), HostTC(HostTC),
- CudaInstallation(D, HostTC.getTriple(), Args) {
+ CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
if (CudaInstallation.isValid())
getProgramPaths().push_back(CudaInstallation.getBinPath());
}
}
Tool *CudaToolChain::buildLinker() const {
+ if (OK == Action::OFK_OpenMP)
+ return new tools::NVPTX::OpenMPLinker(*this);
return new tools::NVPTX::Linker(*this);
}
const char *LinkingOutput) const override;
};
+class LLVM_LIBRARY_VISIBILITY OpenMPLinker : public Tool {
+ public:
+ OpenMPLinker(const ToolChain &TC)
+ : Tool("NVPTX::OpenMPLinker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+ "--options-file") {}
+
+ bool hasIntegratedCPP() const override { return false; }
+
+ void ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &TCArgs,
+ const char *LinkingOutput) const override;
+};
+
} // end namespace NVPTX
} // end namespace tools
class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
public:
CudaToolChain(const Driver &D, const llvm::Triple &Triple,
- const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+ const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+ const Action::OffloadKind OK);
virtual const llvm::Triple *getAuxTriple() const override {
return &HostTC.getTriple();
protected:
Tool *buildAssembler() const override; // ptxas
Tool *buildLinker() const override; // fatbinary (ok, not really a linker)
+
+private:
+ const Action::OffloadKind OK;
};
} // end namespace toolchains
// The types are (hopefully) good enough.
}
-/// Add OpenMP linker script arguments at the end of the argument list so that
-/// the fat binary is built by embedding each of the device images into the
-/// host. The linker script also defines a few symbols required by the code
-/// generation so that the images can be easily retrieved at runtime by the
-/// offloading library. This should be used only in tool chains that support
-/// linker scripts.
-static void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
- const InputInfo &Output,
- const InputInfoList &Inputs,
- const ArgList &Args, ArgStringList &CmdArgs,
- const JobAction &JA) {
-
- // If this is not an OpenMP host toolchain, we don't need to do anything.
- if (!JA.isHostOffloading(Action::OFK_OpenMP))
- return;
-
- // Create temporary linker script. Keep it if save-temps is enabled.
- const char *LKS;
- SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
- if (C.getDriver().isSaveTempsEnabled()) {
- llvm::sys::path::replace_extension(Name, "lk");
- LKS = C.getArgs().MakeArgString(Name.c_str());
- } else {
- llvm::sys::path::replace_extension(Name, "");
- Name = C.getDriver().GetTemporaryPath(Name, "lk");
- LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
- }
-
- // Add linker script option to the command.
- CmdArgs.push_back("-T");
- CmdArgs.push_back(LKS);
-
- // Create a buffer to write the contents of the linker script.
- std::string LksBuffer;
- llvm::raw_string_ostream LksStream(LksBuffer);
-
- // Get the OpenMP offload tool chains so that we can extract the triple
- // associated with each device input.
- auto OpenMPToolChains = C.getOffloadToolChains<Action::OFK_OpenMP>();
- assert(OpenMPToolChains.first != OpenMPToolChains.second &&
- "No OpenMP toolchains??");
-
- // Track the input file name and device triple in order to build the script,
- // inserting binaries in the designated sections.
- SmallVector<std::pair<std::string, const char *>, 8> InputBinaryInfo;
-
- // Add commands to embed target binaries. We ensure that each section and
- // image is 16-byte aligned. This is not mandatory, but increases the
- // likelihood of data to be aligned with a cache block in several main host
- // machines.
- LksStream << "/*\n";
- LksStream << " OpenMP Offload Linker Script\n";
- LksStream << " *** Automatically generated by Clang ***\n";
- LksStream << "*/\n";
- LksStream << "TARGET(binary)\n";
- auto DTC = OpenMPToolChains.first;
- for (auto &II : Inputs) {
- const Action *A = II.getAction();
- // Is this a device linking action?
- if (A && isa<LinkJobAction>(A) &&
- A->isDeviceOffloading(Action::OFK_OpenMP)) {
- assert(DTC != OpenMPToolChains.second &&
- "More device inputs than device toolchains??");
- InputBinaryInfo.push_back(std::make_pair(
- DTC->second->getTriple().normalize(), II.getFilename()));
- ++DTC;
- LksStream << "INPUT(" << II.getFilename() << ")\n";
- }
- }
-
- assert(DTC == OpenMPToolChains.second &&
- "Less device inputs than device toolchains??");
-
- LksStream << "SECTIONS\n";
- LksStream << "{\n";
-
- // Put each target binary into a separate section.
- for (const auto &BI : InputBinaryInfo) {
- LksStream << " .omp_offloading." << BI.first << " :\n";
- LksStream << " ALIGN(0x10)\n";
- LksStream << " {\n";
- LksStream << " PROVIDE_HIDDEN(.omp_offloading.img_start." << BI.first
- << " = .);\n";
- LksStream << " " << BI.second << "\n";
- LksStream << " PROVIDE_HIDDEN(.omp_offloading.img_end." << BI.first
- << " = .);\n";
- LksStream << " }\n";
- }
-
- // Add commands to define host entries begin and end. We use 1-byte subalign
- // so that the linker does not add any padding and the elements in this
- // section form an array.
- LksStream << " .omp_offloading.entries :\n";
- LksStream << " ALIGN(0x10)\n";
- LksStream << " SUBALIGN(0x01)\n";
- LksStream << " {\n";
- LksStream << " PROVIDE_HIDDEN(.omp_offloading.entries_begin = .);\n";
- LksStream << " *(.omp_offloading.entries)\n";
- LksStream << " PROVIDE_HIDDEN(.omp_offloading.entries_end = .);\n";
- LksStream << " }\n";
- LksStream << "}\n";
- LksStream << "INSERT BEFORE .data\n";
- LksStream.flush();
-
- // Dump the contents of the linker script if the user requested that. We
- // support this option to enable testing of behavior with -###.
- if (C.getArgs().hasArg(options::OPT_fopenmp_dump_offload_linker_script))
- llvm::errs() << LksBuffer;
-
- // If this is a dry run, do not create the linker script file.
- if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
- return;
-
- // Open script file and write the contents.
- std::error_code EC;
- llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
-
- if (EC) {
- C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
- return;
- }
-
- Lksf << LksBuffer;
-}
-
static bool addXRayRuntime(const ToolChain &TC, const ArgList &Args,
ArgStringList &CmdArgs) {
if (Args.hasFlag(options::OPT_fxray_instrument,
// RUN: | FileCheck -check-prefix=CHK-FOPENMP-TARGET-NESTED-ERROR %s
// CHK-FOPENMP-TARGET-NESTED-ERROR: clang{{.*}} error: invalid -Xopenmp-target argument: '-Xopenmp-target -Xopenmp-target', options requiring arguments are unsupported
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target uses one of the archs provided when several archs are used.
+// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_35 -Xopenmp-target -march=sm_60 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHK-FOPENMP-TARGET-ARCHS %s
+
+// CHK-FOPENMP-TARGET-ARCHS: ptxas{{.*}}" "--gpu-name" "sm_60"
+// CHK-FOPENMP-TARGET-ARCHS: nvlink{{.*}}" "-arch" "sm_60"
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target -march=sm_35 works as expected when two triples are present.
+// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu,nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHK-FOPENMP-TARGET-COMPILATION %s
+
+// CHK-FOPENMP-TARGET-COMPILATION: ptxas{{.*}}" "--gpu-name" "sm_35"
+// CHK-FOPENMP-TARGET-COMPILATION: nvlink{{.*}}" "-arch" "sm_35"
+
+/// ###########################################################################
+
+/// Check cubin file generation and usage by nvlink
+// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHK-CUBIN %s
+
+// CHK-CUBIN: clang{{.*}}" "-o" "{{.*}}-openmp-nvptx64-nvidia-cuda.s"
+// CHK-CUBIN-NEXT: ptxas{{.*}}" "--output-file" "{{.*}}-openmp-nvptx64-nvidia-cuda.cubin" "{{.*}}-openmp-nvptx64-nvidia-cuda.s"
+// CHK-CUBIN-NEXT: nvlink" "-o" "{{.*}}-openmp-nvptx64-nvidia-cuda" {{.*}} "openmp-offload-openmp-nvptx64-nvidia-cuda.cubin"
+
+/// ###########################################################################
+
+/// Check cubin file generation and usage by nvlink
+// RUN: touch %t1.o
+// RUN: touch %t2.o
+// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s
+
+// CHK-TWOCUBIN: nvlink" "-o" "{{.*}}openmp-offload-{{.*}}.out" {{.*}} "openmp-offload-{{.*}}.cubin" "openmp-offload-{{.*}}.cubin"