class CudaDeviceAction : public Action {
virtual void anchor();
- /// GPU architecture to bind. Always of the form /sm_\d+/.
+ /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the
+ /// action applies to multiple architectures).
const char *GpuArchName;
/// True when action results are not consumed by the host action (e.g when
/// -fsyntax-only or --cuda-device-only options are used).
const char *getGpuArchName() const { return GpuArchName; }
- /// Gets the compute_XX that corresponds to getGpuArchName().
+ /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null
+ /// when getGpuArchName() is null.
const char *getComputeArchName() const;
bool isAtTopLevel() const { return AtTopLevel; }
def Xclang : Separate<["-"], "Xclang">,
HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
Flags<[DriverOption, CoreOption]>;
+def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
+ HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
+def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
+ HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">;
def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
virtual bool IsIntegratedAssemblerDefault() const { return false; }
/// \brief Check if the toolchain should use the integrated assembler.
- bool useIntegratedAs() const;
+ virtual bool useIntegratedAs() const;
/// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default.
virtual bool IsMathErrnoDefault() const { return true; }
TYPE("image", Image, INVALID, "out", "")
TYPE("dSYM", dSYM, INVALID, "dSYM", "A")
TYPE("dependencies", Dependencies, INVALID, "d", "")
+TYPE("cuda-fatbin", CUDA_FATBIN, INVALID, "fatbin","A")
TYPE("none", Nothing, INVALID, nullptr, "u")
TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
llvm::ConstantStruct::get(FatbinWrapperTy, Values),
"__cuda_fatbin_wrapper");
+ // NVIDIA's cuobjdump looks for fatbins in this section.
+ FatbinWrapper->setSection(".nvFatBinSegment");
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
bool AtTopLevel)
: Action(CudaDeviceClass, Input), GpuArchName(ArchName),
AtTopLevel(AtTopLevel) {
- assert(IsValidGpuArchName(GpuArchName));
+ assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
}
const char *CudaDeviceAction::getComputeArchName() const {
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
} else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- os << '"' << CDA->getGpuArchName() << '"' << ", {"
- << PrintActions1(C, *CDA->begin(), Ids) << "}";
+ os << '"'
+ << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+ << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
} else {
const ActionList *AL;
if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
// Check whether any of device actions stopped before they could generate PTX.
bool PartialCompilation =
llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::BackendJobClass;
+ return a->getKind() != Action::AssembleJobClass;
});
// Figure out what to do with device actions -- pass them as inputs to the
return HostAction;
}
- // Outputs of device actions during complete CUDA compilation get created
- // with AtTopLevel=false and become inputs for the host action.
+ // If we're not a partial or device-only compilation, we compile each arch to
+ // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host compilation.
ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- DeviceActions.push_back(
- C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
- /* AtTopLevel */ false));
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ Action* AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action* BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (const auto& A : {AssembleAction, BackendAction}) {
+ DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+ A, GpuArchList[I], /* AtTopLevel */ false));
+ }
+ }
+ auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+ /* GpuArchName = */ nullptr,
+ /* AtTopLevel = */ false);
// Return a new host action that incorporates original host action and all
// device actions.
- return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+ return C.MakeAction<CudaHostAction>(std::move(HostAction),
+ ActionList({FatbinAction}));
}
void Driver::BuildActions(Compilation &C, const ToolChain &TC,
return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
}
case phases::Assemble:
- return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+ return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
}
llvm_unreachable("invalid phase in ConstructPhaseAction");
if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
// Initial processing of CudaDeviceAction carries host params.
// Call BuildJobsForAction() again, now with correct device parameters.
- assert(CDA->getGpuArchName() && "No GPU name in device action.");
- return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
- CDA->getGpuArchName(), CDA->isAtTopLevel(),
- /*MultipleArchs*/ true, LinkingOutput,
- CachedResults);
+ InputInfo II = BuildJobsForAction(
+ C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+ CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+ CachedResults);
+ // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that
+ // one can retrieve II's GPU arch.
+ II.setAction(A);
+ return II;
}
const ActionList *Inputs = &A->getInputs();
continue;
CudaInstallPath = CudaPath;
+ CudaBinPath = CudaPath + "/bin";
CudaIncludePath = CudaInstallPath + "/include";
CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
CudaLibPath =
CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
if (!(D.getVFS().exists(CudaIncludePath) &&
- D.getVFS().exists(CudaLibPath) &&
+ D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
D.getVFS().exists(CudaLibDevicePath)))
continue;
return new tools::dragonfly::Linker(*this);
}
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                             const ArgList &Args)
-    : Linux(D, Triple, Args) {}
+    : Linux(D, Triple, Args) {
+  // Register the detected CUDA bin directory as a program search path so that
+  // GetProgramPath() can locate ptxas and fatbinary for the device jobs.
+  if (CudaInstallation.isValid())
+    getProgramPaths().push_back(CudaInstallation.getBinPath());
+}
void
CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
for (Arg *A : Args) {
if (A->getOption().matches(options::OPT_Xarch__)) {
// Skip this argument unless the architecture matches BoundArch
- if (A->getValue(0) != StringRef(BoundArch))
+ if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
continue;
unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
DAL->append(A);
}
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+ if (BoundArch)
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
return DAL;
}
+// Device-side assembler for CUDA is ptxas (wrapped by NVPTX::Assembler).
+Tool *CudaToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+// Device-side "linker" for CUDA is fatbinary (wrapped by NVPTX::Linker); it
+// combines cubins/PTX into a single fatbin rather than linking objects.
+Tool *CudaToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
/// XCore tool chain
XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
bool IsValid;
const Driver &D;
std::string CudaInstallPath;
+ std::string CudaBinPath;
std::string CudaLibPath;
std::string CudaLibDevicePath;
std::string CudaIncludePath;
/// \brief Get the detected Cuda installation path.
StringRef getInstallPath() const { return CudaInstallPath; }
+ /// \brief Get the detected path to Cuda's bin directory.
+ StringRef getBinPath() const { return CudaBinPath; }
/// \brief Get the detected Cuda Include path.
StringRef getIncludePath() const { return CudaIncludePath; }
/// \brief Get the detected Cuda library path.
const char *BoundArch) const override;
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const override;
+
+ // Never try to use the integrated assembler with CUDA; always fork out to
+ // ptxas.
+ bool useIntegratedAs() const override { return false; }
+
+protected:
+ Tool *buildAssembler() const override; // ptxas
+ Tool *buildLinker() const override; // fatbinary (ok, not really a linker)
};
class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {
else
ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
}
+
+// Build the ptxas invocation that assembles one PTX input into a cubin for
+// the single GPU architecture bound to this job via -march (asserted below).
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  // The bound arch arrives as -march=sm_XX; ptxas handles exactly one arch
+  // per invocation.
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs)
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+  // Forward any -Xcuda-ptxas arguments verbatim, after our own flags so the
+  // user can override them.
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+//
+// Builds the fatbinary invocation that stitches the per-arch cubin and PTX
+// inputs into a single fatbin (the Output).
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    // We need to pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.
+    const char *Arch = (II.getType() == types::TY_PP_Asm)
+                           ? A->getComputeArchName()
+                           : A->getGpuArchName();
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  // Forward any -Xcuda-fatbinary arguments verbatim.
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
};
} // end namespace PS4cpu
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+// Instantiated by CudaToolChain::buildAssembler(); supports response files
+// via "--options-file".
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+  Assembler(const ToolChain &TC)
+      : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
+// Instantiated by CudaToolChain::buildLinker(); not a real linker, but it
+// fills the "link" step of the device-side pipeline.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+  Linker(const ToolChain &TC)
+      : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+} // end namespace NVPTX
+
+
} // end namespace tools
} // end namespace driver
} // end namespace clang
P.push_back(phases::Compile);
P.push_back(phases::Backend);
}
- if (Id != TY_CUDA_DEVICE)
- P.push_back(phases::Assemble);
+ P.push_back(phases::Assemble);
}
}
--- /dev/null
+// Tests that "sm_XX" gets correctly converted to "compute_YY" when we invoke
+// fatbinary.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// CHECK:fatbinary
+
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_21 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM21 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_32 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM32 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_37 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM37 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_50 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM50 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_52 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM52 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_53 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM53 %s
+
+// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
+// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
+// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
+// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
+// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
+// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
+// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
+// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
+// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53
--- /dev/null
+// Tests that ptxas and fatbinary are correctly invoked during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compile with -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O. This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN: -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN: -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT2: "-O2"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix NOHOST -check-prefix NOLINK %s
-// Verify that with -S we compile host and device sides to assembly and
-// incorporate device code into the host side.
-// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
-// RUN: -check-prefix NOLINK %s
-
// Verify that --cuda-gpu-arch option passes the correct GPU archtecture to
// device compilation.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
// RUN: -check-prefix DEVICE2-SM30 -check-prefix HOST \
// RUN: -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN: -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN: -check-prefix NOLINK %s
// Verify that device-side results are passed to the correct tool when
// -save-temps is used.
// DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
// DEVICE-SAME: "-fcuda-is-device"
// DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
// DEVICE-NOSAVE-SAME: "-x" "cuda"
// DEVICE-SAVE-SAME: "-x" "ir"
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
+
// Match another device-side compilation.
// DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// NODEVICE-SAME-NOT: "-fcuda-is-device"
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+
// Match host-side preprocessor job with -save-temps.
// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
// HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
// HOST-NOSAVE-SAME: "-x" "cuda"
// HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// Match external assembler that uses compilation output.
// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"