[CUDA] Add option to generate relocatable device code

author Jonas Hahnfeld <hahnjo@hahnjo.de>

Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)

committer Jonas Hahnfeld <hahnjo@hahnjo.de>

Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)
author Jonas Hahnfeld <hahnjo@hahnjo.de>
Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)
committer Jonas Hahnfeld <hahnjo@hahnjo.de>
Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def

index c6ed256dd6e4cefa50ce805731c5670ccb940302..f2a09208418b8e1a716fb958fcbb83d75c79d716 100644 (file)
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -204,6 +204,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA d
  LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
  LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
  LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
+LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
  
  LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
  LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index a64d6ad43176025a5467d5f5e749c3c105bbf67b..666b1d9c4fd1f50a06aee0c4149f9d6395cca89a 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -566,6 +566,9 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-
  def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
    Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
  def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
+def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>,
+  HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
+def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
  def dA : Flag<["-"], "dA">, Group<d_Group>;
  def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
    HelpText<"Print macro definitions in -E mode in addition to normal output">;
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp

index 2a8c9c97f5576b01e2f24e68ff5500e1cc967a77..185270c61abdfbf74fb1dbe9a83cd081197e9397 100644 (file)
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -4658,14 +4658,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
      CmdArgs.push_back(Args.MakeArgString(Flags));
    }
  
-  // Host-side cuda compilation receives device-side outputs as Inputs[1...].
-  // Include them with -fcuda-include-gpubinary.
-  if (IsCuda && Inputs.size() > 1)
-    for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
-      CmdArgs.push_back("-fcuda-include-gpubinary");
-      CmdArgs.push_back(I->getFilename());
+  if (IsCuda) {
+    // Host-side cuda compilation receives device-side outputs as Inputs[1...].
+    // Include them with -fcuda-include-gpubinary.
+    if (Inputs.size() > 1) {
+      for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
+        CmdArgs.push_back("-fcuda-include-gpubinary");
+        CmdArgs.push_back(I->getFilename());
+      }
      }
  
+    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
+      CmdArgs.push_back("-fcuda-rdc");
+  }
+
    // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
    // to specify the result of the compile phase on the host, so the meaningful
    // device declarations can be identified. Also, -fopenmp-is-device is passed
diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp

index e513e818ebfbaff420f33f8700d4a87b3e733891..86a11847e5a8e749d9cbee25825676abe02916f0 100644 (file)
--- a/lib/Driver/ToolChains/Cuda.cpp
+++ b/lib/Driver/ToolChains/Cuda.cpp
@@ -355,11 +355,17 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
    for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
      CmdArgs.push_back(Args.MakeArgString(A));
  
-  // In OpenMP we need to generate relocatable code.
-  if (JA.isOffloading(Action::OFK_OpenMP) &&
-      Args.hasFlag(options::OPT_fopenmp_relocatable_target,
-                   options::OPT_fnoopenmp_relocatable_target,
-                   /*Default=*/ true))
+  bool Relocatable = false;
+  if (JA.isOffloading(Action::OFK_OpenMP))
+    // In OpenMP we need to generate relocatable code.
+    Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
+                               options::OPT_fnoopenmp_relocatable_target,
+                               /*Default=*/true);
+  else if (JA.isOffloading(Action::OFK_Cuda))
+    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
+                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+
+  if (Relocatable)
      CmdArgs.push_back("-c");
  
    const char *Exec;
@@ -540,6 +546,10 @@ void CudaToolChain::addClangTargetOptions(
      if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                             options::OPT_fno_cuda_approx_transcendentals, false))
        CC1Args.push_back("-fcuda-approx-transcendentals");
+
+    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+                           false))
+      CC1Args.push_back("-fcuda-rdc");
    }
  
    if (DriverArgs.hasArg(options::OPT_nocudalib))
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp

index 95fd1665b406e4d6fda48e9c983de1fde51b1cf1..5b5c24dfc58252db9ef8154504cda2f563ebd564 100644 (file)
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -2074,6 +2074,8 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
    if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
      Opts.CUDADeviceApproxTranscendentals = 1;
  
+  Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+
    if (Opts.ObjC1) {
      if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
        StringRef value = arg->getValue();
diff --git a/test/Driver/cuda-external-tools.cu b/test/Driver/cuda-external-tools.cu

index 99efb6449215d2960e42b7495229d17f5a7e2550..c15cc05a7d0df915a2abdf37d2e26045ffc4d177 100644 (file)
--- a/test/Driver/cuda-external-tools.cu
+++ b/test/Driver/cuda-external-tools.cu
@@ -18,6 +18,9 @@
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
  // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
+// Generating relocatable device code
+// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
  
  // With debugging enabled, ptxas should be run with no ptxas optimizations.
  // RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
@@ -42,14 +45,23 @@
  // Regular compile targeting sm_35.
  // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
+// Separate compilation targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  
  // 32-bit compile.
  // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// 32-bit compile when generating relocatable device code.
+// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
  
  // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
  // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
+// Check that we still pass -c when generating relocatable device code.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
  
  // Check -Xcuda-ptxas and -Xcuda-fatbinary
  // RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
@@ -64,6 +76,14 @@
  // RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
  
+// Check relocatable device code generation on MacOS.
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
+// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
+
  // Check that CLANG forwards the -v flag to PTXAS.
  // RUN:   %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \
  // RUN:   | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
@@ -76,6 +96,8 @@
  // SM35-SAME: "-target-cpu" "sm_35"
  // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
  // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
  
  // Match the call to ptxas (which assembles PTX to SASS).
  // CHECK: ptxas
@@ -97,6 +119,8 @@
  // CHECK-SAME: "[[PTXFILE]]"
  // PTXAS-EXTRA-SAME: "-foo1"
  // PTXAS-EXTRA-SAME: "-foo2"
+// RDC-SAME: "-c"
+// CHECK-NOT: "-c"
  
  // Match the call to fatbinary (which combines all our PTX and SASS into one
  // blob).
@@ -117,5 +141,7 @@
  // ARCH64-SAME: "-triple" "x86_64-
  // ARCH32-SAME: "-triple" "i386-
  // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
  
  // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
author	Jonas Hahnfeld <hahnjo@hahnjo.de>
	Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)
committer	Jonas Hahnfeld <hahnjo@hahnjo.de>
	Mon, 12 Feb 2018 10:46:45 +0000 (10:46 +0000)
include/clang/Basic/LangOptions.def		patch \| blob \| history
include/clang/Driver/Options.td		patch \| blob \| history
lib/Driver/ToolChains/Clang.cpp		patch \| blob \| history
lib/Driver/ToolChains/Cuda.cpp		patch \| blob \| history
lib/Frontend/CompilerInvocation.cpp		patch \| blob \| history
test/Driver/cuda-external-tools.cu		patch \| blob \| history