}
}
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input
-// action and then wraps each in CudaDeviceAction paired with appropriate GPU
-// arch name. If we're only building device-side code, each action remains
-// independent. Otherwise we pass device-side actions as inputs to a new
-// CudaHostAction which combines both host and device side actions.
+// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
+// input action and then wraps each in CudaDeviceAction paired with
+// appropriate GPU arch name. In case of partial (i.e., preprocessing
+// only) or device-only compilation, each device action is added to \p
+// Actions and \p Current is released. Otherwise the function creates
+// and returns a new CudaHostAction which wraps \p Current and device
+// side actions.
static std::unique_ptr<Action>
buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
const Arg *InputArg, std::unique_ptr<Action> HostAction,
}
phases::ID CudaInjectionPhase;
- if (isSaveTempsEnabled()) {
- // All phases are done independently, inject GPU blobs during compilation
- // phase as that's where we generate glue code to init them.
- CudaInjectionPhase = phases::Compile;
- } else {
- // Assumes that clang does everything up until linking phase, so we inject
- // cuda device actions at the last step before linking. Otherwise CUDA
- // host action forces preprocessor into a separate invocation.
- CudaInjectionPhase = FinalPhase;
- if (FinalPhase == phases::Link)
- for (auto PI = PL.begin(), PE = PL.end(); PI != PE; ++PI) {
- auto next = PI + 1;
- if (next != PE && *next == phases::Link)
- CudaInjectionPhase = *PI;
- }
- }
+ bool InjectCuda = (InputType == types::TY_CUDA &&
+ !Args.hasArg(options::OPT_cuda_host_only));
+ CudaInjectionPhase = FinalPhase;
+ for (auto &Phase : PL)
+ if (Phase <= FinalPhase && Phase == phases::Compile) {
+ CudaInjectionPhase = Phase;
+ break;
+ }
// Build the pipeline for this file.
std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
// Otherwise construct the appropriate action.
Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
- if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
- !Args.hasArg(options::OPT_cuda_host_only)) {
+ if (InjectCuda && Phase == CudaInjectionPhase) {
Current = buildCudaActions(*this, TC, Args, InputArg,
std::move(Current), Actions);
if (!Current)
}
}
-static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps,
+// Returns a Tool for a given JobAction. In case the action and its
+// predecessors can be combined, updates Inputs with the inputs of the
+// first combined action. If one of the collapsed actions is a
+// CudaHostAction, updates CollapsedCHA with a pointer to it so the
+// caller can deal with the extra handling such an action requires.
+static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
const ToolChain *TC, const JobAction *JA,
- const ActionList *&Inputs) {
+ const ActionList *&Inputs,
+ const CudaHostAction *&CollapsedCHA) {
const Tool *ToolForJob = nullptr;
+ CollapsedCHA = nullptr;
// See if we should look for a compiler with an integrated assembler. We match
// bottom up, so what we are actually looking for is an assembler job with a
// checking the backend tool, check if the tool for the CompileJob
// has an integrated assembler.
const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
- JobAction *CompileJA = cast<CompileJobAction>(*BackendInputs->begin());
+    // The compile job may be wrapped in a CudaHostAction; extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+ CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
+ JobAction *CompileJA =
+ cast<CompileJobAction>(CHA ? *CHA->begin() : *BackendInputs->begin());
+ assert(CompileJA && "Backend job is not preceeded by compile job.");
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
if (Compiler->hasIntegratedAssembler()) {
- Inputs = &(*BackendInputs)[0]->getInputs();
+ Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
+ CollapsedCHA = CHA;
}
}
if (isa<BackendJobAction>(JA)) {
// Check if the compiler supports emitting LLVM IR.
assert(Inputs->size() == 1);
- JobAction *CompileJA;
- // Extract real host action, if it's a CudaHostAction.
- if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
- CompileJA = cast<CompileJobAction>(*CudaHA->begin());
- else
- CompileJA = cast<CompileJobAction>(*Inputs->begin());
-
+    // The compile job may be wrapped in a CudaHostAction; extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+ CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
+ JobAction *CompileJA =
+ cast<CompileJobAction>(CHA ? *CHA->begin() : *Inputs->begin());
+ assert(CompileJA && "Backend job is not preceeded by compile job.");
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
if (!Compiler->canEmitIR() || !SaveTemps) {
- Inputs = &(*Inputs)[0]->getInputs();
+ Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
+ CollapsedCHA = CHA;
}
}
const ActionList *Inputs = &A->getInputs();
const JobAction *JA = cast<JobAction>(A);
- const Tool *T = SelectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs);
+ const CudaHostAction *CollapsedCHA = nullptr;
+ const Tool *T =
+ selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA);
if (!T)
return;
+  // If we've collapsed an action list that contained a CudaHostAction, we
+  // need to build jobs for any device-side inputs it may have held.
+ if (CollapsedCHA) {
+ InputInfo II;
+ for (const Action *DA : CollapsedCHA->getDeviceActions()) {
+ BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+ /*MultipleArchs*/ false, LinkingOutput, II);
+ CudaDeviceInputInfos.push_back(II);
+ }
+ }
+
// Only use pipes when there is exactly one input.
InputInfoList InputInfos;
for (const Action *Input : *Inputs) {
// Simple compilation case:
// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
// Compile device-side to PTX assembly and make sure we use it on the host side.
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate device code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// Typical compilation + link case:
// RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 \
// Compile device-side to PTX assembly and make sure we use it on the host side
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate device code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Then link things.
// Verify that -cuda-no-host disables host-side compilation and linking
// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
// Compile device-side to PTX assembly
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Make sure there are no host cmpilation or linking.
// RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s
// and incorporate device code on the host side.
// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
// Compile device-side to PTX assembly
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate GPU code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// archtecture info to device compilation.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// Compile device-side to PTX assembly.
-// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// RUN: -check-prefix CUDA-D1-SM35 \
// Then compile host side and incorporate GPU code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// Verify that there is device-side compilation per --cuda-gpu-arch args
// and that all results are included on the host side.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
// Compile both device-sides to PTX assembly
// RUN: | FileCheck \
-// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1NS -check-prefix CUDA-D1-SM35 \
// RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
// Then compile host side and incorporate both device-side outputs
-// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS \
+// RUN: -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
// Make sure we don't link anything.
// RUN: -check-prefix CUDA-NL %s
+// Verify that device-side results are passed to the correct tool when
+// -save-temps is used.
+// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1S \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HS -check-prefix CUDA-HS-I1 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Verify that device-side results are passed to the correct tool when
+// -fno-integrated-as is used.
+// RUN: %clang -### -target x86_64-linux-gnu -fno-integrated-as -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS -check-prefix CUDA-HS-I1 \
+// RUN: -check-prefix CUDA-H-AS \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Match device-side preprocessor and compiler phases with -save-temps.
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda"
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
+
// --cuda-host-only should never trigger unused arg warning.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
// RUN: FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s
// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
// RUN: FileCheck -check-prefix CUDA-UNUSED-CDO %s
-// Match device-side compilation
+// Match the job that produces PTX assembly
// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-D1-SAME: "-fcuda-is-device"
// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
// CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
-// CUDA-D1-SAME: "-x" "cuda"
+// CUDA-D1NS-SAME: "-x" "cuda"
+// CUDA-D1S-SAME: "-x" "ir"
// Match anothe device-side compilation
// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-D2-SAME: "-x" "cuda"
// Match no device-side compilation
-// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-ND-SAME-NOT: "-fcuda-is-device"
+// Match host-side preprocessor job with -save-temps
+// CUDA-HS: "-cc1" "-triple"
+// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-HS-SAME-NOT: "-fcuda-is-device"
+// CUDA-HS-SAME: "-x" "cuda"
+
// Match host-side compilation
// CUDA-H: "-cc1" "-triple"
-// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-H-SAME-NOT: "-fcuda-is-device"
-// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
-// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
+// CUDA-HNS-SAME: "-x" "cuda"
+// CUDA-HS-SAME: "-x" "cuda-cpp-output"
// CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
// CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// Match external assembler that uses compilation output
+// CUDA-H-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
+
// Match no GPU code inclusion.
// CUDA-H-NI-NOT: "-fcuda-include-gpubinary"
// Match linker
// CUDA-L: "{{.*}}{{ld|link}}{{(.exe)?}}"
-// CUDA-L-SAME: "[[HOSTOBJ]]"
+// CUDA-L-SAME: "[[HOSTOUTPUT]]"
// Match no linker
// CUDA-NL-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"