}
}
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input
-// action and then wraps each in CudaDeviceAction paired with appropriate GPU
-// arch name. If we're only building device-side code, each action remains
-// independent. Otherwise we pass device-side actions as inputs to a new
-// CudaHostAction which combines both host and device side actions.
+// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
+// input action and then wraps each in CudaDeviceAction paired with
+// appropriate GPU arch name. In case of partial (i.e., preprocessing
+// only) or device-only compilation, each device action is added to \p
+// Actions and \p Current is released. Otherwise the function creates
+// and returns a new CudaHostAction which wraps \p Current and device
+// side actions.
static std::unique_ptr<Action>
buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
const Arg *InputArg, std::unique_ptr<Action> HostAction,
}
phases::ID CudaInjectionPhase;
- if (isSaveTempsEnabled()) {
- // All phases are done independently, inject GPU blobs during compilation
- // phase as that's where we generate glue code to init them.
- CudaInjectionPhase = phases::Compile;
- } else {
- // Assumes that clang does everything up until linking phase, so we inject
- // cuda device actions at the last step before linking. Otherwise CUDA
- // host action forces preprocessor into a separate invocation.
- CudaInjectionPhase = FinalPhase;
- if (FinalPhase == phases::Link)
- for (auto PI = PL.begin(), PE = PL.end(); PI != PE; ++PI) {
- auto next = PI + 1;
- if (next != PE && *next == phases::Link)
- CudaInjectionPhase = *PI;
- }
- }
+ bool InjectCuda = (InputType == types::TY_CUDA &&
+ !Args.hasArg(options::OPT_cuda_host_only));
+ CudaInjectionPhase = FinalPhase;
+ for (auto &Phase : PL)
+ if (Phase <= FinalPhase && Phase == phases::Compile) {
+ CudaInjectionPhase = Phase;
+ break;
+ }
// Build the pipeline for this file.
std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
// Otherwise construct the appropriate action.
Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
- if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
- !Args.hasArg(options::OPT_cuda_host_only)) {
+ if (InjectCuda && Phase == CudaInjectionPhase) {
Current = buildCudaActions(*this, TC, Args, InputArg,
std::move(Current), Actions);
if (!Current)
}
}
-static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps,
+// Returns a Tool for a given JobAction. In case the action and its
+// predecessors can be combined, updates Inputs with the inputs of the
+// first combined action. If one of the collapsed actions is a
+// CudaHostAction, updates CollapsedCHA with a pointer to it so the
+// caller can deal with the extra handling such an action requires.
+static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
const ToolChain *TC, const JobAction *JA,
- const ActionList *&Inputs) {
+ const ActionList *&Inputs,
+ const CudaHostAction *&CollapsedCHA) {
const Tool *ToolForJob = nullptr;
+ CollapsedCHA = nullptr;
// See if we should look for a compiler with an integrated assembler. We match
// bottom up, so what we are actually looking for is an assembler job with a
// checking the backend tool, check if the tool for the CompileJob
// has an integrated assembler.
const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
- JobAction *CompileJA = cast<CompileJobAction>(*BackendInputs->begin());
+    // The compile job may be wrapped in a CudaHostAction; extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+ CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
+ JobAction *CompileJA =
+ cast<CompileJobAction>(CHA ? *CHA->begin() : *BackendInputs->begin());
+ assert(CompileJA && "Backend job is not preceeded by compile job.");
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
if (Compiler->hasIntegratedAssembler()) {
- Inputs = &(*BackendInputs)[0]->getInputs();
+ Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
+ CollapsedCHA = CHA;
}
}
if (isa<BackendJobAction>(JA)) {
// Check if the compiler supports emitting LLVM IR.
assert(Inputs->size() == 1);
- JobAction *CompileJA;
- // Extract real host action, if it's a CudaHostAction.
- if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
- CompileJA = cast<CompileJobAction>(*CudaHA->begin());
- else
- CompileJA = cast<CompileJobAction>(*Inputs->begin());
-
+    // The compile job may be wrapped in a CudaHostAction; extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+ CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
+ JobAction *CompileJA =
+ cast<CompileJobAction>(CHA ? *CHA->begin() : *Inputs->begin());
+ assert(CompileJA && "Backend job is not preceeded by compile job.");
const Tool *Compiler = TC->SelectTool(*CompileJA);
if (!Compiler)
return nullptr;
if (!Compiler->canEmitIR() || !SaveTemps) {
- Inputs = &(*Inputs)[0]->getInputs();
+ Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
+ CollapsedCHA = CHA;
}
}
const ActionList *Inputs = &A->getInputs();
const JobAction *JA = cast<JobAction>(A);
- const Tool *T = SelectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs);
+ const CudaHostAction *CollapsedCHA = nullptr;
+ const Tool *T =
+ selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA);
if (!T)
return;
+  // If we've collapsed an action list that contained a CudaHostAction, we
+  // need to build jobs for any device-side inputs it may have held.
+ if (CollapsedCHA) {
+ InputInfo II;
+ for (const Action *DA : CollapsedCHA->getDeviceActions()) {
+ BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+ /*MultipleArchs*/ false, LinkingOutput, II);
+ CudaDeviceInputInfos.push_back(II);
+ }
+ }
+
// Only use pipes when there is exactly one input.
InputInfoList InputInfos;
for (const Action *Input : *Inputs) {
// Simple compilation case:
// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
// Compile device-side to PTX assembly and make sure we use it on the host side.
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate device code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// Typical compilation + link case:
// RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 \
// Compile device-side to PTX assembly and make sure we use it on the host side
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate device code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Then link things.
// Verify that -cuda-no-host disables host-side compilation and linking
// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
// Compile device-side to PTX assembly
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Make sure there are no host cmpilation or linking.
// RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s
// and incorporate device code on the host side.
// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
// Compile device-side to PTX assembly
-// RUN: | FileCheck -check-prefix CUDA-D1 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
// Then compile host side and incorporate GPU code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// archtecture info to device compilation.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// Compile device-side to PTX assembly.
-// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// RUN: -check-prefix CUDA-D1-SM35 \
// Then compile host side and incorporate GPU code.
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
// Make sure we don't link anything.
// Verify that there is device-side compilation per --cuda-gpu-arch args
// and that all results are included on the host side.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
// Compile both device-sides to PTX assembly
// RUN: | FileCheck \
-// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1NS -check-prefix CUDA-D1-SM35 \
// RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
// Then compile host side and incorporate both device-side outputs
-// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS \
+// RUN: -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
// Make sure we don't link anything.
// RUN: -check-prefix CUDA-NL %s
+// Verify that device-side results are passed to the correct tool when
+// -save-temps is used.
+// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1S \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HS -check-prefix CUDA-HS-I1 \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Verify that device-side results are passed to the correct tool when
+// -fno-integrated-as is used.
+// RUN: %clang -### -target x86_64-linux-gnu -fno-integrated-as -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// Then compile host side and incorporate device code.
+// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS -check-prefix CUDA-HS-I1 \
+// RUN: -check-prefix CUDA-H-AS \
+// Make sure we don't link anything.
+// RUN: -check-prefix CUDA-NL %s
+
+// Match device-side preprocessor and compiler phases with -save-temps.
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda"
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
+
// --cuda-host-only should never trigger unused arg warning.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
// RUN: FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s
// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
// RUN: FileCheck -check-prefix CUDA-UNUSED-CDO %s
-// Match device-side compilation
+// Match the job that produces PTX assembly
// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-D1-SAME: "-fcuda-is-device"
// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
// CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
-// CUDA-D1-SAME: "-x" "cuda"
+// CUDA-D1NS-SAME: "-x" "cuda"
+// CUDA-D1S-SAME: "-x" "ir"
// Match anothe device-side compilation
// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-D2-SAME: "-x" "cuda"
// Match no device-side compilation
-// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-ND-SAME-NOT: "-fcuda-is-device"
+// Match host-side preprocessor job with -save-temps
+// CUDA-HS: "-cc1" "-triple"
+// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-HS-SAME-NOT: "-fcuda-is-device"
+// CUDA-HS-SAME: "-x" "cuda"
+
// Match host-side compilation
// CUDA-H: "-cc1" "-triple"
-// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
// CUDA-H-SAME-NOT: "-fcuda-is-device"
-// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
-// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
+// CUDA-HNS-SAME: "-x" "cuda"
+// CUDA-HS-SAME: "-x" "cuda-cpp-output"
// CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
// CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// Match external assembler that uses compilation output
+// CUDA-H-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
+
// Match no GPU code inclusion.
// CUDA-H-NI-NOT: "-fcuda-include-gpubinary"
// Match linker
// CUDA-L: "{{.*}}{{ld|link}}{{(.exe)?}}"
-// CUDA-L-SAME: "[[HOSTOBJ]]"
+// CUDA-L-SAME: "[[HOSTOUTPUT]]"
// Match no linker
// CUDA-NL-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"