From: Artem Belevich
Date: Thu, 31 Jan 2019 21:34:03 +0000 (+0000)
Subject: [CUDA] add support for the new kernel launch API in CUDA-9.2+.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6a79c4432e4dfc5a6f4bed8d48b2b02afb6824b2;p=clang

[CUDA] add support for the new kernel launch API in CUDA-9.2+.

Instead of calling the CUDA runtime to arrange function arguments, the new
API constructs the arguments in a local array and the kernels are launched
with cudaLaunchKernel(). The old API has been deprecated and is expected
to go away in the next CUDA release.

Differential Revision: https://reviews.llvm.org/D57488

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@352799 91177308-0d34-0410-b5e6-96231b3b80d8
---
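For orientation, here is roughly what this change makes the generated host-side
stub do, written out as ordinary CUDA C++. This is a sketch, not what clang
literally emits (clang builds the equivalent IR directly); the stub name and
the __cudaPopCallConfiguration prototype below are assumptions made for
illustration:

    #include <cuda_runtime.h>  // dim3, cudaStream_t, cudaLaunchKernel

    // Internal runtime entry point; this prototype is an assumption for the
    // sketch, mirroring the call the generated stub emits.
    extern "C" int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                              size_t *sharedMem, void **stream);

    // Hand-written approximation of the stub clang generates for
    //   __global__ void kernel(int x, float y);
    // under the new API. The stub name is illustrative.
    void __stub_kernel(int x, float y) {
      void *args[] = {&x, &y};  // array of pointers to the stub's own parameters
      dim3 grid, block;
      size_t shmem;
      cudaStream_t stream;
      // Fetch the configuration pushed at the <<<...>>> call site.
      __cudaPopCallConfiguration(&grid, &block, &shmem, (void **)&stream);
      cudaLaunchKernel((const void *)__stub_kernel, grid, block, args, shmem,
                       stream);
    }

Compared with the legacy stub there are no per-argument cudaSetupArgument
calls; the runtime receives a single array of argument pointers instead.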
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index d1d99876c3..32c92e4632 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7143,7 +7143,7 @@ def err_kern_type_not_void_return : Error<
 def err_kern_is_nonstatic_method : Error<
   "kernel function %0 must be a free function or static member function">;
 def err_config_scalar_return : Error<
-  "CUDA special function 'cudaConfigureCall' must have scalar return type">;
+  "CUDA special function '%0' must have scalar return type">;
 def err_kern_call_not_global_function : Error<
   "kernel call to non-global function %0">;
 def err_global_call_not_config : Error<
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index dc70a8b04a..1c75642716 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -10348,6 +10348,11 @@ public:
   /// Copies target attributes from the template TD to the function FD.
   void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD);
 
+  /// Returns the name of the launch configuration function. This is the name
+  /// of the function that will be called to configure kernel call, with the
+  /// parameters specified via <<<>>>.
+  std::string getCudaConfigureFuncName() const;
+
   /// \name Code completion
   //@{
   /// Describes the context in which code completion occurs.
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 0678f14e7d..9aaa5f76c7 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@ private:
     return DummyFunc;
   }
 
-  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back(CGF.CurFn);
-  emitDeviceStubBody(CGF, Args);
+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    emitDeviceStubBodyNew(CGF, Args);
+  else
+    emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
-                                         FunctionArgList &Args) {
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+
+  // Calculate amount of space we will need for all arguments. If we have no
+  // args, allocate a single pointer so we still have a valid pointer to the
+  // argument array that we can pass to runtime, even if it will be unused.
+  Address KernelArgs = CGF.CreateTempAlloca(
+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+  // Store pointers to the arguments in a locally allocated launch_args.
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+    CGF.Builder.CreateDefaultAlignedStore(
+        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+  }
+
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+  // Lookup cudaLaunchKernel function.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              cudaStream_t stream);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  IdentifierInfo &cudaLaunchKernelII =
+      CGM.getContext().Idents.get("cudaLaunchKernel");
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
+      cudaLaunchKernelFD = FD;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    CGM.Error(CGF.CurFuncDecl->getLocation(),
+              "Can't find declaration for cudaLaunchKernel()");
+    return;
+  }
+  // Create temporary dim3 grid_dim, block_dim.
+  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
+  QualType Dim3Ty = GridDimParam->getType();
+  Address GridDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+  Address BlockDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+  Address ShmemSize =
+      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+  Address Stream =
+      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+  llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy,
+                              {/*gridDim=*/GridDim.getType(),
+                               /*blockDim=*/BlockDim.getType(),
+                               /*ShmemSize=*/ShmemSize.getType(),
+                               /*Stream=*/Stream.getType()},
+                              /*isVarArg=*/false),
+      "__cudaPopCallConfiguration");
+
+  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+                              {GridDim.getPointer(), BlockDim.getPointer(),
+                               ShmemSize.getPointer(), Stream.getPointer()});
+
+  // Emit the call to cudaLaunch
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  CallArgList LaunchKernelArgs;
+  LaunchKernelArgs.add(RValue::get(Kernel),
+                       cudaLaunchKernelFD->getParamDecl(0)->getType());
+  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+                       cudaLaunchKernelFD->getParamDecl(3)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+                       cudaLaunchKernelFD->getParamDecl(4)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+                       cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  QualType QT = cudaLaunchKernelFD->getType();
+  QualType CQT = QT.getCanonicalType();
+  llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
+  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+
+  const CGFunctionInfo &FI =
+      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  llvm::Constant *cudaLaunchKernelFn =
+      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
+               LaunchKernelArgs);
+  CGF.EmitBranch(EndBlock);
+
+  CGF.EmitBlock(EndBlock);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
+                                               FunctionArgList &Args) {
   // Emit a call to cudaSetupArgument for each arg in Args.
   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
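The stub emitted above consumes a launch configuration that the call site must
have pushed beforehand. The runtime's side of that handshake lives inside
NVIDIA's closed libcudart; conceptually it behaves like a small thread-local
stack of configurations, as in this illustrative mock (not NVIDIA's code; the
return conventions here are assumptions):

    #include <cstddef>
    #include <vector>
    #include <cuda_runtime.h>  // dim3

    namespace {
    struct LaunchConfig {
      dim3 grid, block;
      size_t shmem;
      void *stream;
    };
    thread_local std::vector<LaunchConfig> ConfigStack;
    }  // namespace

    // Called at the <<<...>>> site (via the header declaration added below).
    extern "C" unsigned __cudaPushCallConfiguration(dim3 grid, dim3 block,
                                                    size_t shmem, void *stream) {
      ConfigStack.push_back({grid, block, shmem, stream});
      return 0;  // 0 = configuration accepted; the kernel stub will be invoked
    }

    // Called by the generated stub to claim the most recent configuration.
    extern "C" int __cudaPopCallConfiguration(dim3 *grid, dim3 *block,
                                              size_t *shmem, void **stream) {
      LaunchConfig C = ConfigStack.back();
      ConfigStack.pop_back();
      *grid = C.grid;
      *block = C.block;
      *shmem = C.shmem;
      *stream = C.stream;
      return 0;
    }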
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index f05c0454a8..4b3672463e 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
 #pragma pop_macro("__USE_FAST_MATH__")
 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
 
+// CUDA runtime uses this undocumented function to access kernel launch
+// configuration. The declaration is in crt/device_functions.h but that file
+// includes a lot of other stuff we don't want. Instead, we'll provide our own
+// declaration for it here.
+#if CUDA_VERSION >= 9020
+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                                size_t sharedMem = 0,
+                                                void *stream = 0);
+#endif
+
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
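The default arguments in that declaration matter: an execution configuration
may supply two, three, or four operands, and dim3's implicit constructor from
int lets plain integers appear there, so all of the following forms resolve
against the single declaration above. Sema (next diff) is what decides that
this function, rather than cudaConfigureCall, receives the operands. A hedged
example, with k as a placeholder kernel:

    __global__ void k(int) {}

    void configs() {
      k<<<1, 256>>>(0);                   // grid/block only; shmem/stream default to 0
      k<<<dim3(16, 16), 256, 1024>>>(0);  // 16x16 grid, 1 KiB of dynamic shmem
      cudaStream_t s = 0;
      k<<<1, 256, 0, s>>>(0);             // all four configuration operands spelled out
    }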
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index ec926eaf85..43cc14dfc9 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
@@ -41,9 +42,8 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                          SourceLocation GGGLoc) {
   FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
   if (!ConfigDecl)
-    return ExprError(
-        Diag(LLLLoc, diag::err_undeclared_var_use)
-        << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
+    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                     << getCudaConfigureFuncName());
   QualType ConfigQTy = ConfigDecl->getType();
 
   DeclRefExpr *ConfigDR = new (Context)
@@ -957,3 +957,16 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
   copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
   copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
 }
+
+std::string Sema::getCudaConfigureFuncName() const {
+  if (getLangOpts().HIP)
+    return "hipConfigureCall";
+
+  // New CUDA kernel launch sequence.
+  if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    return "__cudaPushCallConfiguration";
+
+  // Legacy CUDA kernel configuration call
+  return "cudaConfigureCall";
+}
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 38a73302e8..112184d87d 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -9146,13 +9146,12 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
 
   if (getLangOpts().CUDA) {
     IdentifierInfo *II = NewFD->getIdentifier();
-    if (II &&
-        II->isStr(getLangOpts().HIP ? "hipConfigureCall"
-                                    : "cudaConfigureCall") &&
+    if (II && II->isStr(getCudaConfigureFuncName()) &&
         !NewFD->isInvalidDecl() &&
         NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
       if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
-        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+            << getCudaConfigureFuncName();
 
       Context.setcudaConfigureCallDecl(NewFD);
     }
diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h
index 3adbdc5b6d..0fd175765a 100644
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -15,13 +15,20 @@ struct dim3 {
 };
 
 typedef struct cudaStream *cudaStream_t;
-
+typedef enum cudaError {} cudaError_t;
 #ifdef __HIP__
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                      cudaStream_t stream = 0);
 #else
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                      cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
 #endif
 
 extern "C" __device__ int printf(const char*, ...);
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu
index ea45c391d2..30f8838942 100644
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,14 +1,36 @@
 // RUN: echo "GPU binary would be here" > %t
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
+// RUN:     -target-sdk-version=8.0 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:       --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-OLD
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN:     -target-sdk-version=8.0 -fcuda-include-gpubinary %t \
+// RUN:     -o - -DNOGLOBALS \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:       -check-prefixes=NOGLOBALS,CUDANOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN:     -target-sdk-version=8.0 -fgpu-rdc -fcuda-include-gpubinary %t \
+// RUN:     -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:       --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-OLD
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=8.0 -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:       --check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
@@ -103,15 +125,34 @@ void use_pointers() {
 // by a call to cudaLaunch.
 
 // ALL: define{{.*}}kernelfunc
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]Launch
+
+// New launch sequence stores arguments into local buffer and passes array of
+// pointers to them directly to cudaLaunchKernel
+// CUDA-NEW: alloca
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
+
+// Legacy style launch sequence sets up arguments by passing them to
+// [cuda|hip]SetupArgument.
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]Launch
+
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]Launch
 __global__ void kernelfunc(int i, int j, int k) {}
 
 // Test that we've built correct kernel launch sequence.
 // ALL: define{{.*}}hostfunc
-// ALL: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-OLD: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
+// HIP: call{{.*}}[[PREFIX]]ConfigureCall
 // ALL: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 #endif
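The kernel-args-alignment.cu test below keys on the byte offsets the legacy
API made explicit: each cudaSetupArgument(arg, size, offset) call receives an
offset equal to the running size rounded up to the argument's alignment. The
expected 0/8/24 offsets for kernel(char, S, int*), with sizeof(S) == 16 and
alignof(S) == 8 on the x86_64 target, fall out of this arithmetic (align_up is
a local helper written for this note, not part of the patch):

    #include <cstddef>

    constexpr size_t align_up(size_t offset, size_t align) {
      return (offset + align - 1) & ~(align - 1);
    }

    // kernel(char c, S s, int *ptr) with sizeof(S) == 16, alignof(S) == 8:
    constexpr size_t off_c   = 0;                                    // width 1
    constexpr size_t off_s   = align_up(off_c + 1, 8);               // -> 8, width 16
    constexpr size_t off_ptr = align_up(off_s + 16, alignof(int *)); // -> 24, width 8
    static_assert(off_s == 8 && off_ptr == 24, "matches the CHECK lines below");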
diff --git a/test/CodeGenCUDA/kernel-args-alignment.cu b/test/CodeGenCUDA/kernel-args-alignment.cu
index 4bd5eb1bb1..653f3eb23d 100644
--- a/test/CodeGenCUDA/kernel-args-alignment.cu
+++ b/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -1,8 +1,12 @@
-// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
-// RUN:   FileCheck -check-prefix HOST -check-prefix CHECK %s
+// New CUDA kernel launch sequence does not require explicit specification of
+// size/offset for each argument, so only the old way is tested.
+//
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:    -target-sdk-version=8.0 -o - %s \
+// RUN:  | FileCheck -check-prefixes=HOST-OLD,CHECK %s
 
 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK %s
 
 #include "Inputs/cuda.h"
 
@@ -27,9 +31,9 @@ static_assert(alignof(S) == 8, "Unexpected alignment.");
 // 1. offset 0, width 1
 // 2. offset 8 (because alignof(S) == 8), width 16
 // 3. offset 24, width 8
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
 
 // DEVICE-LABEL: @_Z6kernelc1SPi
 // DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
diff --git a/test/CodeGenCUDA/kernel-call.cu b/test/CodeGenCUDA/kernel-call.cu
index 43d08dfaf8..ed48a6cc81 100644
--- a/test/CodeGenCUDA/kernel-call.cu
+++ b/test/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=HIP,CHECK
 
 #include "Inputs/cuda.h"
 
@@ -7,14 +11,17 @@
 // CHECK-LABEL: define{{.*}}g1
 // HIP: call{{.*}}hipSetupArgument
 // HIP: call{{.*}}hipLaunchByPtr
-// CUDA: call{{.*}}cudaSetupArgument
-// CUDA: call{{.*}}cudaLaunch
+// CUDA-OLD: call{{.*}}cudaSetupArgument
+// CUDA-OLD: call{{.*}}cudaLaunch
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
 __global__ void g1(int x) {}
 
 // CHECK-LABEL: define{{.*}}main
 int main(void) {
   // HIP: call{{.*}}hipConfigureCall
-  // CUDA: call{{.*}}cudaConfigureCall
+  // CUDA-OLD: call{{.*}}cudaConfigureCall
+  // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
   // CHECK: icmp
   // CHECK: br
   // CHECK: call{{.*}}g1
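The icmp and br that kernel-call.cu checks for come from the guard clang wraps
around every kernel call: the stub is invoked only when the configuration call
succeeds. In source form the shape is roughly the following (a sketch;
__device_stub_g1 is a hypothetical name for g1's generated stub, and the exact
comparison is an implementation detail):

    #include <cuda_runtime.h>

    extern "C" unsigned __cudaPushCallConfiguration(dim3 grid, dim3 block,
                                                    size_t shmem = 0,
                                                    void *stream = 0);
    void __device_stub_g1(int x);  // hypothetical stub for g1

    // Roughly what `g1<<<grid, block>>>(x)` lowers to; in legacy mode the
    // guard calls cudaConfigureCall instead.
    void launch_site(dim3 grid, dim3 block, int x) {
      if (__cudaPushCallConfiguration(grid, block) == 0)  // the icmp + br pair
        __device_stub_g1(x);
    }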
diff --git a/test/Driver/cuda-simple.cu b/test/Driver/cuda-simple.cu
index fbc5aa1413..b6840be4e2 100644
--- a/test/Driver/cuda-simple.cu
+++ b/test/Driver/cuda-simple.cu
@@ -2,7 +2,7 @@
 // http://llvm.org/PR22936
 // RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
 //
-// Verify that we pass -x cuda-cpp-output to compiler after 
+// Verify that we pass -x cuda-cpp-output to compiler after
 // preprocessing a CUDA file
 // RUN: %clang -Werror -### -save-temps -c %s 2>&1 | FileCheck %s
 // CHECK: "-cc1"
@@ -14,7 +14,9 @@
 // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output".
 // RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s
 
-int cudaConfigureCall(int, int);
+extern "C" int cudaConfigureCall(int, int);
+extern "C" int __cudaPushCallConfiguration(int, int);
+
 __attribute__((global)) void kernel() {}
 
 void func() {
diff --git a/test/SemaCUDA/Inputs/cuda.h b/test/SemaCUDA/Inputs/cuda.h
index 4544369411..2600bfa9c4 100644
--- a/test/SemaCUDA/Inputs/cuda.h
+++ b/test/SemaCUDA/Inputs/cuda.h
@@ -18,9 +18,17 @@ struct dim3 {
 };
 
 typedef struct cudaStream *cudaStream_t;
-
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                      cudaStream_t stream = 0);
+typedef enum cudaError {} cudaError_t;
+
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
diff --git a/test/SemaCUDA/config-type.cu b/test/SemaCUDA/config-type.cu
index a469d38d3e..a122c4539a 100644
--- a/test/SemaCUDA/config-type.cu
+++ b/test/SemaCUDA/config-type.cu
@@ -1,3 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -target-sdk-version=8.0 -fsyntax-only -verify=legacy-launch %s
+// RUN: %clang_cc1 -target-sdk-version=9.2 -fsyntax-only -verify=new-launch %s
 
-void cudaConfigureCall(unsigned gridSize, unsigned blockSize); // expected-error {{must have scalar return type}}
+// legacy-launch-error@+1 {{must have scalar return type}}
+void cudaConfigureCall(unsigned gridSize, unsigned blockSize);
+// new-launch-error@+1 {{must have scalar return type}}
+void __cudaPushCallConfiguration(unsigned gridSize, unsigned blockSize);
diff --git a/unittests/ASTMatchers/ASTMatchersTest.h b/unittests/ASTMatchers/ASTMatchersTest.h
index a8ad3c9c54..b22cafefd9 100644
--- a/unittests/ASTMatchers/ASTMatchersTest.h
+++ b/unittests/ASTMatchers/ASTMatchersTest.h
@@ -183,7 +183,9 @@ testing::AssertionResult matchesConditionallyWithCuda(
       "typedef struct cudaStream *cudaStream_t;"
       "int cudaConfigureCall(dim3 gridSize, dim3 blockSize,"
       "                      size_t sharedSize = 0,"
-      "                      cudaStream_t stream = 0);";
+      "                      cudaStream_t stream = 0);"
+      "extern \"C\" unsigned __cudaPushCallConfiguration("
+      "    dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0);";
 
   bool Found = false, DynamicFound = false;
   MatchFinder Finder;