From: Yaxun Liu Date: Tue, 24 Sep 2019 19:16:40 +0000 (+0000) Subject: [HIP] Support new kernel launching API X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7fd15a3b66eaa8c90756983ddd960624f543949f;p=clang [HIP] Support new kernel launching API Differential Revision: https://reviews.llvm.org/D67947 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@372773 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def index 42ed1145ef..53d0e4a490 100644 --- a/include/clang/Basic/LangOptions.def +++ b/include/clang/Basic/LangOptions.def @@ -226,6 +226,8 @@ LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") +LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") + LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable") diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td index 569891c6b3..d31c4f46e6 100644 --- a/include/clang/Driver/Options.td +++ b/include/clang/Driver/Options.td @@ -599,6 +599,9 @@ def hip_device_lib_EQ : Joined<["--"], "hip-device-lib=">, Group, HelpText<"HIP device library">; def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">, Group, Flags<[NoArgumentUnused, HelpHidden]>; +def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">, + Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">; +def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp index 4d4038dae9..05aeef4194 100644 --- a/lib/CodeGen/CGCUDANV.cpp +++ b/lib/CodeGen/CGCUDANV.cpp @@ -236,7 +236,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), - CudaFeature::CUDA_USES_NEW_LAUNCH)) + CudaFeature::CUDA_USES_NEW_LAUNCH) || + CGF.getLangOpts().HIPUseNewLaunchAPI) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); @@ -264,14 +265,18 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); - // Lookup cudaLaunchKernel function. + // Lookup cudaLaunchKernel/hipLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); + // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + // void **args, size_t sharedMem, + // hipStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); + auto LaunchKernelName = addPrefixToName("LaunchKernel"); IdentifierInfo &cudaLaunchKernelII = - CGM.getContext().Idents.get("cudaLaunchKernel"); + CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { if (FunctionDecl *FD = dyn_cast(Result)) @@ -280,7 +285,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, if (cudaLaunchKernelFD == nullptr) { CGM.Error(CGF.CurFuncDecl->getLocation(), - "Can't find declaration for cudaLaunchKernel()"); + "Can't find declaration for " + LaunchKernelName); return; } // Create temporary dim3 grid_dim, block_dim. @@ -301,7 +306,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, /*ShmemSize=*/ShmemSize.getType(), /*Stream=*/Stream.getType()}, /*isVarArg=*/false), - "__cudaPopCallConfiguration"); + addUnderscoredPrefixToName("PopCallConfiguration")); CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.getPointer(), BlockDim.getPointer(), @@ -329,7 +334,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, const CGFunctionInfo &FI = CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); llvm::FunctionCallee cudaLaunchKernelFn = - CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); + CGM.CreateRuntimeFunction(FTy, LaunchKernelName); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); CGF.EmitBranch(EndBlock); diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp index aa17efbee3..16c208b982 100644 --- a/lib/Driver/ToolChains/Clang.cpp +++ b/lib/Driver/ToolChains/Clang.cpp @@ -4774,6 +4774,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // Forward -cl options to -cc1 RenderOpenCLOptions(Args, CmdArgs); + if (Args.hasFlag(options::OPT_fhip_new_launch_api, + options::OPT_fno_hip_new_launch_api, false)) + CmdArgs.push_back("-fhip-new-launch-api"); + if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) { CmdArgs.push_back( Args.MakeArgString(Twine("-fcf-protection=") + A->getValue())); diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index d3b2c37553..61a2c07890 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -2517,6 +2517,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, Opts.CUDADeviceApproxTranscendentals = 1; Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc); + Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); if (Opts.ObjC) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp index cf8910cd84..3c9c991c77 100644 --- a/lib/Sema/SemaCUDA.cpp +++ b/lib/Sema/SemaCUDA.cpp @@ -820,7 +820,8 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD, std::string Sema::getCudaConfigureFuncName() const { if (getLangOpts().HIP) - return "hipConfigureCall"; + return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" + : "hipConfigureCall"; // New CUDA kernel launch sequence. if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(), diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h index 0fd175765a..5d73b81041 100644 --- a/test/CodeGenCUDA/Inputs/cuda.h +++ b/test/CodeGenCUDA/Inputs/cuda.h @@ -14,12 +14,21 @@ struct dim3 { __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; -typedef struct cudaStream *cudaStream_t; -typedef enum cudaError {} cudaError_t; #ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, - cudaStream_t stream = 0); + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); #else +typedef struct cudaStream *cudaStream_t; +typedef enum cudaError {} cudaError_t; extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, cudaStream_t stream = 0); diff --git a/test/CodeGenCUDA/kernel-call.cu b/test/CodeGenCUDA/kernel-call.cu index ed48a6cc81..b76f2c1883 100644 --- a/test/CodeGenCUDA/kernel-call.cu +++ b/test/CodeGenCUDA/kernel-call.cu @@ -3,14 +3,17 @@ // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP,CHECK - +// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK #include "Inputs/cuda.h" // CHECK-LABEL: define{{.*}}g1 -// HIP: call{{.*}}hipSetupArgument -// HIP: call{{.*}}hipLaunchByPtr +// HIP-OLD: call{{.*}}hipSetupArgument +// HIP-OLD: call{{.*}}hipLaunchByPtr +// HIP-NEW: call{{.*}}__hipPopCallConfiguration +// HIP-NEW: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration @@ -19,7 +22,8 @@ __global__ void g1(int x) {} // CHECK-LABEL: define{{.*}}main int main(void) { - // HIP: call{{.*}}hipConfigureCall + // HIP-OLD: call{{.*}}hipConfigureCall + // HIP-NEW: call{{.*}}__hipPushCallConfiguration // CUDA-OLD: call{{.*}}cudaConfigureCall // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration // CHECK: icmp