From: Yaxun Liu <Yaxun.Liu@amd.com>
Date: Tue, 24 Sep 2019 19:16:40 +0000 (+0000)
Subject: [HIP] Support new kernel launching API
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7fd15a3b66eaa8c90756983ddd960624f543949f;p=clang

[HIP] Support new kernel launching API

Differential Revision: https://reviews.llvm.org/D67947


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@372773 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def
index 42ed1145ef..53d0e4a490 100644
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -226,6 +226,8 @@ LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 
+LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
+
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
 LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable")
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index 569891c6b3..d31c4f46e6 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -599,6 +599,9 @@ def hip_device_lib_EQ : Joined<["--"], "hip-device-lib=">, Group<Link_Group>,
   HelpText<"HIP device library">;
 def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
   Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
+def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">,
+  Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">;
+def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">;
 def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
   HelpText<"Path to libomptarget-nvptx libraries">;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 4d4038dae9..05aeef4194 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -236,7 +236,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
 
   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
-                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      CGF.getLangOpts().HIPUseNewLaunchAPI)
     emitDeviceStubBodyNew(CGF, Args);
   else
     emitDeviceStubBodyLegacy(CGF, Args);
@@ -264,14 +265,18 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
-  // Lookup cudaLaunchKernel function.
+  // Lookup cudaLaunchKernel/hipLaunchKernel function.
   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
   //                              void **args, size_t sharedMem,
   //                              cudaStream_t stream);
+  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                            void **args, size_t sharedMem,
+  //                            hipStream_t stream);
   TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
   DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  auto LaunchKernelName = addPrefixToName("LaunchKernel");
   IdentifierInfo &cudaLaunchKernelII =
-      CGM.getContext().Idents.get("cudaLaunchKernel");
+      CGM.getContext().Idents.get(LaunchKernelName);
   FunctionDecl *cudaLaunchKernelFD = nullptr;
   for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
     if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
@@ -280,7 +285,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
 
   if (cudaLaunchKernelFD == nullptr) {
     CGM.Error(CGF.CurFuncDecl->getLocation(),
-              "Can't find declaration for cudaLaunchKernel()");
+              "Can't find declaration for " + LaunchKernelName);
     return;
   }
   // Create temporary dim3 grid_dim, block_dim.
@@ -301,7 +306,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                /*ShmemSize=*/ShmemSize.getType(),
                                /*Stream=*/Stream.getType()},
                               /*isVarArg=*/false),
-      "__cudaPopCallConfiguration");
+      addUnderscoredPrefixToName("PopCallConfiguration"));
 
   CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                               {GridDim.getPointer(), BlockDim.getPointer(),
@@ -329,7 +334,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
   const CGFunctionInfo &FI =
       CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
   llvm::FunctionCallee cudaLaunchKernelFn =
-      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
                LaunchKernelArgs);
   CGF.EmitBranch(EndBlock);
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index aa17efbee3..16c208b982 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -4774,6 +4774,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // Forward -cl options to -cc1
   RenderOpenCLOptions(Args, CmdArgs);
 
+  if (Args.hasFlag(options::OPT_fhip_new_launch_api,
+                   options::OPT_fno_hip_new_launch_api, false))
+    CmdArgs.push_back("-fhip-new-launch-api");
+
   if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-fcf-protection=") + A->getValue()));
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index d3b2c37553..61a2c07890 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -2517,6 +2517,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
     Opts.CUDADeviceApproxTranscendentals = 1;
 
   Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);
+  Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
 
   if (Opts.ObjC) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index cf8910cd84..3c9c991c77 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -820,7 +820,8 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
 
 std::string Sema::getCudaConfigureFuncName() const {
   if (getLangOpts().HIP)
-    return "hipConfigureCall";
+    return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
+                                            : "hipConfigureCall";
 
   // New CUDA kernel launch sequence.
   if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h
index 0fd175765a..5d73b81041 100644
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -14,12 +14,21 @@ struct dim3 {
   __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };
 
-typedef struct cudaStream *cudaStream_t;
-typedef enum cudaError {} cudaError_t;
 #ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                     cudaStream_t stream = 0);
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
 #else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
 extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
                                  size_t sharedSize = 0,
                                  cudaStream_t stream = 0);
diff --git a/test/CodeGenCUDA/kernel-call.cu b/test/CodeGenCUDA/kernel-call.cu
index ed48a6cc81..b76f2c1883 100644
--- a/test/CodeGenCUDA/kernel-call.cu
+++ b/test/CodeGenCUDA/kernel-call.cu
@@ -3,14 +3,17 @@
 // RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
 // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
-// RUN: | FileCheck %s --check-prefixes=HIP,CHECK
-
+// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
+// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK
 
 #include "Inputs/cuda.h"
 
 // CHECK-LABEL: define{{.*}}g1
-// HIP: call{{.*}}hipSetupArgument
-// HIP: call{{.*}}hipLaunchByPtr
+// HIP-OLD: call{{.*}}hipSetupArgument
+// HIP-OLD: call{{.*}}hipLaunchByPtr
+// HIP-NEW: call{{.*}}__hipPopCallConfiguration
+// HIP-NEW: call{{.*}}hipLaunchKernel
 // CUDA-OLD: call{{.*}}cudaSetupArgument
 // CUDA-OLD: call{{.*}}cudaLaunch
 // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
@@ -19,7 +22,8 @@ __global__ void g1(int x) {}
 
 // CHECK-LABEL: define{{.*}}main
 int main(void) {
-  // HIP: call{{.*}}hipConfigureCall
+  // HIP-OLD: call{{.*}}hipConfigureCall
+  // HIP-NEW: call{{.*}}__hipPushCallConfiguration
   // CUDA-OLD: call{{.*}}cudaConfigureCall
   // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
   // CHECK: icmp