[HIP] Support new kernel launching API

author Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)

committer Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)
author Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)
committer Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def

index 42ed1145ef7c8edd13f1b63893106ec754f2df00..53d0e4a490e5676681681581990838b8261bdb91 100644 (file)
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -226,6 +226,8 @@ LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
  
  LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
  
+LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
+
  LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
  LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
  LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable")
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index 569891c6b36841f1a3a00712320569cfa70bb06a..d31c4f46e6b4b97c1e8276d15fa5cf21a4bc7c88 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -599,6 +599,9 @@ def hip_device_lib_EQ : Joined<["--"], "hip-device-lib=">, Group<Link_Group>,
    HelpText<"HIP device library">;
  def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
    Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
+def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">,
+  Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">;
+def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">;
  def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
    HelpText<"Path to libomptarget-nvptx libraries">;
  def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp

index 4d4038dae9cfcde267314d2dcd20e8f75fddeae5..05aeef4194d4d8881da3e50faa621710844d72dc 100644 (file)
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -236,7 +236,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
  
    EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
-                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      CGF.getLangOpts().HIPUseNewLaunchAPI)
      emitDeviceStubBodyNew(CGF, Args);
    else
      emitDeviceStubBodyLegacy(CGF, Args);
@@ -264,14 +265,18 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
  
    llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  
-  // Lookup cudaLaunchKernel function.
+  // Lookup cudaLaunchKernel/hipLaunchKernel function.
    // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
    //                              void **args, size_t sharedMem,
    //                              cudaStream_t stream);
+  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                            void **args, size_t sharedMem,
+  //                            hipStream_t stream);
    TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
    DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  auto LaunchKernelName = addPrefixToName("LaunchKernel");
    IdentifierInfo &cudaLaunchKernelII =
-      CGM.getContext().Idents.get("cudaLaunchKernel");
+      CGM.getContext().Idents.get(LaunchKernelName);
    FunctionDecl *cudaLaunchKernelFD = nullptr;
    for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
      if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
@@ -280,7 +285,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
  
    if (cudaLaunchKernelFD == nullptr) {
      CGM.Error(CGF.CurFuncDecl->getLocation(),
-              "Can't find declaration for cudaLaunchKernel()");
+              "Can't find declaration for " + LaunchKernelName);
      return;
    }
    // Create temporary dim3 grid_dim, block_dim.
@@ -301,7 +306,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                 /*ShmemSize=*/ShmemSize.getType(),
                                 /*Stream=*/Stream.getType()},
                                /*isVarArg=*/false),
-      "__cudaPopCallConfiguration");
+      addUnderscoredPrefixToName("PopCallConfiguration"));
  
    CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                                {GridDim.getPointer(), BlockDim.getPointer(),
@@ -329,7 +334,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
    const CGFunctionInfo &FI =
        CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
    llvm::FunctionCallee cudaLaunchKernelFn =
-      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
    CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
                 LaunchKernelArgs);
    CGF.EmitBranch(EndBlock);
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp

index aa17efbee32bb591ea4d367cdd65c40d05803c39..16c208b98230dcc5c3557cfd8c544a4b65a89b32 100644 (file)
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -4774,6 +4774,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
    // Forward -cl options to -cc1
    RenderOpenCLOptions(Args, CmdArgs);
  
+  if (Args.hasFlag(options::OPT_fhip_new_launch_api,
+                   options::OPT_fno_hip_new_launch_api, false))
+    CmdArgs.push_back("-fhip-new-launch-api");
+
    if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
      CmdArgs.push_back(
          Args.MakeArgString(Twine("-fcf-protection=") + A->getValue()));
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp

index d3b2c37553f27f1921a96c09019aaf36895da369..61a2c07890fc5fd8b89b41cb36e4a644970d15b5 100644 (file)
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -2517,6 +2517,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
      Opts.CUDADeviceApproxTranscendentals = 1;
  
    Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);
+  Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
  
    if (Opts.ObjC) {
      if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp

index cf8910cd84f753151c366c51cbdd84ed785ab7db..3c9c991c77feb4e05baa1e9d2d92a64ef9f364b4 100644 (file)
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -820,7 +820,8 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
  
  std::string Sema::getCudaConfigureFuncName() const {
    if (getLangOpts().HIP)
-    return "hipConfigureCall";
+    return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
+                                            : "hipConfigureCall";
  
    // New CUDA kernel launch sequence.
    if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h

index 0fd175765a205e4f29aa90540671e95436b68fbb..5d73b81041ab55e1c5dfaa6f82685c815d33d89c 100644 (file)
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -14,12 +14,21 @@ struct dim3 {
    __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
  };
  
-typedef struct cudaStream *cudaStream_t;
-typedef enum cudaError {} cudaError_t;
  #ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
  int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                     cudaStream_t stream = 0);
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
  #else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
  extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
                                   size_t sharedSize = 0,
                                   cudaStream_t stream = 0);
diff --git a/test/CodeGenCUDA/kernel-call.cu b/test/CodeGenCUDA/kernel-call.cu

index ed48a6cc8138440c8b5c0c3ba29eb5cbd0d18df6..b76f2c1883576584f7bda95bdb3cd98bb31ab874 100644 (file)
--- a/test/CodeGenCUDA/kernel-call.cu
+++ b/test/CodeGenCUDA/kernel-call.cu
@@ -3,14 +3,17 @@
  // RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
  // RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
  // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
-// RUN: | FileCheck %s --check-prefixes=HIP,CHECK
-
+// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
+// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK
  
  #include "Inputs/cuda.h"
  
  // CHECK-LABEL: define{{.*}}g1
-// HIP: call{{.*}}hipSetupArgument
-// HIP: call{{.*}}hipLaunchByPtr
+// HIP-OLD: call{{.*}}hipSetupArgument
+// HIP-OLD: call{{.*}}hipLaunchByPtr
+// HIP-NEW: call{{.*}}__hipPopCallConfiguration
+// HIP-NEW: call{{.*}}hipLaunchKernel
  // CUDA-OLD: call{{.*}}cudaSetupArgument
  // CUDA-OLD: call{{.*}}cudaLaunch
  // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
@@ -19,7 +22,8 @@ __global__ void g1(int x) {}
  
  // CHECK-LABEL: define{{.*}}main
  int main(void) {
-  // HIP: call{{.*}}hipConfigureCall
+  // HIP-OLD: call{{.*}}hipConfigureCall
+  // HIP-NEW: call{{.*}}__hipPushCallConfiguration
    // CUDA-OLD: call{{.*}}cudaConfigureCall
    // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
    // CHECK: icmp
author	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)
committer	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 24 Sep 2019 19:16:40 +0000 (19:16 +0000)
include/clang/Basic/LangOptions.def		patch | blob | history
include/clang/Driver/Options.td		patch | blob | history
lib/CodeGen/CGCUDANV.cpp		patch | blob | history
lib/Driver/ToolChains/Clang.cpp		patch | blob | history
lib/Frontend/CompilerInvocation.cpp		patch | blob | history
lib/Sema/SemaCUDA.cpp		patch | blob | history
test/CodeGenCUDA/Inputs/cuda.h		patch | blob | history
test/CodeGenCUDA/kernel-call.cu		patch | blob | history