[HIP] Register/unregister device fat binary only once

author Yaxun Liu <Yaxun.Liu@amd.com>

Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)

committer Yaxun Liu <Yaxun.Liu@amd.com>

Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)
author Yaxun Liu <Yaxun.Liu@amd.com>
Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)
committer Yaxun Liu <Yaxun.Liu@amd.com>
Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp

index b541b1046f57d4a6483e704b00a8d3c108444ca8..5fcc9e011bcbee71c961e9138d98b210402e9fbe 100644 (file)
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -309,12 +309,24 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  }
  
  /// Creates a global constructor function for the module:
+///
+/// For CUDA:
  /// \code
  /// void __cuda_module_ctor(void*) {
  ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
  ///     __cuda_register_globals(Handle);
  /// }
  /// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_ctor(void*) {
+///     if (__hip_gpubin_handle == 0) {
+///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
+///         __hip_register_globals(__hip_gpubin_handle);
+///     }
+/// }
+/// \endcode
  llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    bool IsHIP = CGM.getLangOpts().HIP;
    // No need to generate ctors/dtors if there is no GPU binary.
@@ -427,22 +439,68 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
        /*constant*/ true);
    FatbinWrapper->setSection(FatbinSectionName);
  
-  // Register binary with CUDA/HIP runtime. This is substantially different in
-  // default mode vs. separate compilation!
-  if (!RelocatableDeviceCode) {
-    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
+  // There is only one HIP fat binary per linked module, however there are
+  // multiple constructor functions. Make sure the fat binary is registered
+  // only once. The constructor functions are executed by the dynamic loader
+  // before the program gains control. The dynamic loader cannot execute the
+  // constructor functions concurrently since doing that would not guarantee
+  // thread safety of the loaded program. Therefore we can assume sequential
+  // execution of constructor functions here.
+  if (IsHIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
+    // The name, size, and initialization pattern of this variable is part
+    // of HIP ABI.
+    GpuBinaryHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
+        llvm::GlobalValue::LinkOnceAnyLinkage,
+        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+        "__hip_gpubin_handle");
+    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
+    Address GpuBinaryAddr(
+        GpuBinaryHandle,
+        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
+    {
+      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+      llvm::Constant *Zero =
+          llvm::Constant::getNullValue(HandleValue->getType());
+      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
+      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
+    }
+    {
+      CtorBuilder.SetInsertPoint(IfBlock);
+      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
+      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+          RegisterFatbinFunc,
+          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
+      CtorBuilder.CreateBr(ExitBlock);
+    }
+    {
+      CtorBuilder.SetInsertPoint(ExitBlock);
+      // Call __hip_register_globals(GpuBinaryHandle);
+      if (RegisterGlobalsFunc) {
+        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
+      }
+    }
+  } else if (!RelocatableDeviceCode) {
+    // Register binary with CUDA runtime. This is substantially different in
+    // default mode vs. separate compilation!
+    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      GpuBinaryHandle = new llvm::GlobalVariable(
          TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy),
-        addUnderscoredPrefixToName("_gpubin_handle"));
-
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
      CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                     CGM.getPointerAlign());
  
-    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
+    // Call __cuda_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc)
        CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
    } else {
@@ -453,15 +511,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
      llvm::Constant *ModuleIDConstant =
          makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
  
-    // Create an alias for the FatbinWrapper that nvcc or hip backend will
-    // look for.
+    // Create an alias for the FatbinWrapper that nvcc will look for.
      llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                                Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
  
-    // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
+    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
      // void *, void (*)(void **))
-    SmallString<128> RegisterLinkedBinaryName(
-        addUnderscoredPrefixToName("RegisterLinkedBinary"));
+    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
      RegisterLinkedBinaryName += ModuleID;
      llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
          getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
@@ -493,11 +549,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  
  /// Creates a global destructor function that unregisters the GPU code blob
  /// registered by constructor.
+///
+/// For CUDA:
  /// \code
  /// void __cuda_module_dtor(void*) {
  ///     __cudaUnregisterFatBinary(Handle);
  /// }
  /// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_dtor(void*) {
+///     if (__hip_gpubin_handle) {
+///         __hipUnregisterFatBinary(__hip_gpubin_handle);
+///         __hip_gpubin_handle = 0;
+///     }
+/// }
+/// \endcode
  llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
    // No need for destructor if we don't have a handle to unregister.
    if (!GpuBinaryHandle)
@@ -518,10 +586,30 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
    CGBuilderTy DtorBuilder(CGM, Context);
    DtorBuilder.SetInsertPoint(DtorEntryBB);
  
-  auto HandleValue =
-      DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-
+  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
+                                             GpuBinaryHandle->getAlignment()));
+  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
+  // There is only one HIP fat binary per linked module, however there are
+  // multiple destructor functions. Make sure the fat binary is unregistered
+  // only once.
+  if (CGM.getLangOpts().HIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
+    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
+    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
+    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
+
+    DtorBuilder.SetInsertPoint(IfBlock);
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
+    DtorBuilder.CreateBr(ExitBlock);
+
+    DtorBuilder.SetInsertPoint(ExitBlock);
+  } else {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+  }
    DtorBuilder.CreateRetVoid();
    return ModuleDtorFunc;
  }
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu

index 85f53c3b4f79523f6a3e098976b4caca59f2a250..716381b7a82649cf9894ba1027fb591c6c857d82 100644 (file)
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -19,7 +19,7 @@
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,HIP,HIPRDC
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
  
@@ -79,11 +79,11 @@ void use_pointers() {
  // CUDA-SAME: section ".nvFatBinSegment"
  // HIP-SAME: section ".hipFatBinSegment"
  // * variable to save GPU binary handle after initialization
-// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null
  // * constant unnamed string with NVModuleID
  // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
  // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
-// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
  // * Make sure our constructor was added to global ctor list.
  // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
  // * Alias to global symbol containing the NVModuleID.
@@ -120,10 +120,18 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
  // ALL: define internal void @__[[PREFIX]]_module_ctor
  
  // In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
+// HIP only register fat binary once.
+// HIP: load i8**, i8*** @__hip_gpubin_handle
+// HIP-NEXT: icmp eq i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
  // NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
  //   .. stores return value in __[[PREFIX]]_gpubin_handle
  // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
  //   .. and then calls __[[PREFIX]]_register_globals
+// HIP-NEXT: br label %exit
+// HIP: exit:
+// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
  // NORDC-NEXT: call void @__[[PREFIX]]_register_globals
  // * In separate mode we also register a destructor.
  // NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
@@ -136,7 +144,14 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
  // Test that we've created destructor.
  // NORDC: define internal void @__[[PREFIX]]_module_dtor
  // NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: icmp ne i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
+// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle
+// HIP-NEXT: br label %exit
+// HIP: exit:
  
  // There should be no __[[PREFIX]]_register_globals if we have no
  // device-side globals, but we still need to register GPU binary.
author	Yaxun Liu <Yaxun.Liu@amd.com>
	Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)
committer	Yaxun Liu <Yaxun.Liu@amd.com>
	Fri, 20 Jul 2018 22:45:24 +0000 (22:45 +0000)
lib/CodeGen/CGCUDANV.cpp		patch \| blob \| history
test/CodeGenCUDA/device-stub.cu		patch \| blob \| history