[HIP] Support early finalization of device code for -fno-gpu-rdc

author Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)

committer Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)
author Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)
committer Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def

index 285fcc1458cb9a58380b0c0a8abd6af517641469..8b9284129853624eda3a11d42a50bfa131677606 100644 (file)
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -211,7 +211,7 @@ LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
  LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
  LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
  LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
-LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
+LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
  
  LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
  LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index 83669444a4c260232c87de073c231b50872b5fe7..e006187c0e34fcafe017b224f62d64ab7d1a4ee4 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -584,9 +584,11 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-
  def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
    Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
  def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
-def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>,
+def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>,
    HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
-def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
+def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">;
+def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
+def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
  def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
    HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
  def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
diff --git a/include/clang/Driver/Types.def b/include/clang/Driver/Types.def

index c23f1f13361f75cf63bde01f6195fce5e5f1a138..c25bc4b08084105641e8c77b92a19646dd26aa8f 100644 (file)
--- a/include/clang/Driver/Types.def
+++ b/include/clang/Driver/Types.def
@@ -101,4 +101,5 @@ TYPE("image",                    Image,        INVALID,         "out",   "")
  TYPE("dSYM",                     dSYM,         INVALID,         "dSYM",  "A")
  TYPE("dependencies",             Dependencies, INVALID,         "d",     "")
  TYPE("cuda-fatbin",              CUDA_FATBIN,  INVALID,         "fatbin","A")
+TYPE("hip-fatbin",               HIP_FATBIN,   INVALID,         "hipfb", "A")
  TYPE("none",                     Nothing,      INVALID,         nullptr, "u")
diff --git a/lib/AST/Decl.cpp b/lib/AST/Decl.cpp

index 5a64c67f9b8e896940cee9a7d4773ce6f352aea1..bcdbd3b36e759ccb6ba5c1967baf92ab26f01e0b 100644 (file)
--- a/lib/AST/Decl.cpp
+++ b/lib/AST/Decl.cpp
@@ -2459,7 +2459,7 @@ bool VarDecl::isKnownToBeDefined() const {
    //
    // With CUDA relocatable device code enabled, these variables don't get
    // special handling; they're treated like regular extern variables.
-  if (LangOpts.CUDA && !LangOpts.CUDARelocatableDeviceCode &&
+  if (LangOpts.CUDA && !LangOpts.GPURelocatableDeviceCode &&
        hasExternalStorage() && hasAttr<CUDASharedAttr>() &&
        isa<IncompleteArrayType>(getType()))
      return true;
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp

index 3f3f2c5e43c409ce51a22f9a16ea1403f7c99ea9..d76076326e4661a7adee5667e6a6a04fb4fa9789 100644 (file)
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -137,7 +137,7 @@ CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
      : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
        TheModule(CGM.getModule()),
-      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
+      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
    CodeGen::CodeGenTypes &Types = CGM.getTypes();
    ASTContext &Ctx = CGM.getContext();
  
@@ -353,8 +353,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    // global variable and save a reference in GpuBinaryHandle to be cleaned up
    // in destructor on exit. Then associate all known kernels with the GPU binary
    // handle so CUDA runtime can figure out what to call on the GPU side.
-  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
-  if (!IsHIP) {
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
+  if (!CudaGpuBinaryFileName.empty()) {
      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
          llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
      if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
@@ -388,15 +388,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
      ModuleIDSectionName = "__hip_module_id";
      ModuleIDPrefix = "__hip_";
  
-    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
-    // The external symbol is supposed to contain the fat binary but will be
-    // populated somewhere else, e.g. by lld through link script.
-    FatBinStr = new llvm::GlobalVariable(
+    if (CudaGpuBinary) {
+      // If fatbin is available from early finalization, create a string
+      // literal containing the fat binary loaded from the given file.
+      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                     FatbinConstantName, 8);
+    } else {
+      // If fatbin is not available, create an external symbol
+      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
+      // to contain the fat binary but will be populated somewhere else,
+      // e.g. by lld through link script.
+      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
-    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+    }
  
      FatMagic = HIPFatMagic;
    } else {
@@ -447,6 +455,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    // thread safety of the loaded program. Therefore we can assume sequential
    // execution of constructor functions here.
    if (IsHIP) {
+    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
+        llvm::GlobalValue::LinkOnceAnyLinkage;
      llvm::BasicBlock *IfBlock =
          llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
      llvm::BasicBlock *ExitBlock =
@@ -455,12 +465,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
      // of HIP ABI.
      GpuBinaryHandle = new llvm::GlobalVariable(
          TheModule, VoidPtrPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::LinkOnceAnyLinkage,
+        Linkage,
          /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
          "__hip_gpubin_handle");
      GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
      // Prevent the weak symbol in different shared libraries being merged.
-    GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    if (Linkage != llvm::GlobalValue::InternalLinkage)
+      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
      Address GpuBinaryAddr(
          GpuBinaryHandle,
          CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp

index c5aed55eb94926b72359f3021753429dbd094a72..ebf2ffc3558ffb15af9bb4972c023ec20c90adbc 100644 (file)
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final {
    class HIPActionBuilder final : public CudaActionBuilderBase {
      /// The linker inputs obtained for each device arch.
      SmallVector<ActionList, 8> DeviceLinkerInputs;
+    bool Relocatable;
  
    public:
      HIPActionBuilder(Compilation &C, DerivedArgList &Args,
                       const Driver::InputList &Inputs)
-        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+          Relocatable(false) {}
  
      bool canUseBundlerUnbundler() const override { return true; }
  
@@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final {
                           phases::ID CurPhase, phases::ID FinalPhase,
                           PhasesTy &Phases) override {
        // amdgcn does not support linking of object files, therefore we skip
-      // backend and assemble phases to output LLVM IR.
-      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+      // backend and assemble phases to output LLVM IR. Except for generating
+      // non-relocatable device coee, where we generate fat binary for device
+      // code and pass to host in Backend phase.
+      if (CudaDeviceActions.empty() ||
+          (CurPhase == phases::Backend && Relocatable) ||
            CurPhase == phases::Assemble)
          return ABRT_Success;
  
-      assert((CurPhase == phases::Link ||
+      assert(((CurPhase == phases::Link && Relocatable) ||
                CudaDeviceActions.size() == GpuArchList.size()) &&
               "Expecting one action per GPU architecture.");
        assert(!CompileHostOnly &&
               "Not expecting CUDA actions in host-only compilation.");
  
-      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
-      // This happens to each device action originated from each input file.
-      // Later on, device actions in DeviceLinkerInputs are used to create
-      // device link actions in appendLinkDependences and the created device
-      // link actions are passed to the offload action as device dependence.
-      if (CurPhase == phases::Link) {
+      if (!Relocatable && CurPhase == phases::Backend) {
+        // If we are in backend phase, we attempt to generate the fat binary.
+        // We compile each arch to IR and use a link action to generate code
+        // object containing ISA. Then we use a special "link" action to create
+        // a fat binary containing all the code objects for different GPU's.
+        // The fat binary is then an input to the host action.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Create a link action to link device IR with device library
+          // and generate ISA.
+          ActionList AL;
+          AL.push_back(CudaDeviceActions[I]);
+          CudaDeviceActions[I] =
+              C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+          // OffloadingActionBuilder propagates device arch until an offload
+          // action. Since the next action for creating fatbin does
+          // not have device arch, whereas the above link action and its input
+          // have device arch, an offload action is needed to stop the null
+          // device arch of the next action being propagated to the above link
+          // action.
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+                   CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        // Create HIP fat binary with a special "link" action.
+        CudaFatBinary =
+            C.MakeAction<LinkJobAction>(CudaDeviceActions,
+                types::TY_HIP_FATBIN);
+
+        DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+               AssociatedOffloadKind);
+        // Clear the fat binary, it is already a dependence to an host
+        // action.
+        CudaFatBinary = nullptr;
+
+        // Remove the CUDA actions as they are already connected to an host
+        // action or fat binary.
+        CudaDeviceActions.clear();
+
+        return ABRT_Success;
+      } else if (CurPhase == phases::Link) {
+        // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+        // This happens to each device action originated from each input file.
+        // Later on, device actions in DeviceLinkerInputs are used to create
+        // device link actions in appendLinkDependences and the created device
+        // link actions are passed to the offload action as device dependence.
          DeviceLinkerInputs.resize(CudaDeviceActions.size());
          auto LI = DeviceLinkerInputs.begin();
          for (auto *A : CudaDeviceActions) {
@@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final {
          ++I;
        }
      }
+
+    bool initialize() override {
+      Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+          options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+      return CudaActionBuilderBase::initialize();
+    }
    };
  
    /// OpenMP action builder. The host bitcode is passed to the device frontend
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp

index 3ca31ab5cbd72d47e7d77f4a6efd1481f003bf3a..070abd82508e2793e61af0a431e8ed7f6d138117 100644 (file)
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -4920,16 +4920,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
      CmdArgs.push_back(Args.MakeArgString(Flags));
    }
  
-  if (IsCuda) {
-    // Host-side cuda compilation receives all device-side outputs in a single
-    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
-    if (CudaDeviceInput) {
+  // Host-side cuda compilation receives all device-side outputs in a single
+  // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+  if ((IsCuda || IsHIP) && CudaDeviceInput) {
        CmdArgs.push_back("-fcuda-include-gpubinary");
        CmdArgs.push_back(CudaDeviceInput->getFilename());
-    }
+      if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+        CmdArgs.push_back("-fgpu-rdc");
+  }
  
-    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
-      CmdArgs.push_back("-fcuda-rdc");
+  if (IsCuda) {
      if (Args.hasFlag(options::OPT_fcuda_short_ptr,
                       options::OPT_fno_cuda_short_ptr, false))
        CmdArgs.push_back("-fcuda-short-ptr");
diff --git a/lib/Driver/ToolChains/CommonArgs.cpp b/lib/Driver/ToolChains/CommonArgs.cpp

index 9e6aeea9581216bc328081becc7660570e91d6de..355e8d4c7ba9c3f7ed8705ebe31af77eb60b0f19 100644 (file)
--- a/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/lib/Driver/ToolChains/CommonArgs.cpp
@@ -15,6 +15,7 @@
  #include "Arch/SystemZ.h"
  #include "Arch/X86.h"
  #include "Hexagon.h"
+#include "HIP.h"
  #include "InputInfo.h"
  #include "clang/Basic/CharInfo.h"
  #include "clang/Basic/LangOptions.h"
@@ -1337,6 +1338,18 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
    if (!JA.isHostOffloading(Action::OFK_HIP))
      return;
  
+  InputInfoList DeviceInputs;
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      DeviceInputs.push_back(II);
+    }
+  }
+
+  if (DeviceInputs.empty())
+    return;
+
    // Create temporary linker script. Keep it if save-temps is enabled.
    const char *LKS;
    SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
@@ -1364,39 +1377,12 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
           "Wrong platform");
    (void)HIPTC;
  
-  // Construct clang-offload-bundler command to bundle object files for
-  // for different GPU archs.
-  ArgStringList BundlerArgs;
-  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
-  // ToDo: Remove the dummy host binary entry which is required by
-  // clang-offload-bundler.
-  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
-  std::string BundlerInputArg = "-inputs=/dev/null";
-
-  for (const auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
-      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
-                         StringRef(A->getOffloadingArch()).str();
-      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
-    }
-  }
-  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
-  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
-  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  // The output file name needs to persist through the compilation, therefore
+  // it needs to be created through MakeArgString.
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
    const char *BundleFile =
        C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
-  auto BundlerOutputArg =
-      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
-  BundlerArgs.push_back(BundlerOutputArg);
-
-  SmallString<128> BundlerPath(C.getDriver().Dir);
-  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
-  const char *Bundler = Args.MakeArgString(BundlerPath);
-  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+  AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
  
    // Add commands to embed target binaries. We ensure that each section and
    // image is 16-byte aligned. This is not mandatory, but increases the
diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp

index 08e49fa6a56089bb076f6ba573d9584d0d859bdb..ddcb00809eab8a13f568536387dc91837f4a24e8 100644 (file)
--- a/lib/Driver/ToolChains/Cuda.cpp
+++ b/lib/Driver/ToolChains/Cuda.cpp
@@ -398,8 +398,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                 options::OPT_fnoopenmp_relocatable_target,
                                 /*Default=*/true);
    else if (JA.isOffloading(Action::OFK_Cuda))
-    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
-                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+                               options::OPT_fno_gpu_rdc, /*Default=*/false);
  
    if (Relocatable)
      CmdArgs.push_back("-c");
@@ -609,9 +609,9 @@ void CudaToolChain::addClangTargetOptions(
                             options::OPT_fno_cuda_approx_transcendentals, false))
        CC1Args.push_back("-fcuda-approx-transcendentals");
  
-    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+    if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                             false))
-      CC1Args.push_back("-fcuda-rdc");
+      CC1Args.push_back("-fgpu-rdc");
    }
  
    if (DriverArgs.hasArg(options::OPT_nocudalib))
diff --git a/lib/Driver/ToolChains/HIP.cpp b/lib/Driver/ToolChains/HIP.cpp

index 6efcfaee8fd8987c85d4bc1531293e528f6116dd..58e8e79420d544bd26d6a3a2b8431096e8eed733 100644 (file)
--- a/lib/Driver/ToolChains/HIP.cpp
+++ b/lib/Driver/ToolChains/HIP.cpp
@@ -184,6 +184,40 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
    C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
  }
  
+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPU's into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &Args, const Tool& T) {
+  // Construct clang-offload-bundler command to bundle object files for
+  // for different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const auto* A = II.getAction();
+    BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                       StringRef(A->getOffloadingArch()).str();
+    BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
  // For amdgcn the inputs of the linker job are device bitcode and output is
  // object file. It calls llvm-link, opt, llc, then lld steps.
  void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@@ -192,6 +226,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
  
+  if (JA.getType() == types::TY_HIP_FATBIN)
+    return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
    assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
           "Unsupported target");
  
@@ -244,9 +281,9 @@ void HIPToolChain::addClangTargetOptions(
                           options::OPT_fno_cuda_approx_transcendentals, false))
      CC1Args.push_back("-fcuda-approx-transcendentals");
  
-  if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+  if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                           false))
-    CC1Args.push_back("-fcuda-rdc");
+    CC1Args.push_back("-fgpu-rdc");
  
    // Default to "hidden" visibility, as object level linking will not be
    // supported for the foreseeable future.
diff --git a/lib/Driver/ToolChains/HIP.h b/lib/Driver/ToolChains/HIP.h

index 40c9128e2f5923c28341a7b5e9ad427ea112d273..3af19d44dae0e8d2ad09478432f1e89a7e1f9f7e 100644 (file)
--- a/lib/Driver/ToolChains/HIP.h
+++ b/lib/Driver/ToolChains/HIP.h
@@ -19,6 +19,11 @@ namespace driver {
  namespace tools {
  
  namespace AMDGCN {
+  // Construct command for creating HIP fatbin.
+  void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &TCArgs, const Tool& T);
+
  // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
  // device library, then compiles it to ISA in a shared object.
  class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp

index d4236615015e523f012359348e8a95f9ce34605d..32fc93906e41ffd62c1261cadad5156753582222 100644 (file)
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -2220,7 +2220,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
    if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
      Opts.CUDADeviceApproxTranscendentals = 1;
  
-  Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+  Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);
  
    if (Opts.ObjC1) {
      if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp

index 17b93e0add848228bfddd1ef615e7c8799e70247..a437f1599682709730a354a87666ecb8d93e989c 100644 (file)
--- a/lib/Sema/SemaDeclAttr.cpp
+++ b/lib/Sema/SemaDeclAttr.cpp
@@ -4143,7 +4143,7 @@ static void handleSharedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
    const auto *VD = cast<VarDecl>(D);
    // extern __shared__ is only allowed on arrays with no length (e.g.
    // "int x[]").
-  if (!S.getLangOpts().CUDARelocatableDeviceCode && VD->hasExternalStorage() &&
+  if (!S.getLangOpts().GPURelocatableDeviceCode && VD->hasExternalStorage() &&
        !isa<IncompleteArrayType>(VD->getType())) {
      S.Diag(AL.getLoc(), diag::err_cuda_extern_shared) << VD;
      return;
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu

index 06877da8588f406c54030f66d763027813ba04e7..7abb7ae3a033ef086bc007c4500da0fec154d48f 100644 (file)
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -6,22 +6,22 @@
  // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
  
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
  // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,NORDC,HIP,HIPNEF
  
  #include "Inputs/cuda.h"
  
@@ -64,8 +64,9 @@ void use_pointers() {
  // * constant unnamed string with the kernel name
  // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
  // * constant unnamed string with GPU binary
-// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
  // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
  // CUDANORDC-SAME: section ".nv_fatbin", align 8
  // CUDARDC-SAME: section "__nv_relfatbin", align 8
  // * constant struct that wraps GPU binary
@@ -74,13 +75,14 @@ void use_pointers() {
  // CUDA-SAME: { i32 1180844977, i32 1,
  // HIP-SAME: { i32 1212764230, i32 1,
  // CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
-// HIP-SAME:  i8* @[[FATBIN]],
+// HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIPNEF-SAME:  i8* @[[FATBIN]],
  // ALL-SAME: i8* null }
  // CUDA-SAME: section ".nvFatBinSegment"
  // HIP-SAME: section ".hipFatBinSegment"
  // * variable to save GPU binary handle after initialization
  // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
-// HIP: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
+// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
  // * constant unnamed string with NVModuleID
  // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
  // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
@@ -157,7 +159,7 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
  // device-side globals, but we still need to register GPU binary.
  // Skip GPU binary string first.
  // CUDANOGLOBALS: @{{.*}} = private constant{{.*}}
-// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = internal constant{{.*}}
  // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
  // NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
  // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
diff --git a/test/Driver/cuda-external-tools.cu b/test/Driver/cuda-external-tools.cu

index 367c6997ea874e8c1ecf44318adf26bad411c402..5da6ffe0262ed44f05898bc1a1e5a942c4ec625b 100644 (file)
--- a/test/Driver/cuda-external-tools.cu
+++ b/test/Driver/cuda-external-tools.cu
@@ -19,7 +19,7 @@
  // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
  // Generating relocatable device code
-// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
  
  // With debugging enabled, ptxas should be run with with no ptxas optimizations.
@@ -46,21 +46,21 @@
  // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
  // Separate compilation targeting sm_35.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  
  // 32-bit compile.
  // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
  // 32-bit compile when generating relocatable device code.
-// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
  
  // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
  // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
  // Check that we still pass -c when generating relocatable device code.
-// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
  
  // Check -Xcuda-ptxas and -Xcuda-fatbinary
@@ -77,11 +77,11 @@
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
  
  // Check relocatable device code generation on MacOS.
-// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
-// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
-// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
  
  // Check that CLANG forwards the -v flag to PTXAS.
@@ -92,12 +92,12 @@
  // CHECK: "-cc1"
  // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
  // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
  // SM20-SAME: "-target-cpu" "sm_20"
  // SM35-SAME: "-target-cpu" "sm_35"
  // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
  // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
  
  // Match the call to ptxas (which assembles PTX to SASS).
  // CHECK: ptxas
@@ -141,7 +141,7 @@
  // ARCH64-SAME: "-triple" "x86_64-
  // ARCH32-SAME: "-triple" "i386-
  // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
  
  // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
diff --git a/test/Driver/cuda-phases.cu b/test/Driver/cuda-phases.cu

index e34ffa802d97c1ddd8f99328ae422d8cfddf0012..2160d19f5c6ebd0d59d1b0d5683d5fe1dc63fdfc 100644 (file)
--- a/test/Driver/cuda-phases.cu
+++ b/test/Driver/cuda-phases.cu
@@ -11,12 +11,21 @@
  //
  // Test single gpu architecture with complete compilation.
  //
+// Test CUDA NVPTX phases.
  // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
  // RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+//
+// Test HIP AMDGPU -fgpu-rdc phases.
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_RDC %s
+//
+// Test HIP AMDGPU -fno-gpu-rdc phases (default).
  // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
  // RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_NRDC %s
+//
  // BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
  // BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
  // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
@@ -32,12 +41,17 @@
  // BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
  // BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
  // BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_NRDC-DAG: [[P6:[0-9]+]]: linker, {[[P5]]}, image, (device-hip, [[ARCH]])
+// BIN_AMD_NRDC-DAG: [[P7:[0-9]+]]: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, image
+// BIN_AMD_NRDC-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, hip-fatbin, (device-hip)
+// BIN_AMD_NRDC-DAG: [[P11:[0-9]+]]: offload, "host-hip (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-hip (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// BIN_AMD_NRDC-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
  // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
  // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
-// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
-// BIN_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
+// BIN_AMD_RDC-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD_RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD_RDC-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
  
  //
  // Test single gpu architecture up to the assemble phase.
@@ -46,7 +60,10 @@
  // RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
  // RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
  // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fcuda-rdc %s -S 2>&1 \
  // RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
  // ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
  // ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
@@ -66,7 +83,7 @@
  // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
  // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s 2>&1 \
  // RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
  // BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
  // BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
@@ -105,7 +122,7 @@
  // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
  // RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
  // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s -S 2>&1 \
  // RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
  // ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
  // ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
diff --git a/test/Driver/hip-output-file-name.hip b/test/Driver/hip-output-file-name.hip

index 65887f44d4c03bfb1fde368176b51150aca49f7c..cce9f76406eec6128f79808b500ae5c0d5493770 100644 (file)
--- a/test/Driver/hip-output-file-name.hip
+++ b/test/Driver/hip-output-file-name.hip
@@ -2,7 +2,7 @@
  // REQUIRES: x86-registered-target
  // REQUIRES: amdgpu-registered-target
  
-// RUN: %clang -### -c -target x86_64-linux-gnu \
+// RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \
  // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
  // RUN: 2>&1 | FileCheck %s
  
diff --git a/test/Driver/hip-toolchain-no-rdc.hip b/test/Driver/hip-toolchain-no-rdc.hip

new file mode 100644 (file)

index 0000000..d1fad58
--- /dev/null
+++ b/test/Driver/hip-toolchain-no-rdc.hip
@@ -0,0 +1,150 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -fno-gpu-rdc \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
+// RUN:   -fuse-ld=lld \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+
+//
+// Compile device code in a.cu to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+
+//
+// Compile device code in a.cu to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+
+//
+// Bundle and embed device code in host object for a.cu.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Compile device code in b.hip to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+
+//
+// Compile device code in b.hip to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+
+//
+// Bundle and embed device code in host object for b.hip.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Link host objects.
+//
+
+// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
+// CHECK-NOT: "-T" "{{.*}}.lk"
diff --git a/test/Driver/hip-toolchain.hip b/test/Driver/hip-toolchain-rdc.hip

similarity index 95%

rename from test/Driver/hip-toolchain.hip

rename to test/Driver/hip-toolchain-rdc.hip

index 8bdce487b1d7a2327f1e93ccee4e8d97a7ff711b..8852061f6e2697f82114d9cac9a0704dd13814bc 100644 (file)
--- a/test/Driver/hip-toolchain.hip
+++ b/test/Driver/hip-toolchain-rdc.hip
@@ -7,7 +7,7 @@
  // RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
  // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
  // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
-// RUN:   -fuse-ld=lld \
+// RUN:   -fuse-ld=lld -fgpu-rdc \
  // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
  // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
  // RUN: 2>&1 | FileCheck %s
@@ -15,14 +15,14 @@
  // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
  // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
  // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
  // CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
  // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
  
  // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
  // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
  // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
  // CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
  // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
  
@@ -80,7 +80,7 @@
  
  // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
  // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*o]]"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
  
  // CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
  // CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
diff --git a/test/SemaCUDA/extern-shared.cu b/test/SemaCUDA/extern-shared.cu

index ec6f31695052a233073c34a44aa96c1bcd902160..bc4f10d3082c0d1906fa168d6c2582e585e15bf8 100644 (file)
--- a/test/SemaCUDA/extern-shared.cu
+++ b/test/SemaCUDA/extern-shared.cu
@@ -1,8 +1,8 @@
  // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -verify %s
  // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -verify %s
  
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-rdc -verify=rdc %s
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fcuda-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fgpu-rdc -verify=rdc %s
  
  // Most of these declarations are fine in separate compilation mode.
author	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)
committer	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 2 Oct 2018 17:48:54 +0000 (17:48 +0000)
include/clang/Basic/LangOptions.def		patch \| blob \| history
include/clang/Driver/Options.td		patch \| blob \| history
include/clang/Driver/Types.def		patch \| blob \| history
lib/AST/Decl.cpp		patch \| blob \| history
lib/CodeGen/CGCUDANV.cpp		patch \| blob \| history
lib/Driver/Driver.cpp		patch \| blob \| history
lib/Driver/ToolChains/Clang.cpp		patch \| blob \| history
lib/Driver/ToolChains/CommonArgs.cpp		patch \| blob \| history
lib/Driver/ToolChains/Cuda.cpp		patch \| blob \| history
lib/Driver/ToolChains/HIP.cpp		patch \| blob \| history
lib/Driver/ToolChains/HIP.h		patch \| blob \| history
lib/Frontend/CompilerInvocation.cpp		patch \| blob \| history
lib/Sema/SemaDeclAttr.cpp		patch \| blob \| history
test/CodeGenCUDA/device-stub.cu		patch \| blob \| history
test/Driver/cuda-external-tools.cu		patch \| blob \| history
test/Driver/cuda-phases.cu		patch \| blob \| history
test/Driver/hip-output-file-name.hip		patch \| blob \| history
test/Driver/hip-toolchain-no-rdc.hip	[new file with mode: 0644]	patch \| blob
test/Driver/hip-toolchain-rdc.hip	[moved from test/Driver/hip-toolchain.hip with 95% similarity]	patch \| blob \| history
test/SemaCUDA/extern-shared.cu		patch \| blob \| history