[CUDA] Added support for CUDA-8

author Artem Belevich <tra@google.com>

Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)

committer Artem Belevich <tra@google.com>

Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)
author Artem Belevich <tra@google.com>
Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)
committer Artem Belevich <tra@google.com>
Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)
diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp

index 67f165c18bb2de649871bc7be0dc558bbf9d9f36..06cb2278dc273a5d78f0c547ed9d351f727940c3 100644 (file)
--- a/lib/Driver/ToolChains.cpp
+++ b/lib/Driver/ToolChains.cpp
@@ -1774,8 +1774,7 @@ void Generic_GCC::CudaInstallationDetector::init(
          Args.getLastArgValue(options::OPT_cuda_path_EQ));
    else {
      CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
-    // FIXME: Uncomment this once we can compile the cuda 8 headers.
-    // CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0");
+    CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0");
      CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5");
      CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0");
    }
@@ -1795,6 +1794,16 @@ void Generic_GCC::CudaInstallationDetector::init(
            FS.exists(LibDevicePath)))
        continue;
  
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
+        FS.getBufferForFile(InstallPath + "/version.txt");
+    if (!VersionFile) {
+      // CUDA 7.0 doesn't have a version.txt, so guess that's our version if
+      // version.txt isn't present.
+      Version = CudaVersion::CUDA_70;
+    } else {
+      Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
+    }
+
      std::error_code EC;
      for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE;
           !EC && LI != LE; LI = LI.increment(EC)) {
@@ -1807,24 +1816,20 @@ void Generic_GCC::CudaInstallationDetector::init(
        StringRef GpuArch = FileName.slice(
            LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
        LibDeviceMap[GpuArch] = FilePath.str();
-      // Insert map entries for specifc devices with this compute capability.
-      // NVCC's choice of libdevice library version is rather peculiar:
-      // http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection
-      // TODO: this will need to be updated once CUDA-8 is released.
+      // Insert map entries for specific devices with this compute
+      // capability. NVCC's choice of the libdevice library version is
+      // rather peculiar and depends on the CUDA version.
        if (GpuArch == "compute_20") {
          LibDeviceMap["sm_20"] = FilePath;
          LibDeviceMap["sm_21"] = FilePath;
          LibDeviceMap["sm_32"] = FilePath;
        } else if (GpuArch == "compute_30") {
          LibDeviceMap["sm_30"] = FilePath;
-        // compute_30 is the fallback libdevice variant for sm_30+,
-        // unless CUDA specifies different version for specific GPU
-        // arch.
-        LibDeviceMap["sm_50"] = FilePath;
-        LibDeviceMap["sm_52"] = FilePath;
-        LibDeviceMap["sm_53"] = FilePath;
-        // sm_6? are currently all aliases for sm_53 in LLVM and
-        // should use compute_30.
+        if (Version < CudaVersion::CUDA_80) {
+          LibDeviceMap["sm_50"] = FilePath;
+          LibDeviceMap["sm_52"] = FilePath;
+          LibDeviceMap["sm_53"] = FilePath;
+        }
          LibDeviceMap["sm_60"] = FilePath;
          LibDeviceMap["sm_61"] = FilePath;
          LibDeviceMap["sm_62"] = FilePath;
@@ -1832,21 +1837,14 @@ void Generic_GCC::CudaInstallationDetector::init(
          LibDeviceMap["sm_35"] = FilePath;
          LibDeviceMap["sm_37"] = FilePath;
        } else if (GpuArch == "compute_50") {
-        // NVCC does not use compute_50 libdevice at all at the moment.
-        // The version that's shipped with CUDA-7.5 is a copy of compute_30.
+        if (Version >= CudaVersion::CUDA_80) {
+          LibDeviceMap["sm_50"] = FilePath;
+          LibDeviceMap["sm_52"] = FilePath;
+          LibDeviceMap["sm_53"] = FilePath;
+        }
        }
      }
  
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
-        FS.getBufferForFile(InstallPath + "/version.txt");
-    if (!VersionFile) {
-      // CUDA 7.0 doesn't have a version.txt, so guess that's our version if
-      // version.txt isn't present.
-      Version = CudaVersion::CUDA_70;
-    } else {
-      Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
-    }
-
      IsValid = true;
      break;
    }
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h

index 05a85fa3d5651afe736509c1d0c8d50e1572933c..0cf8b17def5e86c7d1083e9b1802295cf5ae29a4 100644 (file)
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
  #include "cuda.h"
  #if !defined(CUDA_VERSION)
  #error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
  #error "Unsupported CUDA version!"
  #endif
  
@@ -113,6 +113,7 @@
  #undef __cxa_vec_ctor
  #undef __cxa_vec_cctor
  #undef __cxa_vec_dtor
+#undef __cxa_vec_new
  #undef __cxa_vec_new2
  #undef __cxa_vec_new3
  #undef __cxa_vec_delete2
@@ -135,6 +136,21 @@
  // the headers we're about to include.
  #define __host__ UNEXPECTED_HOST_ATTRIBUTE
  
+// CUDA 8.0.41 relies on the values of __USE_FAST_MATH__ and __CUDA_PREC_DIV.
+// Previous versions only checked whether they were defined or not.
+// The CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
  // device_functions.hpp and math_functions*.hpp use 'static
  // __forceinline__' (with no __device__) for definitions of device
  // functions. Temporarily redefine __forceinline__ to include
@@ -151,7 +167,7 @@
  // slow divides), so we need to scope our define carefully here.
  #pragma push_macro("__USE_FAST_MATH__")
  #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
-#define __USE_FAST_MATH__
+#define __USE_FAST_MATH__ 1
  #endif
  #include "math_functions.hpp"
  #pragma pop_macro("__USE_FAST_MATH__")
diff --git a/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc b/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc

similarity index 100%

rename from test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc

rename to test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/test/Driver/cuda-detect.cu b/test/Driver/cuda-detect.cu

index 22d36062ae5cc7859c76a68d85c0992ceb11808b..4694cc0eb8d1196aa40e28fb1633476138ae5dbf 100644 (file)
--- a/test/Driver/cuda-detect.cu
+++ b/test/Driver/cuda-detect.cu
@@ -22,13 +22,14 @@
  // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
  // RUN:   | FileCheck %s -check-prefix COMMON \
  // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE20
-// sm_30, sm_5x and sm_6x map to compute_30
+// sm_30 and sm_6x map to compute_30.
  // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
  // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
  // RUN:   | FileCheck %s -check-prefix COMMON \
  // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE30
+// sm_5x is a special case. Maps to compute_30 for cuda-7.x only.
  // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
-// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
  // RUN:   | FileCheck %s -check-prefix COMMON \
  // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE30
  // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \
@@ -44,6 +45,12 @@
  // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
  // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
  // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE35
+// sm_5x -> compute_50 for CUDA-8.0 and newer.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
+// RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE50
+
  
  // Verify that -nocudainc prevents adding include path to CUDA headers.
  // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
@@ -56,8 +63,8 @@
  // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC
  
  // Verify that we get an error if there's no libdevice library to link with.
-// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30  for this purpose.
-// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
+// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \
  // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
  // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE
  
@@ -81,7 +88,7 @@
  // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda
  // NOCUDA-NOT: Found CUDA installation:
  
-// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30.
+// MISSINGLIBDEVICE: error: cannot find libdevice for sm_20.
  
  // COMMON: "-triple" "nvptx-nvidia-cuda"
  // COMMON-SAME: "-fcuda-is-device"
@@ -90,6 +97,7 @@
  // LIBDEVICE20-SAME: libdevice.compute_20.10.bc
  // LIBDEVICE30-SAME: libdevice.compute_30.10.bc
  // LIBDEVICE35-SAME: libdevice.compute_35.10.bc
+// LIBDEVICE50-SAME: libdevice.compute_50.10.bc
  // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc
  // LIBDEVICE-SAME: "-target-feature" "+ptx42"
  // NOLIBDEVICE-NOT: "-target-feature" "+ptx42"
author	Artem Belevich <tra@google.com>
	Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)
committer	Artem Belevich <tra@google.com>
	Wed, 28 Sep 2016 17:47:40 +0000 (17:47 +0000)
lib/Driver/ToolChains.cpp		patch \| blob \| history
lib/Headers/__clang_cuda_runtime_wrapper.h		patch \| blob \| history
test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc	[moved from test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc with 100% similarity]	patch \| blob \| history
test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc	[new file with mode: 0644]	patch \| blob
test/Driver/cuda-detect.cu		patch \| blob \| history