[CUDA] Add -fcuda-approx-transcendentals flag.

author Justin Lebar <jlebar@google.com>

Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)

committer Justin Lebar <jlebar@google.com>

Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)
author Justin Lebar <jlebar@google.com>
Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)
committer Justin Lebar <jlebar@google.com>
Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def

index b19095c3f905941be0b263a959bfcabf6fda4806..2ca8dba0e108a78349aabd17c6e925a12e51f912 100644 (file)
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -190,6 +190,7 @@ LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
  LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
  LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
  LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
+LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
  
  LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
  LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts")
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index 989a06fa13dc6c63a591b6de5f64975687284851..ecf0ade1b08b2f78524582501091bf92d1ff61b4 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -395,6 +395,9 @@ def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
  def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
    Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
  def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
+def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
+  Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
+def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
  def dA : Flag<["-"], "dA">, Group<d_Group>;
  def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
    HelpText<"Print macro definitions in -E mode in addition to normal output">;
diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp

index 2f1420e943f964d3fe5b313a99ff25b555c89364..75aa99580282b1d5357038b9206543253565b706 100644 (file)
--- a/lib/Driver/ToolChains.cpp
+++ b/lib/Driver/ToolChains.cpp
@@ -4502,6 +4502,10 @@ CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                           options::OPT_fno_cuda_flush_denormals_to_zero, false))
      CC1Args.push_back("-fcuda-flush-denormals-to-zero");
  
+  if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
+                         options::OPT_fno_cuda_approx_transcendentals, false))
+    CC1Args.push_back("-fcuda-approx-transcendentals");
+
    if (DriverArgs.hasArg(options::OPT_nocudalib))
      return;
  
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp

index c5f839e67374394bd57c50cf86057ab9f11edf95..852801ab24053779b0ae212144d49bf614ac7aeb 100644 (file)
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -1616,6 +1616,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
    if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
      Opts.CUDADeviceFlushDenormalsToZero = 1;
  
+  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
+    Opts.CUDADeviceApproxTranscendentals = 1;
+
    if (Opts.ObjC1) {
      if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
        StringRef value = arg->getValue();
diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp

index 5d38d5f95035290823b7d9607fdc5969ebdb0b80..f8b407ba26481468987b56026d8b599b93799d38 100644 (file)
--- a/lib/Frontend/InitPreprocessor.cpp
+++ b/lib/Frontend/InitPreprocessor.cpp
@@ -938,6 +938,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
      Builder.defineMacro("__CUDA_ARCH__");
    }
  
+  // We need to communicate this to our CUDA header wrapper, which in turn
+  // informs the proper CUDA headers of this choice.
+  if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
+    Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
+  }
+
    // OpenCL definitions.
    if (LangOpts.OpenCL) {
  #define OPENCLEXT(Ext) \
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h

index 3e41eabac032882d9c499b291d716993dc081b17..bce9d72af354f1e180e63d9a446c4ea4cb7c1554 100644 (file)
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -142,7 +142,20 @@
  #pragma push_macro("__forceinline__")
  #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
  #include "device_functions.hpp"
+
+// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__
+#endif
  #include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
  #include "math_functions_dbl_ptx3.hpp"
  #pragma pop_macro("__forceinline__")
  
@@ -296,6 +309,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
  #include "curand_mtgp32_kernel.h"
  #pragma pop_macro("dim3")
  #pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
  
  #endif // __CUDA__
  #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/test/Preprocessor/cuda-approx-transcendentals.cu b/test/Preprocessor/cuda-approx-transcendentals.cu

new file mode 100644 (file)

index 0000000..8d106ea
--- /dev/null
+++ b/test/Preprocessor/cuda-approx-transcendentals.cu
@@ -0,0 +1,8 @@
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s
+// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s
+// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+// RUN: %clang -ffast-math --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+
+// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
author	Justin Lebar <jlebar@google.com>
	Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)
committer	Justin Lebar <jlebar@google.com>
	Mon, 23 May 2016 20:19:56 +0000 (20:19 +0000)
include/clang/Basic/LangOptions.def		patch \| blob \| history
include/clang/Driver/Options.td		patch \| blob \| history
lib/Driver/ToolChains.cpp		patch \| blob \| history
lib/Frontend/CompilerInvocation.cpp		patch \| blob \| history
lib/Frontend/InitPreprocessor.cpp		patch \| blob \| history
lib/Headers/__clang_cuda_runtime_wrapper.h		patch \| blob \| history
test/Preprocessor/cuda-approx-transcendentals.cu	[new file with mode: 0644]	patch \| blob