From 445e59e90c1afb49c4f09fe8fcc1c85318967a2b Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 23 May 2016 20:19:56 +0000 Subject: [PATCH] [CUDA] Add -fcuda-approx-transcendentals flag. Summary: This lets us emit e.g. sin.approx.f32. See http://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sin Reviewers: rnk Subscribers: tra, cfe-commits Differential Revision: http://reviews.llvm.org/D20493 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@270484 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/LangOptions.def | 1 + include/clang/Driver/Options.td | 3 +++ lib/Driver/ToolChains.cpp | 4 ++++ lib/Frontend/CompilerInvocation.cpp | 3 +++ lib/Frontend/InitPreprocessor.cpp | 6 ++++++ lib/Headers/__clang_cuda_runtime_wrapper.h | 14 ++++++++++++++ test/Preprocessor/cuda-approx-transcendentals.cu | 8 ++++++++ 7 files changed, 39 insertions(+) create mode 100644 test/Preprocessor/cuda-approx-transcendentals.cu diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def index b19095c3f9..2ca8dba0e1 100644 --- a/include/clang/Basic/LangOptions.def +++ b/include/clang/Basic/LangOptions.def @@ -190,6 +190,7 @@ LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") +LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions") LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts") diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td index 989a06fa13..ecf0ade1b0 100644 --- a/include/clang/Driver/Options.td +++ b/include/clang/Driver/Options.td @@ -395,6 +395,9 @@ def cuda_path_EQ : 
Joined<["--"], "cuda-path=">, Group, def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">, Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">; def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">; +def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, + Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; +def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp index 2f1420e943..75aa995802 100644 --- a/lib/Driver/ToolChains.cpp +++ b/lib/Driver/ToolChains.cpp @@ -4502,6 +4502,10 @@ CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, options::OPT_fno_cuda_flush_denormals_to_zero, false)) CC1Args.push_back("-fcuda-flush-denormals-to-zero"); + if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, + options::OPT_fno_cuda_approx_transcendentals, false)) + CC1Args.push_back("-fcuda-approx-transcendentals"); + if (DriverArgs.hasArg(options::OPT_nocudalib)) return; diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index c5f839e673..852801ab24 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -1616,6 +1616,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero)) Opts.CUDADeviceFlushDenormalsToZero = 1; + if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) + Opts.CUDADeviceApproxTranscendentals = 1; + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); diff --git 
a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp index 5d38d5f950..f8b407ba26 100644 --- a/lib/Frontend/InitPreprocessor.cpp +++ b/lib/Frontend/InitPreprocessor.cpp @@ -938,6 +938,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__CUDA_ARCH__"); } + // We need to communicate this to our CUDA header wrapper, which in turn + // informs the proper CUDA headers of this choice. + if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) { + Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__"); + } + // OpenCL definitions. if (LangOpts.OpenCL) { #define OPENCLEXT(Ext) \ diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h index 3e41eabac0..bce9d72af3 100644 --- a/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -142,7 +142,20 @@ #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) #include "device_functions.hpp" + +// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we +// get the slow-but-accurate or fast-but-inaccurate versions of functions like +// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals. +// +// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs. +// slow divides), so we need to scope our define carefully here. 
+#pragma push_macro("__USE_FAST_MATH__") +#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) +#define __USE_FAST_MATH__ +#endif #include "math_functions.hpp" +#pragma pop_macro("__USE_FAST_MATH__") + #include "math_functions_dbl_ptx3.hpp" #pragma pop_macro("__forceinline__") @@ -296,6 +309,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { #include "curand_mtgp32_kernel.h" #pragma pop_macro("dim3") #pragma pop_macro("uint3") +#pragma pop_macro("__USE_FAST_MATH__") #endif // __CUDA__ #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/test/Preprocessor/cuda-approx-transcendentals.cu b/test/Preprocessor/cuda-approx-transcendentals.cu new file mode 100644 index 0000000000..8d106ea27a --- /dev/null +++ b/test/Preprocessor/cuda-approx-transcendentals.cu @@ -0,0 +1,8 @@ +// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s +// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s +// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s +// RUN: %clang -ffast-math --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s + +// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__ +// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__ +// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__ -- 2.40.0