From 92b0ff91a2faa573f171d41835303114a24d6729 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 5 Apr 2016 18:26:20 +0000 Subject: [PATCH] [CUDA] Add -fcuda-flush-denormals-to-zero. Summary: Setting this flag causes all functions are annotated with the "nvvm-f32ftz" = "true" attribute. In addition, we annotate the module with "nvvm-reflect-ftz" set to 0 or 1, depending on whether -cuda-flush-denormals-to-zero is set. This is read by the NVVMReflect pass. Reviewers: tra, rnk Subscribers: cfe-commits Differential Revision: http://reviews.llvm.org/D18671 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@265435 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/LangOptions.def | 1 + include/clang/Driver/Options.td | 3 +++ lib/CodeGen/CGCall.cpp | 4 ++++ lib/CodeGen/CodeGenModule.cpp | 8 ++++++++ lib/Driver/ToolChains.cpp | 4 ++++ lib/Frontend/CompilerInvocation.cpp | 3 +++ test/CodeGenCUDA/flush-denormals.cu | 23 +++++++++++++++++++++++ 7 files changed, 46 insertions(+) create mode 100644 test/CodeGenCUDA/flush-denormals.cu diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def index a628368842..43e96d10d6 100644 --- a/include/clang/Basic/LangOptions.def +++ b/include/clang/Basic/LangOptions.def @@ -173,6 +173,7 @@ LANGOPT(OpenMPIsDevice , 1, 0, "Generate code only for OpenMP target device") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") +LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators") LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions") diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td index a1befe379b..9af2043999 100644 --- a/include/clang/Driver/Options.td +++ b/include/clang/Driver/Options.td @@ -382,6 +382,9 @@ def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">, HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">; def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group, HelpText<"CUDA installation path">; +def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">, + Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">; +def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp index cbd7422e6a..04c0a11600 100644 --- a/lib/CodeGen/CGCall.cpp +++ b/lib/CodeGen/CGCall.cpp @@ -1768,6 +1768,10 @@ void CodeGenModule::ConstructAttributeList( // __syncthreads(), and so can't have certain optimizations applied around // them). LLVM will remove this attribute where it safely can. FuncAttrs.addAttribute(llvm::Attribute::Convergent); + + // Respect -fcuda-flush-denormals-to-zero. + if (getLangOpts().CUDADeviceFlushDenormalsToZero) + FuncAttrs.addAttribute("nvptx-f32ftz", "true"); } ClangToLLVMArgMapping IRFunctionArgs(getContext(), FI); diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp index 11f5bea64c..f525246e6e 100644 --- a/lib/CodeGen/CodeGenModule.cpp +++ b/lib/CodeGen/CodeGenModule.cpp @@ -472,6 +472,14 @@ void CodeGenModule::Release() { getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1); } + if (LangOpts.CUDAIsDevice && getTarget().getTriple().isNVPTX()) { + // Indicate whether __nvvm_reflect should be configured to flush denormal + // floating point values to 0. (This corresponds to its "__CUDA_FTZ" + // property.) + getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", + LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0); + } + if (uint32_t PLevel = Context.getLangOpts().PICLevel) { llvm::PICLevel::Level PL = llvm::PICLevel::Default; switch (PLevel) { diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp index 11ded7cd5e..902338b288 100644 --- a/lib/Driver/ToolChains.cpp +++ b/lib/Driver/ToolChains.cpp @@ -4208,6 +4208,10 @@ CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, Linux::addClangTargetOptions(DriverArgs, CC1Args); CC1Args.push_back("-fcuda-is-device"); + if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, false)) + CC1Args.push_back("-fcuda-flush-denormals-to-zero"); + if (DriverArgs.hasArg(options::OPT_nocudalib)) return; diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index 4d97b76d75..ba56665bc8 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -1571,6 +1571,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, if (Args.hasArg(OPT_fno_cuda_host_device_constexpr)) Opts.CUDAHostDeviceConstexpr = 0; + if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero)) + Opts.CUDADeviceFlushDenormalsToZero = 1; + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); diff --git a/test/CodeGenCUDA/flush-denormals.cu b/test/CodeGenCUDA/flush-denormals.cu new file mode 100644 index 0000000000..cab660254d --- /dev/null +++ b/test/CodeGenCUDA/flush-denormals.cu @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -fcuda-is-device \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | FileCheck %s -check-prefix NOFTZ +// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | FileCheck %s -check-prefix FTZ + +#include "Inputs/cuda.h" + +// Checks that device function calls get emitted with the "ntpvx-f32ftz" +// attribute set to "true" when we compile CUDA device code with +// -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence +// or absence of -fcuda-flush-denormals-to-zero in a module flag. + +// CHECK: define void @foo() #0 +extern "C" __device__ void foo() {} + +// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" +// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" + +// FTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]} +// FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} + +// NOFTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]} +// NOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0} -- 2.40.0