From f159984fe052abd07234c80f7cc15902ba8e297d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 19 May 2016 18:44:45 +0000 Subject: [PATCH] [CUDA] Enable fusing FP ops (-ffp-contract=fast) for CUDA by default. This matches default nvcc behavior and gives substantial performance boost on GPU where fmad is much cheaper compared to add+mul. Differential Revision: http://reviews.llvm.org/D20341 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@270094 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Frontend/CompilerInvocation.cpp | 13 ++++++++---- test/CodeGenCUDA/fp-contract.cu | 32 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 test/CodeGenCUDA/fp-contract.cu diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index 7c3850eeab..c5f839e673 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -2255,10 +2255,15 @@ bool CompilerInvocation::CreateFromArgs(CompilerInvocation &Res, LangOpts.ObjCExceptions = 1; } - // During CUDA device-side compilation, the aux triple is the triple used for - // host compilation. - if (LangOpts.CUDA && LangOpts.CUDAIsDevice) { - Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple; + if (LangOpts.CUDA) { + // During CUDA device-side compilation, the aux triple is the + // triple used for host compilation. + if (LangOpts.CUDAIsDevice) + Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple; + + // Set default FP_CONTRACT to FAST. + if (!Args.hasArg(OPT_ffp_contract)) + Res.getCodeGenOpts().setFPContractMode(CodeGenOptions::FPC_Fast); } // FIXME: Override value name discarding when asan or msan is used because the diff --git a/test/CodeGenCUDA/fp-contract.cu b/test/CodeGenCUDA/fp-contract.cu new file mode 100644 index 0000000000..070ebaea44 --- /dev/null +++ b/test/CodeGenCUDA/fp-contract.cu @@ -0,0 +1,32 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +// By default we should fuse multiply/add into fma instruction. +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s + +// Explicit -ffp-contract=fast +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefix ENABLED %s + +// Explicit -ffp-contract=on -- fusing by front-end (disabled). +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -ffp-contract=on -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefix DISABLED %s + +// Explicit -ffp-contract=off should disable instruction fusing. +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \ +// RUN: -ffp-contract=off -disable-llvm-passes -o - %s \ +// RUN: | FileCheck -check-prefix DISABLED %s + + +#include "Inputs/cuda.h" + +__host__ __device__ float func(float a, float b, float c) { return a + b * c; } +// ENABLED: fma.rn.f32 +// ENABLED-NEXT: st.param.f32 + +// DISABLED: mul.rn.f32 +// DISABLED-NEXT: add.rn.f32 +// DISABLED-NEXT: st.param.f32 -- 2.40.0