From b50d315fd23f0fbc4c11e2583801dd123d933745 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy
Date: Mon, 26 Nov 2018 23:30:00 +0000
Subject: [PATCH] math: add fp_arch.h with fp_barrier and fp_force_eval

C99 has ways to support fenv access, but compilers don't implement it
and assume nearest rounding mode and no fp status flag access. (gcc has
-frounding-math and then it does not assume nearest rounding mode, but
it still assumes the compiled code itself does not change the mode.
Even if the C99 mechanism was implemented it is not ideal: it requires
all code in the library to be compiled with FENV_ACCESS "on" to make it
usable in non-nearest rounding mode, but that limits optimizations more
than necessary.)

The math functions should give reasonable results in all rounding modes
(but the quality may be degraded in non-nearest rounding modes) and the
fp status flag settings should follow the spec, so fenv side-effects
are important and code transformations that break them should be
prevented.

Unfortunately compilers don't give any help with this; the best we can
do is to add fp barriers to the code using volatile local variables
(they create a stack frame and undesirable memory accesses to it),
inline asm (gcc specific, requires target specific fp reg constraints,
often creates unnecessary reg moves, and multiple barriers are needed
to express that an operation has side-effects) or an extern call (only
useful in tail-call position to avoid stack-frame creation and does not
work with lto).

We assume that in a math function, if an operation depends on the input
and the output depends on it, then the operation will be evaluated at
runtime when the function is called, producing all the expected fenv
side-effects (this is not true in case of lto and in case the operation
is evaluated with excess precision that is not rounded away). So fp
barriers are needed (1) to prevent the move of an operation within a
function (in case it may be moved from an unevaluated code path into an
evaluated one, or if it may be moved across a fenv access), (2) to force
the evaluation of an operation for its side-effect when it has no input
dependency (may be constant folded), or (3) when its output is unused.
I believe that fp_barrier and fp_force_eval can take care of these and
they should not be needed in hot code paths.
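
For illustration, a minimal sketch of how the barriers are meant to be
used in a special-case path; the helper name, its structure and the
thresholds are hypothetical and not part of this patch:

#include <math.h>
#include "libm.h"	/* provides fp_barrierf and fp_force_evalf */

static float scale_result(float y, int huge)
{
	if (huge)
		/* fp_barrierf hides the constant operand, so the
		   multiplication overflows at run time and raises
		   the overflow and inexact flags instead of being
		   constant folded at compile time.  */
		return fp_barrierf(0x1p127f) * 0x1p127f;

	if (y != 0 && fabsf(y) < 0x1p-126f)
		/* The product is used only for its side-effect:
		   fp_force_evalf keeps the otherwise dead
		   computation alive so that underflow and inexact
		   are raised for tiny results.  */
		fp_force_evalf(y * y);

	return y;
}

The barriers only restrict constant folding, dead code elimination and
code motion; they do not change the value that is computed.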
---
 arch/aarch64/fp_arch.h | 25 +++++++++++++++
 arch/generic/fp_arch.h |  0
 src/internal/libm.h    | 71 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 6 deletions(-)
 create mode 100644 arch/aarch64/fp_arch.h
 create mode 100644 arch/generic/fp_arch.h

diff --git a/arch/aarch64/fp_arch.h b/arch/aarch64/fp_arch.h
new file mode 100644
index 00000000..f3d445b9
--- /dev/null
+++ b/arch/aarch64/fp_arch.h
@@ -0,0 +1,25 @@
+#define fp_barrierf fp_barrierf
+static inline float fp_barrierf(float x)
+{
+	__asm__ __volatile__ ("" : "+w"(x));
+	return x;
+}
+
+#define fp_barrier fp_barrier
+static inline double fp_barrier(double x)
+{
+	__asm__ __volatile__ ("" : "+w"(x));
+	return x;
+}
+
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	__asm__ __volatile__ ("" : "+w"(x));
+}
+
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	__asm__ __volatile__ ("" : "+w"(x));
+}
diff --git a/arch/generic/fp_arch.h b/arch/generic/fp_arch.h
new file mode 100644
index 00000000..e69de29b
diff --git a/src/internal/libm.h b/src/internal/libm.h
index f7dd9678..5669c046 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -5,6 +5,7 @@
 #include <float.h>
 #include <math.h>
 #include <endian.h>
+#include "fp_arch.h"
 
 #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 #elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384 && __BYTE_ORDER == __LITTLE_ENDIAN
@@ -58,16 +59,74 @@ union ldshape {
 #error Unsupported long double representation
 #endif
 
+/* fp_barrier returns its input, but limits code transformations
+   as if it had a side-effect (e.g. observable io) and returned
+   an arbitrary value. */
+
+#ifndef fp_barrierf
+#define fp_barrierf fp_barrierf
+static inline float fp_barrierf(float x)
+{
+	volatile float y = x;
+	return y;
+}
+#endif
+
+#ifndef fp_barrier
+#define fp_barrier fp_barrier
+static inline double fp_barrier(double x)
+{
+	volatile double y = x;
+	return y;
+}
+#endif
+
+#ifndef fp_barrierl
+#define fp_barrierl fp_barrierl
+static inline long double fp_barrierl(long double x)
+{
+	volatile long double y = x;
+	return y;
+}
+#endif
+
+/* fp_force_eval ensures that the input value is computed when that's
+   otherwise unused. To prevent the constant folding of the input
+   expression, an additional fp_barrier may be needed or a compilation
+   mode that does so (e.g. -frounding-math in gcc). Then it can be
+   used to evaluate an expression for its fenv side-effects only. */
+
+#ifndef fp_force_evalf
+#define fp_force_evalf fp_force_evalf
+static inline void fp_force_evalf(float x)
+{
+	volatile float y = x;
+}
+#endif
+
+#ifndef fp_force_eval
+#define fp_force_eval fp_force_eval
+static inline void fp_force_eval(double x)
+{
+	volatile double y = x;
+}
+#endif
+
+#ifndef fp_force_evall
+#define fp_force_evall fp_force_evall
+static inline void fp_force_evall(long double x)
+{
+	volatile long double y = x;
+}
+#endif
+
 #define FORCE_EVAL(x) do {                        \
 	if (sizeof(x) == sizeof(float)) {         \
-		volatile float __x;               \
-		__x = (x);                        \
+		fp_force_evalf(x);                \
 	} else if (sizeof(x) == sizeof(double)) { \
-		volatile double __x;              \
-		__x = (x);                        \
+		fp_force_eval(x);                 \
 	} else {                                  \
-		volatile long double __x;         \
-		__x = (x);                        \
+		fp_force_evall(x);                \
 	}                                         \
 } while(0)
 
-- 
2.40.0