mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-24 12:25:35 +08:00
e70c176825
Optimized exp and exp2 implementations using a lookup table for fractional powers of 2. There are several variants, see e_exp_data.c, they can be selected by modifying math_config.h allowing different tradeoffs. The default selection should be acceptable as generic libm code. Worst case error is 0.509 ULP for exp and 0.507 ULP for exp2, on aarch64 the rodata size is 2160 bytes, shared between exp and exp2. On aarch64 .text + .rodata size decreased by 24912 bytes. The non-nearest rounding error is less than 1 ULP even on targets without efficient round implementation (although the error rate is higher in that case). Targets with single instruction, rounding mode independent, to nearest integer rounding and conversion can use them by setting TOINT_INTRINSICS and adding the necessary code to their math_private.h. The __exp1 code uses the same algorithm, so the error bound of pow increased a bit. New double precision error handling code was added following the style of the single precision error handling code. Improvements on Cortex-A72 compared to current glibc master: exp thruput: 1.61x in [-9.9 9.9] exp latency: 1.53x in [-9.9 9.9] exp thruput: 1.13x in [0.5 1] exp latency: 1.30x in [0.5 1] exp2 thruput: 2.03x in [-9.9 9.9] exp2 latency: 1.64x in [-9.9 9.9] For small (< 1) inputs the current exp code uses a separate algorithm so the speed up there is less. Was tested on aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and arm-linux-gnueabihf (!TOINT_INTRINSICS, no fma contraction) and x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction) and powerpc64le-linux-gnu (!TOINT_INTRINSICS, fma contraction) targets, only non-nearest rounding ulp errors increase and they are within acceptable bounds (ulp updates are in separate patches). * NEWS: Mention exp and exp2 improvements. * math/Makefile (libm-support): Remove t_exp. (type-double-routines): Add math_err and e_exp_data. * sysdeps/aarch64/libm-test-ulps: Update. * sysdeps/arm/libm-test-ulps: Update. 
* sysdeps/i386/fpu/e_exp_data.c: New file. * sysdeps/i386/fpu/math_err.c: New file. * sysdeps/i386/fpu/t_exp.c: Remove. * sysdeps/ia64/fpu/e_exp_data.c: New file. * sysdeps/ia64/fpu/math_err.c: New file. * sysdeps/ia64/fpu/t_exp.c: Remove. * sysdeps/ieee754/dbl-64/e_exp.c: Rewrite. * sysdeps/ieee754/dbl-64/e_exp2.c: Rewrite. * sysdeps/ieee754/dbl-64/e_exp_data.c: New file. * sysdeps/ieee754/dbl-64/e_pow.c (__ieee754_pow): Update error bound. * sysdeps/ieee754/dbl-64/eexp.tbl: Remove. * sysdeps/ieee754/dbl-64/math_config.h: New file. * sysdeps/ieee754/dbl-64/math_err.c: New file. * sysdeps/ieee754/dbl-64/t_exp.c: Remove. * sysdeps/ieee754/dbl-64/t_exp2.h: Remove. * sysdeps/ieee754/dbl-64/uexp.h: Remove. * sysdeps/ieee754/dbl-64/uexp.tbl: Remove. * sysdeps/m68k/m680x0/fpu/e_exp_data.c: New file. * sysdeps/m68k/m680x0/fpu/math_err.c: New file. * sysdeps/m68k/m680x0/fpu/t_exp.c: Remove. * sysdeps/powerpc/fpu/libm-test-ulps: Update. * sysdeps/x86_64/fpu/libm-test-ulps: Update.
178 lines
5.7 KiB
C
/* Double-precision e^x function.
|
|
Copyright (C) 2018 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <math.h>
|
|
#include <stdint.h>
|
|
#include <math-barriers.h>
|
|
#include <math-narrow-eval.h>
|
|
#include "math_config.h"
|
|
|
|
/* Number of subdivisions of one binade used by the 2^(k/N) lookup table;
   the table T is indexed with 2 * (k % N), i.e. it holds one pair of
   entries per subdivision.  */
#define N (1 << EXP_TABLE_BITS)

/* Shorthands for the constants stored in __exp_data (declared in
   math_config.h).  As used by exp_inline below:
     InvLn2N              multiplies x to obtain z ~= x * N/ln2;
     NegLn2hiN, NegLn2loN high and low parts of the multiplier applied to
                          k in the reduction r = x + k*hi + k*lo;
     Shift                added to z to round it to an integer when
                          TOINT_INTRINSICS is not available;
     T                    table of (tail, scale bits) pairs for 2^(k/N).  */
#define InvLn2N __exp_data.invln2N
#define NegLn2hiN __exp_data.negln2hiN
#define NegLn2loN __exp_data.negln2loN
#define Shift __exp_data.shift
#define T __exp_data.tab

/* Coefficients of r^2 .. r^5 in the polynomial approximating exp(r) - 1 - r.
   The index is relative to EXP_POLY_ORDER, so with EXP_POLY_ORDER == 5 these
   map to poly[0] .. poly[3].
   NOTE(review): only the order-5 layout is exercised by this file; confirm
   the poly[] layout in e_exp_data.c before changing EXP_POLY_ORDER.  */
#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
|
|
|
|
/* Handle cases that may overflow or underflow when computing the result that
|
|
is scale*(1+TMP) without intermediate rounding. The bit representation of
|
|
scale is in SBITS, however it has a computed exponent that may have
|
|
overflown into the sign bit so that needs to be adjusted before using it as
|
|
a double. (int32_t)KI is the k used in the argument reduction and exponent
|
|
adjustment of scale, positive k here means the result may overflow and
|
|
negative k means the result may underflow. */
|
|
static inline double
|
|
specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
|
|
{
|
|
double_t scale, y;
|
|
|
|
if ((ki & 0x80000000) == 0)
|
|
{
|
|
/* k > 0, the exponent of scale might have overflowed by <= 460. */
|
|
sbits -= 1009ull << 52;
|
|
scale = asdouble (sbits);
|
|
y = 0x1p1009 * (scale + scale * tmp);
|
|
return check_oflow (y);
|
|
}
|
|
/* k < 0, need special care in the subnormal range. */
|
|
sbits += 1022ull << 52;
|
|
scale = asdouble (sbits);
|
|
y = scale + scale * tmp;
|
|
if (y < 1.0)
|
|
{
|
|
/* Round y to the right precision before scaling it into the subnormal
|
|
range to avoid double rounding that can cause 0.5+E/2 ulp error where
|
|
E is the worst-case ulp error outside the subnormal range. So this
|
|
is only useful if the goal is better than 1 ulp worst-case error. */
|
|
double_t hi, lo;
|
|
lo = scale - y + scale * tmp;
|
|
hi = 1.0 + y;
|
|
lo = 1.0 - hi + y + lo;
|
|
y = math_narrow_eval (hi + lo) - 1.0;
|
|
/* Avoid -0.0 with downward rounding. */
|
|
if (WANT_ROUNDING && y == 0.0)
|
|
y = 0.0;
|
|
/* The underflow exception needs to be signaled explicitly. */
|
|
math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
|
|
}
|
|
y = 0x1p-1022 * y;
|
|
return check_uflow (y);
|
|
}
|
|
|
|
/* Top 12 bits of a double (sign and exponent bits).

   The bit pattern of X is shifted right by 52, so the 11 biased exponent
   bits end up in bits 0..10 of the result and the sign in bit 11.  Callers
   mask with 0x7ff when they only want the exponent of |X|.  */
static inline uint32_t
top12 (double x)
{
  return asuint64 (x) >> 52;
}
|
|
|
|
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
|
|
If hastail is 0 then xtail is assumed to be 0 too. */
|
|
static inline double
|
|
exp_inline (double x, double xtail, int hastail)
|
|
{
|
|
uint32_t abstop;
|
|
uint64_t ki, idx, top, sbits;
|
|
/* double_t for better performance on targets with FLT_EVAL_METHOD==2. */
|
|
double_t kd, z, r, r2, scale, tail, tmp;
|
|
|
|
abstop = top12 (x) & 0x7ff;
|
|
if (__glibc_unlikely (abstop - top12 (0x1p-54)
|
|
>= top12 (512.0) - top12 (0x1p-54)))
|
|
{
|
|
if (abstop - top12 (0x1p-54) >= 0x80000000)
|
|
/* Avoid spurious underflow for tiny x. */
|
|
/* Note: 0 is common input. */
|
|
return WANT_ROUNDING ? 1.0 + x : 1.0;
|
|
if (abstop >= top12 (1024.0))
|
|
{
|
|
if (asuint64 (x) == asuint64 (-INFINITY))
|
|
return 0.0;
|
|
if (abstop >= top12 (INFINITY))
|
|
return 1.0 + x;
|
|
if (asuint64 (x) >> 63)
|
|
return __math_uflow (0);
|
|
else
|
|
return __math_oflow (0);
|
|
}
|
|
/* Large x is special cased below. */
|
|
abstop = 0;
|
|
}
|
|
|
|
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
|
|
/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
|
|
z = InvLn2N * x;
|
|
#if TOINT_INTRINSICS
|
|
kd = roundtoint (z);
|
|
ki = converttoint (z);
|
|
#else
|
|
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
|
|
kd = math_narrow_eval (z + Shift);
|
|
ki = asuint64 (kd);
|
|
kd -= Shift;
|
|
#endif
|
|
r = x + kd * NegLn2hiN + kd * NegLn2loN;
|
|
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
|
|
if (hastail)
|
|
r += xtail;
|
|
/* 2^(k/N) ~= scale * (1 + tail). */
|
|
idx = 2 * (ki % N);
|
|
top = ki << (52 - EXP_TABLE_BITS);
|
|
tail = asdouble (T[idx]);
|
|
/* This is only a valid scale when -1023*N < k < 1024*N. */
|
|
sbits = T[idx + 1] + top;
|
|
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */
|
|
/* Evaluation is optimized assuming superscalar pipelined execution. */
|
|
r2 = r * r;
|
|
/* Without fma the worst case error is 0.25/N ulp larger. */
|
|
/* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */
|
|
tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
|
|
if (__glibc_unlikely (abstop == 0))
|
|
return specialcase (tmp, sbits, ki);
|
|
scale = asdouble (sbits);
|
|
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
|
is no spurious underflow here even without fma. */
|
|
return scale + scale * tmp;
|
|
}
|
|
|
|
/* SECTION is placed between the return type and the name of the public
   functions below.  It expands to nothing unless it was already defined
   before this point (presumably by a file that includes this one to build
   a variant with a section attribute -- confirm against the includers).  */
#ifndef SECTION
# define SECTION
#endif
|
|
|
|
double
|
|
SECTION
|
|
__ieee754_exp (double x)
|
|
{
|
|
return exp_inline (x, 0, 0);
|
|
}
|
|
#ifndef __ieee754_exp
|
|
strong_alias (__ieee754_exp, __exp_finite)
|
|
#endif
|
|
|
|
/* Compute e^(x+xx). */
|
|
double
|
|
SECTION
|
|
__exp1 (double x, double xx)
|
|
{
|
|
return exp_inline (x, xx, 1);
|
|
}
|