glibc/sysdeps/ieee754/flt-32/s_sincosf.h

/* Used by sinf, cosf and sincosf functions.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <stdint.h>
#include <math.h>
#include "math_config.h"
#include <sincosf_poly.h>

/* 2PI * 2^-64 (equivalently PI * 2^-63).  reduce_large multiplies its
   signed 64-bit fixed-point remainder by this constant to obtain the
   reduced argument as a double.  */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4.  Not referenced by the helpers in this header; presumably used
   by the files that include it -- confirm against the callers.  */
static const double pio4 = 0x1.921FB54442D18p-1;

/* Polynomial data (the cosine polynomial is negated in the 2nd entry).
   Only declared here; the definition is not part of this header.  */
extern const sincos_t __sincosf_table[2] attribute_hidden;

/* Table with 4/PI to 192 bit precision, stored as 32-bit words.
   reduce_large selects a starting word from bits 26..29 of its argument
   and reads the words at offsets 0, 4 and 8 from there.  */
extern const uint32_t __inv_pio4[] attribute_hidden;

/* Return the top 12 bits of the float representation of X with the sign
   bit masked off, i.e. bits 20..30 of the value produced by asuint.  */
static inline uint32_t
abstop12 (float x)
{
  const uint32_t bits = asuint (x);
  const uint32_t top12 = bits >> 20;

  /* 0x7ff keeps the low 11 bits of TOP12, which drops the sign bit.  */
  return top12 & 0x7ff;
}

/* Single-step range reduction: one multiply locates the quadrant and one
   multiply-subtract removes it.  P supplies PI/2 (hpi) and 2/PI (hpi_inv).
   The quadrant is stored in *NP and the remainder of X, a value between
   -PI/4 and PI/4, is returned.  Since PI/2 as a double is accurate to 55
   bits and the worst-case cancellation happens at 6 * PI/4, the result is
   accurate for |X| <= 120.0.  */
static inline double
reduce_fast (double x, const sincos_t *p, int *np)
{
  /* Both variants start from the same product of X and hpi_inv.  */
  const double r = x * p->hpi_inv;

#if TOINT_INTRINSICS
  /* Fast round and lround instructions are available: take the integer
     quadrant from converttoint and the rounded double from roundtoint.  */
  *np = converttoint (r);
  return x - roundtoint (r) * p->hpi;
#else
  /* Scaled float to int conversion with explicit rounding.  hpi_inv is
     prescaled by 2^24, so the quadrant sits in bits 24..31 of the
     converted value; adding 0x800000 (half of 2^24) before the shift
     rounds instead of truncating, which avoids the inaccuracies that
     truncating negative values would introduce.  */
  const int n = ((int32_t) r + 0x800000) >> 24;

  *np = n;
  return x - n * p->hpi;
#endif
}

/* Range reduction for large arguments using integer arithmetic only.
   XI is a reinterpreted float and must be >= 2.0f (the sign bit is
   ignored).  The quadrant is stored in *NP and the remainder, a value
   between -PI/4 and PI/4, is returned.
   The reduction multiplies the shifted mantissa by three 32-bit words of
   the 4/PI table (a 32x96->128 bit multiply) and keeps the exact 2.62-bit
   fixed-point modulo.  Since the result can have at most 29 leading zeros
   after the binary point, the double precision result is accurate to
   33 bits.  */
static inline double
reduce_large (uint32_t xi, int *np)
{
  /* Bits 26..29 of XI select the starting table word, bits 23..25 give
     the remaining left shift applied to the mantissa.  */
  const uint32_t *words = &__inv_pio4[(xi >> 26) & 15];
  const int lshift = (xi >> 23) & 7;

  /* Low 24 bits of XI with bit 23 forced on, then shifted into place.  */
  const uint32_t mant = ((xi & 0xffffff) | 0x800000) << lshift;

  /* The first product is deliberately a 32-bit multiply: only its low
     32 bits are needed, and they end up in the top half of the sum.  */
  const uint32_t top = mant * words[0];
  const uint64_t mid = (uint64_t) mant * words[4];
  const uint64_t low = (uint64_t) mant * words[8];

  uint64_t frac = ((uint64_t) top << 32) | (low >> 32);
  frac += mid;

  /* Round to the nearest multiple of 2^62 to get the quadrant, then
     remove it so that FRAC, read as a signed value, is the remainder.  */
  const uint64_t quadrant = (frac + (1ULL << 61)) >> 62;
  frac -= quadrant << 62;
  *np = quadrant;

  double x = (int64_t) frac;
  return x * pi63;
}
New generic sincosf This implementation is based on generic s_sinf.c and s_cosf.c. Tested on s390x, powerpc64le and powerpc32. 2017-12-16 16:31:37 +08:00			`/* Used by sinf, cosf and sincosf functions.`
Update copyright dates with scripts/update-copyrights. * All files with FSF copyright notices: Update copyright dates using scripts/update-copyrights. * locale/programs/charmap-kw.h: Regenerated. * locale/programs/locfile-kw.h: Likewise. 2019-01-01 08:11:28 +08:00			`Copyright (C) 2018-2019 Free Software Foundation, Inc.`
New generic sincosf This implementation is based on generic s_sinf.c and s_cosf.c. Tested on s390x, powerpc64le and powerpc32. 2017-12-16 16:31:37 +08:00			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<http://www.gnu.org/licenses/>. */`

Improve performance of sincosf This patch is a complete rewrite of sincosf. The new version is significantly faster, as well as simple and accurate. The worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23 over all 4 billion inputs. In non-nearest rounding modes the error is 1ULP. The algorithm uses 3 main cases: small inputs which don't need argument reduction, small inputs which need a simple range reduction and large inputs requiring complex range reduction. The code uses approximate integer comparisons to quickly decide between these cases. The small range reducer uses a single reduction step to handle values up to 120.0. It is fastest on targets which support inlined round instructions. The large range reducer uses integer arithmetic for simplicity. It does a 32x96 bit multiply to compute a 64-bit modulo result. This is more than accurate enough to handle the worst-case cancellation for values close to an integer multiple of PI/4. It could be further optimized, however it is already much faster than necessary. sincosf throughput gains on Cortex-A72: * \|x\| < 0x1p-12 : 1.6x * \|x\| < M_PI_4 : 1.7x * \|x\| < 2 * M_PI: 1.5x * \|x\| < 120.0 : 1.8x * \|x\| < Inf : 2.3x * math/Makefile: Add s_sincosf_data.c. * sysdeps/ia64/fpu/s_sincosf_data.c: New file. * sysdeps/ieee754/flt-32/s_sincosf.h (abstop12): Add new function. (sincosf_poly): Likewise. (reduce_small): Likewise. (reduce_large): Likewise. * sysdeps/ieee754/flt-32/s_sincosf.c (sincosf): Rewrite. * sysdeps/ieee754/flt-32/s_sincosf_data.c: New file with sincosf data. * sysdeps/m68k/m680x0/fpu/s_sincosf_data.c: New file. * sysdeps/x86_64/fpu/s_sincosf_data.c: New file. 2018-08-11 00:31:30 +08:00			`#include <stdint.h>`
			`#include <math.h>`
			`#include "math_config.h"`
x86-64: Vectorize sincosf_poly and update s_sincosf-fma.c Add <sincosf_poly.h> and include it in s_sincosf.h to allow vectorized sincosf_poly. Add x86 sincosf_poly.h to vectorize sincosf_poly. On Broadwell, bench-sincosf shows: Before After Improvement max 160.273 114.198 40% min 6.25 5.625 11% mean 13.0325 10.6462 22% Vectorized sincosf_poly shows Before After Improvement max 138.653 114.198 21% min 5.004 5.625 -11% mean 11.5934 10.6462 9% Tested on x86-64 and i686 as well as with build-many-glibcs.py. * sysdeps/ieee754/flt-32/s_sincosf.h: Include <sincosf_poly.h>. (sincos_t, sincosf_poly, sinf_poly): Moved to ... * sysdeps/ieee754/flt-32/sincosf_poly.h: Here. New file. * sysdeps/x86/fpu/s_sincosf_data.c: New file. * sysdeps/x86/fpu/sincosf_poly.h: Likewise. * sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c: Just include <sysdeps/ieee754/flt-32/s_sincosf.c>. 2018-12-26 22:56:04 +08:00			`#include <sincosf_poly.h>`
Improve performance of sincosf This patch is a complete rewrite of sincosf. The new version is significantly faster, as well as simple and accurate. The worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23 over all 4 billion inputs. In non-nearest rounding modes the error is 1ULP. The algorithm uses 3 main cases: small inputs which don't need argument reduction, small inputs which need a simple range reduction and large inputs requiring complex range reduction. The code uses approximate integer comparisons to quickly decide between these cases. The small range reducer uses a single reduction step to handle values up to 120.0. It is fastest on targets which support inlined round instructions. The large range reducer uses integer arithmetic for simplicity. It does a 32x96 bit multiply to compute a 64-bit modulo result. This is more than accurate enough to handle the worst-case cancellation for values close to an integer multiple of PI/4. It could be further optimized, however it is already much faster than necessary. sincosf throughput gains on Cortex-A72: * \|x\| < 0x1p-12 : 1.6x * \|x\| < M_PI_4 : 1.7x * \|x\| < 2 * M_PI: 1.5x * \|x\| < 120.0 : 1.8x * \|x\| < Inf : 2.3x * math/Makefile: Add s_sincosf_data.c. * sysdeps/ia64/fpu/s_sincosf_data.c: New file. * sysdeps/ieee754/flt-32/s_sincosf.h (abstop12): Add new function. (sincosf_poly): Likewise. (reduce_small): Likewise. (reduce_large): Likewise. * sysdeps/ieee754/flt-32/s_sincosf.c (sincosf): Rewrite. * sysdeps/ieee754/flt-32/s_sincosf_data.c: New file with sincosf data. * sysdeps/m68k/m680x0/fpu/s_sincosf_data.c: New file. * sysdeps/x86_64/fpu/s_sincosf_data.c: New file. 2018-08-11 00:31:30 +08:00
			`/* 2PI * 2^-64. */`
			`static const double pi63 = 0x1.921FB54442D18p-62;`
			`/* PI / 4. */`
			`static const double pio4 = 0x1.921FB54442D18p-1;`

			`/* Polynomial data (the cosine polynomial is negated in the 2nd entry). */`
			`extern const sincos_t __sincosf_table[2] attribute_hidden;`

			`/* Table with 4/PI to 192 bit precision. */`
			`extern const uint32_t __inv_pio4[] attribute_hidden;`

			`/* Top 12 bits of the float representation with the sign bit cleared. */`
			`static inline uint32_t`
			`abstop12 (float x)`
			`{`
			`return (asuint (x) >> 20) & 0x7ff;`
			`}`

			`/* Fast range reduction using single multiply-subtract. Return the modulo of`
			`X as a value between -PI/4 and PI/4 and store the quadrant in NP.`
			`The values for PI/2 and 2/PI are accessed via P. Since PI/2 as a double`
			`is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,`
			`the result is accurate for |X| <= 120.0. */`
			`static inline double`
			`reduce_fast (double x, const sincos_t *p, int *np)`
			`{`
			`double r;`
			`#if TOINT_INTRINSICS`
			`/* Use fast round and lround instructions when available. */`
			`r = x * p->hpi_inv;`
			`*np = converttoint (r);`
			`return x - roundtoint (r) * p->hpi;`
			`#else`
			`/* Use scaled float to int conversion with explicit rounding.`
			`hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.`
			`This avoids inaccuracies introduced by truncating negative values. */`
			`r = x * p->hpi_inv;`
			`int n = ((int32_t)r + 0x800000) >> 24;`
			`*np = n;`
			`return x - n * p->hpi;`
			`#endif`
			`}`

			`/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.`
			`XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).`
			`Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.`
			`Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit`
			`multiply computes the exact 2.62-bit fixed-point modulo. Since the result`
			`can have at most 29 leading zeros after the binary point, the double`
			`precision result is accurate to 33 bits. */`
			`static inline double`
			`reduce_large (uint32_t xi, int *np)`
			`{`
			`const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];`
			`int shift = (xi >> 23) & 7;`
			`uint64_t n, res0, res1, res2;`

			`xi = (xi & 0xffffff) | 0x800000;`
			`xi <<= shift;`

			`res0 = xi * arr[0];`
			`res1 = (uint64_t)xi * arr[4];`
			`res2 = (uint64_t)xi * arr[8];`
			`res0 = (res2 >> 32) | (res0 << 32);`
			`res0 += res1;`

			`n = (res0 + (1ULL << 61)) >> 62;`
			`res0 -= n << 62;`
			`double x = (int64_t)res0;`
			`*np = n;`
			`return x * pi63;`
			`}`