bug #1652: implements a much more accurate version of vectorized sin/cos. This new version achieve same speed for SSE/AVX, and is slightly faster with FMA. Guarantees are as follows:

- no FMA: 1ULP up to 3pi, 2ULP up to sin(25966) and cos(18838), fallback to std::sin/cos for larger inputs
  - FMA: 1ULP up to sin(117435.992) and cos(71476.0625), fallback to std::sin/cos for larger inputs
This commit is contained in:
Gael Guennebaud 2019-01-09 15:25:17 +01:00
parent e70ffef967
commit e6b217b8dd
5 changed files with 181 additions and 75 deletions

View File

@ -393,18 +393,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
predux_half_dowto4(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
/** \internal \returns the product of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
{ return a; }
/** \internal \returns the min of the elements of \a a*/
/** \internal \returns the min of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
{ return a; }
/** \internal \returns the max of the elements of \a a*/
/** \internal \returns the max of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
{ return a; }
/** \internal \returns true if all coeffs of \a a means "true"
* It is supposed to be called on values returned by pcmp_*.
*/
// not needed yet
// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
// { return bool(a); }
/** \internal \returns true if any coeffs of \a a means "true"
* It is supposed to be called on values returned by pcmp_*.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
{
// Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
// It is expected that "true" is either:
// - Scalar(1)
// - bits full of ones (NaN for floats),
// - or first bit equals to 1 (1 for ints, smallest denormal for floats).
// For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
return bool(predux(a));
}
/** \internal \returns the reversed elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
{ return a; }

View File

@ -575,6 +575,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
// {
// return _mm256_movemask_ps(x)==0xFF;
// }
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
{
return _mm256_movemask_ps(x)!=0;
}
template<int Offset>
struct palign_impl<Offset,Packet8f>

View File

@ -269,88 +269,118 @@ EIGEN_UNUSED
Packet psincos_float(const Packet& _x)
{
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
const Packet cst_1 = pset1<Packet>(1.0f);
const Packet cst_half = pset1<Packet>(0.5f);
const PacketI csti_1 = pset1<PacketI>(1);
const PacketI csti_not1 = pset1<PacketI>(~1);
const PacketI csti_2 = pset1<PacketI>(2);
const PacketI csti_3 = pset1<PacketI>(3);
const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
const Packet cst_sincof_p0 = pset1<Packet>(-1.9515295891E-4f);
const Packet cst_sincof_p1 = pset1<Packet>( 8.3321608736E-3f);
const Packet cst_sincof_p2 = pset1<Packet>(-1.6666654611E-1f);
const Packet cst_coscof_p0 = pset1<Packet>( 2.443315711809948E-005f);
const Packet cst_coscof_p1 = pset1<Packet>(-1.388731625493765E-003f);
const Packet cst_coscof_p2 = pset1<Packet>( 4.166664568298827E-002f);
const Packet cst_cephes_FOPI = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
const PacketI csti_1 = pset1<PacketI>(1);
const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
Packet x = pabs(_x);
// Scale x by 4/Pi to find x's octant.
Packet y = pmul(x, cst_cephes_FOPI);
// Scale x by 2/Pi to find x's octant.
Packet y = pmul(x, cst_2oPI);
// Get the octant. We'll reduce x by this number of octants or by one more than it.
PacketI y_int = pcast<Packet,PacketI>(y);
// x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
y = pcast<PacketI,Packet>(y_int1);
// Rounding trick:
Packet y_round = padd(y, cst_rounding_magic);
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
// Compute the sign to apply to the polynomial.
// sign = third_bit(y_int1) xor signbit(_x)
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1)))
: preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3)));
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
: preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
// Get the polynomial selection mask from the second bit of y_int1
// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
// The magic pass: "Extended precision modular arithmetic"
// x = ((x - y * DP1) - y * DP2) - y * DP3
x = pmadd(y, cst_minus_cephes_DP1, x);
x = pmadd(y, cst_minus_cephes_DP2, x);
x = pmadd(y, cst_minus_cephes_DP3, x);
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
// using "Extended precision modular arithmetic"
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
// This version requires true FMA for high accuracy
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
#else
// Without true FMA, the previous set of coefficients maintain 1ULP accuracy
// up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
// We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
// and 2 ULP up to:
const float huge_th = ComputeSine ? 25966.f : 18838.f;
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
// For the record, the following set of coefficients maintain 2ULP up
// to a slightly larger range:
// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
// but it slightly fails to maintain 1ULP for two values of sin below pi.
// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
// For the record, with only 3 iterations it is possible to maintain
// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
#endif
Packet huge_mask = pcmp_le(pset1<Packet>(huge_th),pabs(_x));
Packet huge_vals;
if(predux_any(huge_mask))
{
const int PacketSize = unpacket_traits<Packet>::size;
#if EIGEN_HAS_CXX11
alignas(Packet) float vals[PacketSize];
#else
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
#endif
pstoreu(vals, _x);
for(int k=0; k<PacketSize;++k) {
float val = vals[k];
if(numext::abs(val)>=huge_th) {
vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
}
}
huge_vals = ploadu<Packet>(vals);
}
Packet x2 = pmul(x,x);
// Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
Packet y1 = cst_coscof_p0;
y1 = pmadd(y1, x2, cst_coscof_p1);
y1 = pmadd(y1, x2, cst_coscof_p2);
y1 = pmul(y1, x2);
y1 = pmul(y1, x2);
y1 = psub(y1, pmul(x2, cst_half));
y1 = padd(y1, cst_1);
// Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
y1 = pmadd(y1, x2, pset1<Packet>(1.f));
// Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0)
Packet y2 = cst_sincof_p0;
y2 = pmadd(y2, x2, cst_sincof_p1);
y2 = pmadd(y2, x2, cst_sincof_p2);
// Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
// octave/matlab code to compute those coefficients:
// x = (0:0.0001:pi/4)';
// A = [x.^3 x.^5 x.^7];
// w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
// c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
// printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
//
Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
y2 = pmul(y2, x2);
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynoms.
// Select the correct result from the two polynomials.
y = ComputeSine ? pselect(poly_mask,y2,y1)
: pselect(poly_mask,y1,y2);
// For very large arguments the the reduction to the [-Pi/4,+Pi/4] range
// does not work thus leading to sine/cosine out of the [-1:1] range.
// Since computing the sine/cosine for very large entry entries makes little
// sense in term of accuracy, we simply clamp to [-1,1]:
y = pmin(y,pset1<Packet>( 1.f));
y = pmax(y,pset1<Packet>(-1.f));
// Update the sign
return pxor(y, sign_bit);
// Update the sign and filter huge inputs
return pselect(huge_mask, huge_vals, pxor(y, sign_bit));
}
template<typename Packet>

View File

@ -812,6 +812,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
#endif // EIGEN_VECTORIZE_SSE4_1
}
// not needed yet
// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
// {
// return _mm_movemask_ps(x) == 0xF;
// }
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
{
return _mm_movemask_ps(x) != 0x0;
}
#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {

View File

@ -568,17 +568,22 @@ template<typename Scalar,typename Packet> void packetmath_real()
if(PacketTraits::HasCos)
{
packet_helper<PacketTraits::HasCos,Packet> h;
for(Scalar k = 1; k<Scalar(1000)/std::numeric_limits<Scalar>::epsilon(); k*=2) {
data1[0] = k*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::pcos(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::psin(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
for(Scalar k = 1; k<Scalar(10000)/std::numeric_limits<Scalar>::epsilon(); k*=2)
{
for(int k1=0;k1<=1; ++k1)
{
data1[0] = (2*k+k1 )*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
data1[1] = (2*k+2+k1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
h.store(data2, internal::pcos(h.load(data1)));
h.store(data2+PacketSize, internal::psin(h.load(data1)));
VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
VERIFY(data2[PacketSize+0]<=Scalar(1.) && data2[PacketSize+0]>=Scalar(-1.));
VERIFY(data2[PacketSize+1]<=Scalar(1.) && data2[PacketSize+1]>=Scalar(-1.));
VERIFY_IS_APPROX(numext::abs2(data2[0])+numext::abs2(data2[PacketSize+0]), Scalar(1));
VERIFY_IS_APPROX(numext::abs2(data2[1])+numext::abs2(data2[PacketSize+1]), Scalar(1));
}
}
data1[0] = std::numeric_limits<Scalar>::infinity();
@ -596,6 +601,12 @@ template<typename Scalar,typename Packet> void packetmath_real()
VERIFY((numext::isnan)(data2[0]));
h.store(data2, internal::pcos(h.load(data1)));
VERIFY((numext::isnan)(data2[0]));
data1[0] = -Scalar(0.);
h.store(data2, internal::psin(h.load(data1)));
VERIFY( internal::biteq(data2[0], data1[0]) );
h.store(data2, internal::pcos(h.load(data1)));
VERIFY_IS_EQUAL(data2[0], Scalar(1));
}
}
}
@ -633,6 +644,29 @@ template<typename Scalar,typename Packet> void packetmath_notcomplex()
ref[i] = data1[0]+Scalar(i);
internal::pstore(data2, internal::plset<Packet>(data1[0]));
VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset");
{
unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
// predux_all - not needed yet
// for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
// VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
// for(int k=0; k<PacketSize; ++k)
// {
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
// VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
// }
// predux_any
for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0x0;
VERIFY( (!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
for(int k=0; k<PacketSize; ++k)
{
for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
VERIFY( internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x00;
}
}
}
template<typename Scalar,typename Packet,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval)