Small cleanup of generic plog implementations:

Adding the term e*ln(2) is split into two step for no obvious reason.
This dates back to the original Cephes code from which the algorithm is adapted.
It appears that this was done in Cephes to prevent the compiler from reordering
the addition of the 3 terms in the approximation

  log(1+x) ~= x - 0.5*x^2 + x^3*P(x)/Q(x)

which must be added in reverse order since |x| < (sqrt(2)-1).

This allows rewriting the code to just 2 pmadd and 1 padd instructions,
which on a Skylake processor speeds up the code by 5-7%.
This commit is contained in:
Rasmus Munk Larsen 2020-12-03 19:40:40 +00:00
parent eb4d4ae070
commit 25d8ae7465

View File

@ -73,7 +73,7 @@ Packet plog_float(const Packet _x)
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0f);
const Packet cst_half = pset1<Packet>(0.5f);
const Packet cst_neg_half = pset1<Packet>(-0.5f);
// The smallest non denormalized float number.
const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
@ -90,8 +90,6 @@ Packet plog_float(const Packet _x)
const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
// Truncate input values to the minimum positive normal.
x = pmax(x, cst_min_norm_pos);
@ -129,14 +127,12 @@ Packet plog_float(const Packet _x)
y = pmadd(y, x3, y2);
y = pmul(y, x3);
y = pmadd(cst_neg_half, x2, y);
x = padd(x, y);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, cst_cephes_log_q1);
tmp = pmul(x2, cst_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, cst_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
const Packet cst_ln2 = pset1<Packet>(M_LN2);
x = pmadd(e, cst_ln2, x);
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x,pzero(_x));
@ -151,15 +147,12 @@ Packet plog_float(const Packet _x)
/* Returns the base e (2.718...) logarithm of x.
* The argument is separated into its exponent and fractional
* parts. If the exponent is between -1 and +1, the logarithm
* of the fraction is approximated by
* The argument is separated into its exponent and fractional parts.
* The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
* is approximated by
*
* log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
*
* Otherwise, setting z = 2(x-1)/x+1),
* log(x) = z + z**3 P(z)/Q(z).
*
* for more detail see: http://www.netlib.org/cephes/
*/
template <typename Packet>
@ -170,12 +163,13 @@ Packet plog_double(const Packet _x)
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0);
const Packet cst_half = pset1<Packet>(0.5);
const Packet cst_neg_half = pset1<Packet>(-0.5);
// The smallest non denormalized float number.
const Packet cst_min_norm_pos = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
// Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
// 1/sqrt(2) <= x < sqrt(2)
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
@ -186,15 +180,12 @@ Packet plog_double(const Packet _x)
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
const Packet cst_cephes_log_r0 = pset1<Packet>(1.0);
const Packet cst_cephes_log_r1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_r2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_r3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_r4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_r5 = pset1<Packet>(2.31251620126765340583E1);
const Packet cst_cephes_log_q1 = pset1<Packet>(-2.121944400546905827679e-4);
const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375);
const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
// Truncate input values to the minimum positive normal.
x = pmax(x, cst_min_norm_pos);
@ -220,31 +211,29 @@ Packet plog_double(const Packet _x)
Packet x3 = pmul(x2, x);
// Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
// y = x * ( z * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
Packet y, y1, y2,y_;
// y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
y = pmadd(cst_cephes_log_r0, x, cst_cephes_log_r1);
y1 = pmadd(cst_cephes_log_r3, x, cst_cephes_log_r4);
y = pmadd(y, x, cst_cephes_log_r2);
y1 = pmadd(y1, x, cst_cephes_log_r5);
y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
y = pmadd(y, x, cst_cephes_log_q2);
y1 = pmadd(y1, x, cst_cephes_log_q5);
y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
y = pdiv(y_, y);
y = pmadd(cst_neg_half, x2, y);
x = padd(x, y);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, cst_cephes_log_q1);
tmp = pmul(x2, cst_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, cst_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
const Packet cst_ln2 = pset1<Packet>(M_LN2);
x = pmadd(e, cst_ln2, x);
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x,pzero(_x));