mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Vectorize pow(x, y)
. This closes https://gitlab.com/libeigen/eigen/-/issues/2085, which also contains a description of the algorithm.
I ran some testing (comparing to `std::pow(double(x), double(y)))` for `x` in the set of all (positive) floats in the interval `[std::sqrt(std::numeric_limits<float>::min()), std::sqrt(std::numeric_limits<float>::max())]`, and `y` in `{2, sqrt(2), -sqrt(2)}` I get the following error statistics: ``` max_rel_error = 8.34405e-07 rms_rel_error = 2.76654e-07 ``` If I widen the range to all normal float I see lower accuracy for arguments where the result is subnormal, e.g. for `y = sqrt(2)`: ``` max_rel_error = 0.666667 rms = 6.8727e-05 count = 1335165689 argmax = 2.56049e-32, 2.10195e-45 != 1.4013e-45 ``` which seems reasonable, since these results are subnormals with only couple of significant bits left.
This commit is contained in:
parent
bde6741641
commit
cdd8fdc32e
@ -88,7 +88,7 @@ struct packet_traits<half> : default_packet_traits {
|
||||
HasCeil = 1,
|
||||
HasRint = 1,
|
||||
HasBessel = 1,
|
||||
HasNdtri = 1,
|
||||
HasNdtri = 1
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -793,6 +793,171 @@ Packet psqrt_complex(const Packet& a) {
|
||||
pselect(is_real_inf, real_inf_result,result));
|
||||
}
|
||||
|
||||
|
||||
// This function implements the Veltkamp splitting. Given a floating point
|
||||
// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
|
||||
// exactly and that half of the significant of x fits in x_hi.
|
||||
// This code corresponds to Algorithms 3 and 4 in
|
||||
// https://hal.inria.fr/hal-01774587v2/document
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE
|
||||
void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
|
||||
EIGEN_CONSTEXPR Scalar shift_scale = Scalar(uint64_t(1) << shift);
|
||||
Packet gamma = pmul(pset1<Packet>(shift_scale + 1), x);
|
||||
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
x_hi = pmadd(pset1<Packet>(-shift_scale), x, gamma);
|
||||
#else
|
||||
Packet rho = psub(x, gamma);
|
||||
x_hi = padd(rho, gamma);
|
||||
#endif
|
||||
x_lo = psub(x, x_hi);
|
||||
}
|
||||
|
||||
// This function splits x into the nearest integer n and fractional part r,
|
||||
// such that x = n + r holds exactly.
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE
|
||||
void integer_split(const Packet& x, Packet& n, Packet& r) {
|
||||
n = pround(x);
|
||||
r = psub(x, n);
|
||||
}
|
||||
|
||||
// This function implements Dekker's algorithm for two products {x * y1, x * y2} with
|
||||
// a shared factor. Given floating point numbers {x, y1, y2} computes the pairs
|
||||
// {p1, r1} and {p2, r2} such that x * y1 = p1 + r1 holds exactly and
|
||||
// p1 = fl(x * y1), and x * y2 = p2 + r2 holds exactly and p2 = fl(x * y2).
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE
|
||||
void double_dekker(const Packet& x, const Packet& y1, const Packet& y2,
|
||||
Packet& p1, Packet& r1, Packet& p2, Packet& r2) {
|
||||
Packet x_hi, x_lo, y1_hi, y1_lo, y2_hi, y2_lo;
|
||||
veltkamp_splitting(x, x_hi, x_lo);
|
||||
veltkamp_splitting(y1, y1_hi, y1_lo);
|
||||
veltkamp_splitting(y2, y2_hi, y2_lo);
|
||||
|
||||
p1 = pmul(x, y1);
|
||||
r1 = pmadd(x_hi, y1_hi, pnegate(p1));
|
||||
r1 = pmadd(x_hi, y1_lo, r1);
|
||||
r1 = pmadd(x_lo, y1_hi, r1);
|
||||
r1 = pmadd(x_lo, y1_lo, r1);
|
||||
|
||||
p2 = pmul(x, y2);
|
||||
r2 = pmadd(x_hi, y2_hi, pnegate(p2));
|
||||
r2 = pmadd(x_hi, y2_lo, r2);
|
||||
r2 = pmadd(x_lo, y2_hi, r2);
|
||||
r2 = pmadd(x_lo, y2_lo, r2);
|
||||
}
|
||||
|
||||
// This function implements the non-trivial case of pow(x,y) where x is
|
||||
// positive and y is (possibly) non-integer.
|
||||
// Formally, pow(x,y) = 2**(y * log2(x))
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE
|
||||
Packet generic_pow_impl(const Packet& x, const Packet& y) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
// Split x into exponent e_x and mantissa m_x.
|
||||
Packet e_x;
|
||||
Packet m_x = pfrexp(x, e_x);
|
||||
|
||||
// Adjust m_x to lie in [0.75:1.5) to minimize absolute error in log2(m_x).
|
||||
Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(Scalar(0.75)));
|
||||
m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
|
||||
e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
|
||||
|
||||
Packet r_x = plog2(m_x);
|
||||
|
||||
// Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
|
||||
// precision using Dekker's algorithm.
|
||||
Packet f1_hi, f1_lo, f2_hi, f2_lo;
|
||||
double_dekker(y, e_x, r_x, f1_hi, f1_lo, f2_hi, f2_lo);
|
||||
|
||||
// Separate f into integer and fractional parts, keeping f1_hi, and f2_hi
|
||||
// separate to avoid cancellation.
|
||||
Packet n1, r1, n2, r2;
|
||||
integer_split(f1_hi, n1, r1);
|
||||
integer_split(f2_hi, n2, r2);
|
||||
|
||||
// Add up integer parts and sum the remainders.
|
||||
Packet n_z = padd(n1, n2);
|
||||
// Notice: I experimented with using compensated (Kahan) summation here,
|
||||
// but it does not seem to matter.
|
||||
Packet rem = padd(padd(f1_lo, f2_lo), padd(r1, r2));
|
||||
|
||||
// Extract any additional integer part that may have accumulated in rem.
|
||||
Packet nrem, r_z;
|
||||
integer_split(rem, nrem, r_z);
|
||||
n_z = padd(n_z, nrem);
|
||||
|
||||
// We now have an accurate split of f = n_z + r_z and can compute
|
||||
// x^y = 2**{n_z + r_z) = exp(ln(2) * r_z) * 2**{n_z}.
|
||||
// The first factor we compute by calling pexp(), while multiplication
|
||||
// by an integer power of 2 can be done exactly using pldexp().
|
||||
// Note: I experimented with using Dekker's algorithms for the
|
||||
// multiplication by ln(2) here, but did not see any difference.
|
||||
Packet e_r = pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), r_z));
|
||||
return pldexp(e_r, n_z);
|
||||
}
|
||||
|
||||
// Generic implementation of pow(x,y).
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
EIGEN_UNUSED
|
||||
Packet generic_pow(const Packet& x, const Packet& y) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
|
||||
const Packet cst_zero = pset1<Packet>(Scalar(0));
|
||||
const Packet cst_one = pset1<Packet>(Scalar(1));
|
||||
const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
|
||||
|
||||
Packet abs_x = pabs(x);
|
||||
// Predicates for sign and magnitude of x.
|
||||
Packet x_is_zero = pcmp_eq(x, cst_zero);
|
||||
Packet x_is_neg = pcmp_lt(x, cst_zero);
|
||||
Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
|
||||
Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
|
||||
Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
|
||||
Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
|
||||
Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
|
||||
Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
|
||||
Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
|
||||
|
||||
// Predicates for sign and magnitude of y.
|
||||
Packet y_is_zero = pcmp_eq(y, cst_zero);
|
||||
Packet y_is_neg = pcmp_lt(y, cst_zero);
|
||||
Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
|
||||
Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
|
||||
Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
|
||||
|
||||
// Predicates for whether y is integer and/or even.
|
||||
Packet y_is_int = pcmp_eq(pfloor(y), y);
|
||||
Packet y_div_2 = pldexp(y, pset1<Packet>(Scalar(-1)));
|
||||
Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
|
||||
|
||||
// Predicates encoding special cases for the value of pow(x,y)
|
||||
Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
|
||||
Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
|
||||
Packet pow_is_one = por(por(y_is_zero, x_is_one), pand(x_is_neg_one, abs_y_is_inf));
|
||||
Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
|
||||
pand(pand(abs_x_is_lt_one, abs_y_is_inf), y_is_pos)),
|
||||
pand(pand(abs_x_is_gt_one, abs_y_is_inf), y_is_neg));
|
||||
Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
|
||||
pand(pand(abs_x_is_lt_one, abs_y_is_inf), y_is_neg)),
|
||||
pand(pand(abs_x_is_gt_one, abs_y_is_inf), y_is_pos));
|
||||
|
||||
// General computation of pow(x,y) for positive x or negative x and integer y.
|
||||
Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
|
||||
Packet pow_abs = generic_pow_impl(abs_x, y);
|
||||
|
||||
return pselect(pow_is_one, cst_one,
|
||||
pselect(pow_is_nan, cst_nan,
|
||||
pselect(pow_is_inf, cst_pos_inf,
|
||||
pselect(pow_is_zero, cst_zero,
|
||||
pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))));
|
||||
}
|
||||
|
||||
|
||||
/* polevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate polynomial
|
||||
|
@ -26,7 +26,7 @@ namespace internal {
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -147,13 +147,13 @@ struct packet_traits<float> : default_packet_traits {
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
HasErf = EIGEN_FAST_MATH,
|
||||
HasBlend = 1,
|
||||
HasCeil = 1,
|
||||
HasFloor = 1
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
,
|
||||
HasRint = 1,
|
||||
HasRound = 1,
|
||||
HasCeil = 1
|
||||
HasRound = 1
|
||||
#endif
|
||||
};
|
||||
};
|
||||
@ -173,14 +173,14 @@ struct packet_traits<double> : default_packet_traits {
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasBlend = 1
|
||||
HasBlend = 1,
|
||||
HasFloor = 1,
|
||||
HasCeil = 1
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
,
|
||||
HasRound = 1,
|
||||
HasRint = 1,
|
||||
HasFloor = 1,
|
||||
HasCeil = 1
|
||||
HasRint = 1
|
||||
#endif
|
||||
};
|
||||
};
|
||||
@ -650,6 +650,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
||||
mask = pand(mask, cst_1);
|
||||
return psub(tmp, mask);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
||||
{
|
||||
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
||||
Packet4i emm0 = _mm_cvttps_epi32(a);
|
||||
Packet4f tmp = _mm_cvtepi32_ps(emm0);
|
||||
/* if greater, substract 1 */
|
||||
Packet4f mask = _mm_cmplt_ps(tmp, a);
|
||||
mask = pand(mask, cst_1);
|
||||
return padd(tmp, mask);
|
||||
}
|
||||
|
||||
// WARNING: this pfloor implementation makes sense for small inputs only,
|
||||
// It is currently only used by pexp and not exposed through HasFloor.
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
|
||||
{
|
||||
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
||||
Packet4i emm0 = _mm_cvttpd_epi32(a);
|
||||
Packet2d tmp = _mm_cvtepi32_pd(emm0);
|
||||
/* if greater, substract 1 */
|
||||
Packet2d mask = _mm_cmplt_pd(tmp, a);
|
||||
mask = pand(mask, cst_1);
|
||||
return padd(tmp, mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
|
||||
|
@ -403,6 +403,7 @@ struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the pow of two scalars
|
||||
* See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
|
||||
*/
|
||||
template<typename Scalar, typename Exponent>
|
||||
struct scalar_pow_op : binary_op_base<Scalar,Exponent>
|
||||
@ -417,12 +418,25 @@ struct scalar_pow_op : binary_op_base<Scalar,Exponent>
|
||||
EIGEN_SCALAR_BINARY_OP_PLUGIN
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
|
||||
{
|
||||
return generic_pow(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Exponent>
|
||||
struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
|
||||
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
|
||||
enum {
|
||||
Cost = 5 * NumTraits<Scalar>::MulCost,
|
||||
PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger &&
|
||||
packet_traits<Scalar>::HasExp && packet_traits<Scalar>::HasLog &&
|
||||
packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp)
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
@ -9,6 +9,62 @@
|
||||
|
||||
#include "main.h"
|
||||
|
||||
|
||||
// Test the corner cases of pow(x, y) for real types.
|
||||
template<typename Scalar>
|
||||
void pow_test() {
|
||||
const Scalar zero = Scalar(0);
|
||||
const Scalar one = Scalar(1);
|
||||
const Scalar sqrt_half = Scalar(std::sqrt(0.5));
|
||||
const Scalar sqrt2 = Scalar(std::sqrt(2));
|
||||
const Scalar inf = std::numeric_limits<Scalar>::infinity();
|
||||
const Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
|
||||
const static Scalar abs_vals[] = {zero, sqrt_half, one, sqrt2, inf, nan};
|
||||
const int abs_cases = 6;
|
||||
const int num_cases = 2*abs_cases * 2*abs_cases;
|
||||
// Repeat the same value to make sure we hit the vectorized path.
|
||||
const int num_repeats = 32;
|
||||
Array<Scalar, Dynamic, Dynamic> x(num_repeats, num_cases);
|
||||
Array<Scalar, Dynamic, Dynamic> y(num_repeats, num_cases);
|
||||
Array<Scalar, Dynamic, Dynamic> expected(num_repeats, num_cases);
|
||||
int count = 0;
|
||||
for (int i = 0; i < abs_cases; ++i) {
|
||||
const Scalar abs_x = abs_vals[i];
|
||||
for (int sign_x = 0; sign_x < 2; ++sign_x) {
|
||||
Scalar x_case = sign_x == 0 ? -abs_x : abs_x;
|
||||
for (int j = 0; j < abs_cases; ++j) {
|
||||
const Scalar abs_y = abs_vals[j];
|
||||
for (int sign_y = 0; sign_y < 2; ++sign_y) {
|
||||
Scalar y_case = sign_y == 0 ? -abs_y : abs_y;
|
||||
for (int repeat = 0; repeat < num_repeats; ++repeat) {
|
||||
x(repeat, count) = x_case;
|
||||
y(repeat, count) = y_case;
|
||||
expected(repeat, count) = numext::pow(x_case, y_case);
|
||||
}
|
||||
++count;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Array<Scalar, Dynamic, Dynamic> actual = x.pow(y);
|
||||
const Scalar tol = test_precision<Scalar>();
|
||||
bool all_pass = true;
|
||||
for (int i = 0; i < 1; ++i) {
|
||||
for (int j = 0; j < num_cases; ++j) {
|
||||
Scalar a = actual(i, j);
|
||||
Scalar e = expected(i, j);
|
||||
bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((std::isnan)(a) && (std::isnan)(e));
|
||||
all_pass &= !fail;
|
||||
if (fail) {
|
||||
std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
VERIFY(all_pass);
|
||||
}
|
||||
|
||||
|
||||
template<typename ArrayType> void array(const ArrayType& m)
|
||||
{
|
||||
typedef typename ArrayType::Scalar Scalar;
|
||||
@ -371,6 +427,8 @@ template<typename ArrayType> void array_real(const ArrayType& m)
|
||||
|
||||
VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
|
||||
VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
|
||||
VERIFY_IS_APPROX(m1.pow(RealScalar(-2)), m1.square().inverse());
|
||||
pow_test<Scalar>();
|
||||
|
||||
VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
|
||||
VERIFY_IS_APPROX(log2(m3), log(m3)/log(2));
|
||||
|
Loading…
Reference in New Issue
Block a user