mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Added a double-precision implementation of the exp() function for AVX.
This commit is contained in:
parent
4dd7d0b5dc
commit
1dded10cb7
@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) {
|
||||
return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
|
||||
pexp<Packet4d>(const Packet4d& _x) {
|
||||
Packet4d x = _x;
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
|
||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
|
||||
|
||||
Packet4d tmp, fx;
|
||||
|
||||
// clamp x
|
||||
x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
|
||||
// Express exp(x) as exp(g + n*log(2)).
|
||||
fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
|
||||
|
||||
// Get the integer modulus of log(2), i.e. the "n" described above.
|
||||
fx = _mm256_floor_pd(fx);
|
||||
|
||||
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
|
||||
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
|
||||
// digits right.
|
||||
tmp = pmul(fx, p4d_cephes_exp_C1);
|
||||
Packet4d z = pmul(fx, p4d_cephes_exp_C2);
|
||||
x = psub(x, tmp);
|
||||
x = psub(x, z);
|
||||
|
||||
Packet4d x2 = pmul(x, x);
|
||||
|
||||
// Evaluate the numerator polynomial of the rational interpolant.
|
||||
Packet4d px = p4d_cephes_exp_p0;
|
||||
px = pmadd(px, x2, p4d_cephes_exp_p1);
|
||||
px = pmadd(px, x2, p4d_cephes_exp_p2);
|
||||
px = pmul(px, x);
|
||||
|
||||
// Evaluate the denominator polynomial of the rational interpolant.
|
||||
Packet4d qx = p4d_cephes_exp_q0;
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q1);
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q2);
|
||||
qx = pmadd(qx, x2, p4d_cephes_exp_q3);
|
||||
|
||||
// I don't really get this bit, copied from the SSE2 routines, so...
|
||||
// TODO(gonnet): Figure out what is going on here, perhaps find a better
|
||||
// rational interpolant?
|
||||
x = _mm256_div_pd(px, psub(qx, px));
|
||||
x = pmadd(p4d_2, x, p4d_1);
|
||||
|
||||
// Build e=2^n by constructing the exponents in a 128-bit vector and
|
||||
// shifting them to where they belong in double-precision values.
|
||||
__m128i emm0 = _mm256_cvtpd_epi32(fx);
|
||||
emm0 = _mm_add_epi32(emm0, p4i_1023);
|
||||
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
__m128i lo = _mm_slli_epi64(emm0, 52);
|
||||
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
|
||||
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
|
||||
e = _mm256_insertf128_si256(e, hi, 1);
|
||||
|
||||
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
|
||||
// non-finite values in the input.
|
||||
return pmax(pmul(x, Packet4d(e)), _x);
|
||||
}
|
||||
|
||||
// Functions for sqrt.
|
||||
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
|
||||
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
|
||||
|
@ -80,7 +80,7 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasDiv = 1,
|
||||
HasExp = 0,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasBlend = 1
|
||||
|
Loading…
Reference in New Issue
Block a user