From 1dded10cb7ed54a4d2c7137656cfc35b137959a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 May 2015 10:42:51 -0700 Subject: [PATCH] Added a double-precision implementation of the exp() function for AVX. --- Eigen/src/Core/arch/AVX/MathFunctions.h | 80 +++++++++++++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index ef80a8b2d..06cd56684 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -271,6 +271,86 @@ pexp(const Packet8f& _x) { return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); } +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +pexp(const Packet4d& _x) { + Packet4d x = _x; + + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + + Packet4d tmp, fx; + + // clamp x + x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = _mm256_floor_pd(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, p4d_cephes_exp_C1); + Packet4d z = pmul(fx, p4d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet4d x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet4d px = p4d_cephes_exp_p0; + px = pmadd(px, x2, p4d_cephes_exp_p1); + px = pmadd(px, x2, p4d_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet4d qx = p4d_cephes_exp_q0; + qx = pmadd(qx, x2, p4d_cephes_exp_q1); + qx = pmadd(qx, x2, p4d_cephes_exp_q2); + qx = pmadd(qx, x2, p4d_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = _mm256_div_pd(px, psub(qx, px)); + x = pmadd(p4d_2, x, p4d_1); + + // Build e=2^n by constructing the exponents in a 128-bit vector and + // shifting them to where they belong in double-precision values. + __m128i emm0 = _mm256_cvtpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + return pmax(pmul(x, Packet4d(e)), _x); +} + // Functions for sqrt. // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step // of Newton's method, at a cost of 1-2 bits of precision as opposed to the diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index fb20a45cc..1e751dc32 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -80,7 +80,7 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, HasBlend = 1