mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-01-06 14:14:46 +08:00
First step toward unifying the packet log implementations; currently only the SSE and AVX versions are unified.
To this end, I added the following functions: pzero, pcmp_*, pfrexp, and pset1frombits.
This commit is contained in:
parent
5f6045077c
commit
2c44c40114
@ -214,6 +214,38 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
|
||||
// Keeps the bits of a that are NOT set in b, i.e. a & ~b.
// The complement has to be bitwise (~): a logical not (!) collapses b to a single
// boolean, which breaks mask-based helpers that combine pand/pandnot.
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
|
||||
|
||||
/** \internal \returns the significant and exponent of the underlying floating point numbers
|
||||
* See https://en.cppreference.com/w/cpp/numeric/math/frexp
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); }
|
||||
|
||||
/** \internal \returns zeros */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pzero(const Packet& a) { return pxor(a,a); }
|
||||
|
||||
/** \internal \returns bits of \a or \b according to the input bit mask \a mask */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pselect(const Packet& mask, const Packet& a, const Packet& b) {
|
||||
return por(pand(a,mask),pandnot(b,mask));
|
||||
}
|
||||
|
||||
/** \internal \returns a <= b as a bit mask
  * Declaration only: the generic body is commented out, so a definition has to be
  * provided per packet type. The commented-out reference body shows the intended
  * mask values: pnot(pxor(a,a)) when the comparison holds, pxor(a,a) otherwise. */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */
|
||||
|
||||
/** \internal \returns a < b as a bit mask
  * Declaration only, same mask convention as the commented-out body shows. */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_lt(const Packet& a, const Packet& b); /* { return a<b ? pnot(pxor(a,a)) : pxor(a,a); } */
|
||||
|
||||
/** \internal \returns a == b as a bit mask
  * Declaration only, same mask convention as the commented-out body shows. */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */
|
||||
|
||||
/** \internal \returns a < b or a==NaN or b==NaN as a bit mask
  * Declaration only; the commented-out reference body expresses it as the
  * complement of pcmp_le(b,a). */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */
|
||||
|
||||
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
@ -226,6 +258,10 @@ ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pset1(const typename unpacket_traits<Packet>::type& a)
{
  // Generic fallback: hand the value back, converted to Packet by the return.
  return a;
}
|
||||
|
||||
/** \internal \returns a packet with constant coefficients set from bits
  * Declaration only: no generic definition is given here, packet types are expected
  * to supply one.
  * NOTE(review): BitsType is deduced from the argument, so presumably the argument
  * type has to match the one used by the packet's specialization -- confirm. */
|
||||
template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
|
||||
pset1frombits(BitsType a);
|
||||
|
||||
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
|
||||
@ -597,6 +633,29 @@ pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
|
||||
return pblend(mask, pset1<Packet>(b), a);
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
* Some generic implementations to be used by implementors
|
||||
***************************************************************************/
|
||||
|
||||
/** \internal shifts the bits of each coefficient right by \a n and converts the
  * resulting integer back to the initial scalar type, i.e.:
  *   return float(reinterpret_cast<uint>(a) >> n)
  * Declaration only: no generic definition is given here, it has to be supplied
  * per packet type.
  */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pshiftright_and_cast(Packet a, int n);
|
||||
|
||||
/** Default implementation of pfrexp for float.
|
||||
* It is expected to be called by implementers of template<> pfrexp,
|
||||
* and the above pshiftright_and_cast function must be implemented.
|
||||
*/
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pfrexp_float(const Packet& a, Packet& exponent) {
|
||||
const Packet cst_126f = pset1<Packet>(126.0f);
|
||||
const Packet cst_half = pset1<Packet>(0.5f);
|
||||
const Packet cst_inv_mant_mask = pset1frombits<Packet>(~0x7f800000u);
|
||||
exponent = psub(pshiftright_and_cast(a,23), cst_126f);
|
||||
return por(pand(a, cst_inv_mant_mask), cst_half);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
|
||||
#define EIGEN_MATH_FUNCTIONS_AVX_H
|
||||
|
||||
/* The sin, cos, exp, and log functions of this file are loosely derived from
|
||||
/* The sin, cos, and exp functions of this file are loosely derived from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
@ -29,17 +29,6 @@ inline Packet8i pshiftleft(Packet8i v, int n)
|
||||
#endif
|
||||
}
|
||||
|
||||
inline Packet8f pshiftright(Packet8f v, int n)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
|
||||
#else
|
||||
__m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
|
||||
__m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
|
||||
return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Sine function
|
||||
// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
|
||||
// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
|
||||
@ -110,95 +99,10 @@ psin<Packet8f>(const Packet8f& _x) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
// be easily approximated by a polynomial centered on m=1 for stability.
|
||||
// TODO(gonnet): Further reduce the interval allowing for lower-degree
|
||||
// polynomial interpolants -> ... -> profit!
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
plog<Packet8f>(const Packet8f& _x) {
|
||||
Packet8f x = _x;
|
||||
_EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
|
||||
|
||||
// The smallest non denormalized float number.
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
|
||||
|
||||
// Polynomial coefficients.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
|
||||
Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
|
||||
|
||||
// Truncate input values to the minimum positive normal.
|
||||
x = pmax(x, p8f_min_norm_pos);
|
||||
|
||||
Packet8f emm0 = pshiftright(x,23);
|
||||
Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
|
||||
|
||||
// Set the exponents to -1, i.e. x are in the range [0.5,1).
|
||||
x = _mm256_and_ps(x, p8f_inv_mant_mask);
|
||||
x = _mm256_or_ps(x, p8f_half);
|
||||
|
||||
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
||||
// and shift by -1. The values are then centered around 0, which improves
|
||||
// the stability of the polynomial evaluation.
|
||||
// if( x < SQRTHF ) {
|
||||
// e -= 1;
|
||||
// x = x + x - 1.0;
|
||||
// } else { x = x - 1.0; }
|
||||
Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
|
||||
Packet8f tmp = _mm256_and_ps(x, mask);
|
||||
x = psub(x, p8f_1);
|
||||
e = psub(e, _mm256_and_ps(p8f_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet8f x2 = pmul(x, x);
|
||||
Packet8f x3 = pmul(x2, x);
|
||||
|
||||
// Evaluate the polynomial approximant of degree 8 in three parts, probably
|
||||
// to improve instruction-level parallelism.
|
||||
Packet8f y, y1, y2;
|
||||
y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
|
||||
y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
|
||||
y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
|
||||
y = pmadd(y, x, p8f_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, p8f_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, p8f_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
// Add the logarithm of the exponent back to the result of the interpolation.
|
||||
y1 = pmul(e, p8f_cephes_log_q1);
|
||||
tmp = pmul(x2, p8f_half);
|
||||
y = padd(y, y1);
|
||||
x = psub(x, tmp);
|
||||
y2 = pmul(e, p8f_cephes_log_q2);
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
return _mm256_or_ps(
|
||||
_mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
|
||||
_mm256_and_ps(iszero_mask, p8f_minus_inf));
|
||||
return plog_float(_x);
|
||||
}
|
||||
|
||||
// Exponential function. Works by writing "x = m*log(2) + r" where
|
||||
|
@ -121,6 +121,11 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { re
|
||||
// Broadcast of a single scalar to every coefficient.
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from)
{
  return _mm256_set1_pd(from);
}
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from)
{
  return _mm256_set1_epi32(from);
}

// Broadcasts the integer `from` and reinterprets the lanes as floats (bit cast, no conversion).
template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from)
{
  const Packet8i int_lanes = pset1<Packet8i>(from);
  return _mm256_castsi256_ps(int_lanes);
}

// All-zero packets; the argument is unused.
template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/)
{
  return _mm256_setzero_ps();
}
template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/)
{
  return _mm256_setzero_pd();
}

// Broadcast of the scalar stored at `from`.
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from)
{
  return _mm256_broadcast_ss(from);
}
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from)
{
  return _mm256_broadcast_sd(from);
}
|
||||
|
||||
@ -199,6 +204,12 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const
|
||||
// Arguments are swapped to match NaN propagation behavior of std::max.
|
||||
return _mm256_max_pd(b,a);
|
||||
}
|
||||
|
||||
// Packet8f comparisons returning a per-coefficient mask.
template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b)
{
  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
}
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b)
{
  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
}
template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b)
{
  return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
}
// "Not greater-or-equal" predicate, used for a < b or a==NaN or b==NaN.
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b)
{
  return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
}

// Rounding with the rounding mode currently in effect.
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
{
  return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
}
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
{
  return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
}
|
||||
|
||||
@ -363,6 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
|
||||
return _mm256_and_pd(a,mask);
|
||||
}
|
||||
|
||||
// Shifts the bit pattern of each float right by n, then converts the resulting
// 32-bit integers to float.
template<> EIGEN_STRONG_INLINE Packet8f pshiftright_and_cast<Packet8f>(Packet8f v, int n)
{
  const __m256i bits = _mm256_castps_si256(v);
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i shifted = _mm256_srli_epi32(bits, n);
#else
  // Non-AVX2 path: shift the two 128-bit halves separately and stitch them back together.
  const __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(bits, 0), n);
  const __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(bits, 1), n);
  const __m256i shifted = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
  return _mm256_cvtepi32_ps(shifted);
}
|
||||
|
||||
// Packet8f frexp: forwards to the shared float implementation.
template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent)
{
  const Packet8f significand = pfrexp_float(a,exponent);
  return significand;
}
|
||||
|
||||
// preduxp should be ok
|
||||
// FIXME: why is this ok? why isn't the simple implementation working as expected?
|
||||
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
|
||||
|
105
Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
Normal file
105
Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
Normal file
@ -0,0 +1,105 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2007 Julien Pommier
|
||||
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
|
||||
// Copyright (C) 2009-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The log function of this file initially comes from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
// be easily approximated by a polynomial centered on m=1 for stability.
|
||||
// TODO(gonnet): Further reduce the interval allowing for lower-degree
|
||||
// polynomial interpolants -> ... -> profit!
|
||||
template <typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
EIGEN_UNUSED
|
||||
Packet plog_float(const Packet _x) {
|
||||
Packet x = _x;
|
||||
|
||||
const Packet cst_1 = pset1<Packet>(1.0f);
|
||||
const Packet cst_half = pset1<Packet>(0.5f);
|
||||
//const Packet cst_126f = pset1<Packet>(126.0f);
|
||||
// The smallest non denormalized float number.
|
||||
const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
|
||||
const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
|
||||
|
||||
// Polynomial coefficients.
|
||||
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
|
||||
const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
|
||||
const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
|
||||
const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
|
||||
const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
|
||||
const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
|
||||
const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
|
||||
const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
|
||||
const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
|
||||
const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
|
||||
const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
|
||||
const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
|
||||
|
||||
Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x));
|
||||
Packet iszero_mask = pcmp_eq(x,pzero(x));
|
||||
|
||||
// Truncate input values to the minimum positive normal.
|
||||
x = pmax(x, cst_min_norm_pos);
|
||||
|
||||
Packet e;
|
||||
// extract significant in the range [0.5,1) and exponent
|
||||
x = pfrexp(x,e);
|
||||
|
||||
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
||||
// and shift by -1. The values are then centered around 0, which improves
|
||||
// the stability of the polynomial evaluation.
|
||||
// if( x < SQRTHF ) {
|
||||
// e -= 1;
|
||||
// x = x + x - 1.0;
|
||||
// } else { x = x - 1.0; }
|
||||
Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
|
||||
Packet tmp = pand(x, mask);
|
||||
x = psub(x, cst_1);
|
||||
e = psub(e, pand(cst_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet x3 = pmul(x2, x);
|
||||
|
||||
// Evaluate the polynomial approximant of degree 8 in three parts, probably
|
||||
// to improve instruction-level parallelism.
|
||||
Packet y, y1, y2;
|
||||
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
|
||||
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
|
||||
y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
|
||||
y = pmadd(y, x, cst_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, cst_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, cst_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
// Add the logarithm of the exponent back to the result of the interpolation.
|
||||
y1 = pmul(e, cst_cephes_log_q1);
|
||||
tmp = pmul(x2, cst_half);
|
||||
y = padd(y, y1);
|
||||
x = psub(x, tmp);
|
||||
y2 = pmul(e, cst_cephes_log_q2);
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask));
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
@ -8,13 +8,15 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The sin, cos, exp, and log functions of this file come from
|
||||
/* The sin, cos and exp functions of this file come from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
|
||||
#define EIGEN_MATH_FUNCTIONS_SSE_H
|
||||
|
||||
#include "../Default/GenericPacketMathFunctions.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
@ -22,85 +24,7 @@ namespace internal {
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f plog<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
Packet4f x = _x;
|
||||
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
|
||||
|
||||
/* the smallest non denormalized float number */
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
|
||||
|
||||
/* natural logarithm computed for 4 simultaneous float
|
||||
return NaN for x <= 0
|
||||
*/
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
|
||||
Packet4i emm0;
|
||||
|
||||
Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
|
||||
Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
|
||||
|
||||
x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
|
||||
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
|
||||
|
||||
/* keep only the fractional part */
|
||||
x = _mm_and_ps(x, p4f_inv_mant_mask);
|
||||
x = _mm_or_ps(x, p4f_half);
|
||||
|
||||
emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
|
||||
Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
|
||||
|
||||
/* part2:
|
||||
if( x < SQRTHF ) {
|
||||
e -= 1;
|
||||
x = x + x - 1.0;
|
||||
} else { x = x - 1.0; }
|
||||
*/
|
||||
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
|
||||
Packet4f tmp = pand(x, mask);
|
||||
x = psub(x, p4f_1);
|
||||
e = psub(e, pand(p4f_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet4f x2 = pmul(x,x);
|
||||
Packet4f x3 = pmul(x2,x);
|
||||
|
||||
Packet4f y, y1, y2;
|
||||
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
|
||||
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
|
||||
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
|
||||
y = pmadd(y , x, p4f_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, p4f_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, p4f_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
y1 = pmul(e, p4f_cephes_log_q1);
|
||||
tmp = pmul(x2, p4f_half);
|
||||
y = padd(y, y1);
|
||||
x = psub(x, tmp);
|
||||
y2 = pmul(e, p4f_cephes_log_q2);
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
// negative arg will be NAN, 0 will be -INF
|
||||
return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
|
||||
_mm_and_ps(iszero_mask, p4f_minus_inf));
|
||||
return plog_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
@ -266,7 +190,7 @@ Packet4f psin<Packet4f>(const Packet4f& _x)
|
||||
_EIGEN_DECLARE_CONST_Packet4i(2, 2);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(4, 4);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
|
||||
@ -482,11 +406,11 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
|
||||
|
||||
Packet4f neg_half = pmul(_x, p4f_minus_half);
|
||||
|
||||
|
@ -83,7 +83,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
|
||||
const Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
|
||||
const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
|
||||
const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
||||
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
||||
@ -180,6 +180,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { re
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
|
||||
#endif
|
||||
|
||||
// Broadcasts the integer `from` and reinterprets the lanes as floats (bit cast, no conversion).
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
{
  const Packet4i int_lanes = pset1<Packet4i>(from);
  return _mm_castsi128_ps(int_lanes);
}

// All-zero packets; the argument is unused.
template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/)
{
  return _mm_setzero_ps();
}
template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/)
{
  return _mm_setzero_pd();
}
|
||||
|
||||
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
|
||||
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
|
||||
// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
|
||||
@ -328,6 +333,12 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
|
||||
#endif
|
||||
}
|
||||
|
||||
// Packet4f comparisons returning a per-coefficient mask.
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b)
{
  return _mm_cmple_ps(a,b);
}
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b)
{
  return _mm_cmplt_ps(a,b);
}
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b)
{
  return _mm_cmpeq_ps(a,b);
}
// "Not greater-or-equal" comparison, used for a < b or a==NaN or b==NaN.
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b)
{
  return _mm_cmpnge_ps(a,b);
}
|
||||
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
|
||||
@ -517,6 +528,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
|
||||
#endif
|
||||
}
|
||||
|
||||
// Shifts the bit pattern of each float right by n, then converts the resulting
// 32-bit integers to float.
template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n)
{
  const __m128i bits    = _mm_castps_si128(a);
  const __m128i shifted = _mm_srli_epi32(bits,n);
  return _mm_cvtepi32_ps(shifted);
}
|
||||
|
||||
// Packet4f frexp: forwards to the shared float implementation.
template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
{
  const Packet4f significand = pfrexp_float(a,exponent);
  return significand;
}
|
||||
|
||||
// with AVX, the default implementations based on pload1 are faster
|
||||
#ifndef __AVX__
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
|
Loading…
Reference in New Issue
Block a user