mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-07 18:27:40 +08:00
Unify SSE/AVX psin functions.
It is based on the SSE version which is much more accurate, though very slightly slower. This changeset also includes the following required changes: - add packet-float to packet-int type traits - add packet float<->int reinterpret casts - add faster pselect for AVX based on blendv
This commit is contained in:
parent
08edbc8cfe
commit
fa7fd61eda
@ -161,27 +161,27 @@ using std::ptrdiff_t;
|
||||
#elif defined EIGEN_VECTORIZE_AVX
|
||||
// Use AVX for floats and doubles, SSE for integers
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/Complex.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/PacketMath.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/AVX/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/Complex.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#elif defined EIGEN_VECTORIZE_SSE
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/SSE/Complex.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
|
||||
#include "src/Core/arch/AltiVec/PacketMath.h"
|
||||
#include "src/Core/arch/AltiVec/MathFunctions.h"
|
||||
#include "src/Core/arch/AltiVec/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/Core/arch/NEON/PacketMath.h"
|
||||
#include "src/Core/arch/NEON/TypeCasting.h"
|
||||
#include "src/Core/arch/NEON/MathFunctions.h"
|
||||
#include "src/Core/arch/NEON/Complex.h"
|
||||
#include "src/Core/arch/NEON/TypeCasting.h"
|
||||
#elif defined EIGEN_VECTORIZE_ZVECTOR
|
||||
#include "src/Core/arch/ZVector/PacketMath.h"
|
||||
#include "src/Core/arch/ZVector/MathFunctions.h"
|
||||
|
@ -151,6 +151,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const
|
||||
return static_cast<TgtPacket>(a);
|
||||
}
|
||||
|
||||
/** \internal \returns reinterpret_cast<Target>(a) */
|
||||
template <typename Target, typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Target
|
||||
preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
|
||||
|
||||
/** \internal \returns a + b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
padd(const Packet& a,
|
||||
@ -214,6 +219,10 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pandnot(const Packet& a, const Packet& b) { return a & (!b); }
|
||||
|
||||
/** \internal \returns \a a shifted by n bits */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pshiftleft(const Packet& a, int n); /* { return a << n; } */
|
||||
|
||||
/** \internal \returns the significant and exponent of the underlying floating point numbers
|
||||
* See https://en.cppreference.com/w/cpp/numeric/math/frexp
|
||||
*/
|
||||
|
@ -36,67 +36,7 @@ inline Packet8i pshiftleft(Packet8i v, int n)
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
psin<Packet8f>(const Packet8f& _x) {
|
||||
Packet8f x = _x;
|
||||
|
||||
// Some useful values.
|
||||
_EIGEN_DECLARE_CONST_Packet8i(one, 1);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
|
||||
|
||||
// Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
|
||||
Packet8f z = pmul(x, p8f_one_over_pi);
|
||||
Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
|
||||
x = pmadd(shift, p8f_neg_pi_first, x);
|
||||
x = pmadd(shift, p8f_neg_pi_second, x);
|
||||
x = pmadd(shift, p8f_neg_pi_third, x);
|
||||
z = pmul(x, p8f_four_over_pi);
|
||||
|
||||
// Make a mask for the entries that need flipping, i.e. wherever the shift
|
||||
// is odd.
|
||||
Packet8i shift_ints = _mm256_cvtps_epi32(shift);
|
||||
Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
|
||||
Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
|
||||
|
||||
// Create a mask for which interpolant to use, i.e. if z > 1, then the mask
|
||||
// is set to ones for that entry.
|
||||
Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
|
||||
|
||||
// Evaluate the polynomial for the interval [1,3] in z.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
|
||||
Packet8f z_minus_two = psub(z, p8f_two);
|
||||
Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
|
||||
Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
|
||||
right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
|
||||
right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
|
||||
|
||||
// Evaluate the polynomial for the interval [-1,1] in z.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
|
||||
Packet8f z2 = pmul(z, z);
|
||||
Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
|
||||
left = pmadd(left, z2, p8f_coeff_left_3);
|
||||
left = pmadd(left, z2, p8f_coeff_left_1);
|
||||
left = pmul(left, z);
|
||||
|
||||
// Assemble the results, i.e. select the left and right polynomials.
|
||||
left = _mm256_andnot_ps(ival_mask, left);
|
||||
right = _mm256_and_ps(ival_mask, right);
|
||||
Packet8f res = _mm256_or_ps(left, right);
|
||||
|
||||
// Flip the sign on the odd intervals and return the result.
|
||||
res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
|
||||
return res;
|
||||
return psin_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
@ -113,8 +113,17 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
*/
|
||||
|
||||
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
|
||||
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
|
||||
template<> struct unpacket_traits<Packet8f> {
|
||||
typedef float type;
|
||||
typedef Packet4f half;
|
||||
typedef Packet8i integer_packet;
|
||||
enum {size=8, alignment=Aligned32};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet4d> {
|
||||
typedef double type;
|
||||
typedef Packet2d half;
|
||||
enum {size=4, alignment=Aligned32};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
|
||||
@ -125,6 +134,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int fro
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
|
||||
@ -210,6 +220,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_cmpeq_epi32(a,b);
|
||||
#else
|
||||
__m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
|
||||
__m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
|
||||
|
||||
@ -231,6 +251,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
|
||||
{ return _mm256_blendv_ps(b,a,mask); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
|
||||
{ return _mm256_blendv_pd(b,a,mask); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pshiftleft<Packet8i>(const Packet8i& a, int n) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
return _mm256_slli_epi32(a, n);
|
||||
#else
|
||||
__m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), n);
|
||||
__m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), n);
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
|
||||
|
@ -37,13 +37,21 @@ struct type_casting_traits<int, float> {
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
|
||||
return _mm256_cvtps_epi32(a);
|
||||
return _mm256_cvttps_epi32(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
|
||||
return _mm256_cvtepi32_ps(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
|
||||
return _mm256_castps_si256(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
|
||||
return _mm256_castsi256_ps(a);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -229,5 +229,85 @@ Packet pexp_double(const Packet _x)
|
||||
return pmax(pldexp(x,fx), _x);
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
EIGEN_UNUSED
|
||||
Packet psin_float(const Packet& _x)
|
||||
{
|
||||
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
||||
const Packet cst_1 = pset1<Packet>(1.0f);
|
||||
const Packet cst_half = pset1<Packet>(0.5f);
|
||||
|
||||
const PacketI csti_1 = pset1<PacketI>(1);
|
||||
const PacketI csti_not1 = pset1<PacketI>(~1);
|
||||
const PacketI csti_2 = pset1<PacketI>(2);
|
||||
|
||||
const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
|
||||
|
||||
const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
|
||||
const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
|
||||
const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
|
||||
const Packet cst_sincof_p0 = pset1<Packet>(-1.9515295891E-4f);
|
||||
const Packet cst_sincof_p1 = pset1<Packet>( 8.3321608736E-3f);
|
||||
const Packet cst_sincof_p2 = pset1<Packet>(-1.6666654611E-1f);
|
||||
const Packet cst_coscof_p0 = pset1<Packet>( 2.443315711809948E-005f);
|
||||
const Packet cst_coscof_p1 = pset1<Packet>(-1.388731625493765E-003f);
|
||||
const Packet cst_coscof_p2 = pset1<Packet>( 4.166664568298827E-002f);
|
||||
const Packet cst_cephes_FOPI = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
|
||||
|
||||
Packet x = pabs(_x);
|
||||
|
||||
// Scale x by 4/Pi to find x's octant.
|
||||
Packet y = pmul(x, cst_cephes_FOPI);
|
||||
|
||||
// Get the octant. We'll reduce x by this number of octants or by one more than it.
|
||||
PacketI y_int = pcast<Packet,PacketI>(y);
|
||||
// x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
|
||||
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
|
||||
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
|
||||
PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
|
||||
y = pcast<PacketI,Packet>(y_int1);
|
||||
|
||||
// Compute the sign to apply to the polynomial.
|
||||
// sign = third_bit(y_int1) xor signbit(_x)
|
||||
Packet sign_bit = pxor(_x, preinterpret<Packet>(pshiftleft(y_int1, 29)));
|
||||
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
|
||||
|
||||
// Get the polynomial selection mask from the second bit of y_int1
|
||||
// We'll calculate both (sin and cos) polynomials and then select from the two.
|
||||
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
|
||||
|
||||
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
|
||||
// The magic pass: "Extended precision modular arithmetic"
|
||||
// x = ((x - y * DP1) - y * DP2) - y * DP3
|
||||
x = pmadd(y, cst_minus_cephes_DP1, x);
|
||||
x = pmadd(y, cst_minus_cephes_DP2, x);
|
||||
x = pmadd(y, cst_minus_cephes_DP3, x);
|
||||
|
||||
Packet x2 = pmul(x,x);
|
||||
|
||||
// Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
|
||||
Packet y1 = cst_coscof_p0;
|
||||
y1 = pmadd(y1, x2, cst_coscof_p1);
|
||||
y1 = pmadd(y1, x2, cst_coscof_p2);
|
||||
y1 = pmul(y1, x2);
|
||||
y1 = pmul(y1, x2);
|
||||
y1 = psub(y1, pmul(x2, cst_half));
|
||||
y1 = padd(y1, cst_1);
|
||||
|
||||
// Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0)
|
||||
Packet y2 = cst_sincof_p0;
|
||||
y2 = pmadd(y2, x2, cst_sincof_p1);
|
||||
y2 = pmadd(y2, x2, cst_sincof_p2);
|
||||
y2 = pmul(y2, x2);
|
||||
y2 = pmadd(y2, x, x);
|
||||
|
||||
// Select the correct result from the two polynoms.
|
||||
y = pselect(poly_mask,y2,y1);
|
||||
|
||||
// Update the sign
|
||||
return pxor(y, sign_bit);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
@ -261,7 +261,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
|
||||
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
|
||||
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
|
||||
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
|
||||
Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
|
||||
Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
|
||||
y = __builtin_msa_ffint_s_w(y_int2);
|
||||
|
||||
// Compute the sign to apply to the polynomial.
|
||||
@ -305,7 +305,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
|
||||
|
||||
// Update the sign.
|
||||
sign_mask = pxor(sign_mask, (Packet4i)y);
|
||||
y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
|
||||
y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
|
||||
return y;
|
||||
}
|
||||
|
||||
|
@ -54,101 +54,7 @@ Packet2d pexp<Packet2d>(const Packet2d& x)
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f psin<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
Packet4f x = _x;
|
||||
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4i(1, 1);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(2, 2);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(4, 4);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
|
||||
|
||||
Packet4f xmm1, xmm2, xmm3, sign_bit, y;
|
||||
|
||||
Packet4i emm0, emm2;
|
||||
sign_bit = x;
|
||||
/* take the absolute value */
|
||||
x = pabs(x);
|
||||
|
||||
/* take the modulo */
|
||||
|
||||
/* extract the sign bit (upper one) */
|
||||
sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
|
||||
|
||||
/* scale by 4/Pi */
|
||||
y = pmul(x, p4f_cephes_FOPI);
|
||||
|
||||
/* store the integer part of y in mm0 */
|
||||
emm2 = _mm_cvttps_epi32(y);
|
||||
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||
emm2 = _mm_add_epi32(emm2, p4i_1);
|
||||
emm2 = _mm_and_si128(emm2, p4i_not1);
|
||||
y = _mm_cvtepi32_ps(emm2);
|
||||
/* get the swap sign flag */
|
||||
emm0 = _mm_and_si128(emm2, p4i_4);
|
||||
emm0 = _mm_slli_epi32(emm0, 29);
|
||||
/* get the polynom selection mask
|
||||
there is one polynom for 0 <= x <= Pi/4
|
||||
and another one for Pi/4<x<=Pi/2
|
||||
|
||||
Both branches will be computed.
|
||||
*/
|
||||
emm2 = _mm_and_si128(emm2, p4i_2);
|
||||
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||
|
||||
Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
|
||||
Packet4f poly_mask = _mm_castsi128_ps(emm2);
|
||||
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
|
||||
|
||||
/* The magic pass: "Extended precision modular arithmetic"
|
||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||
xmm1 = pmul(y, p4f_minus_cephes_DP1);
|
||||
xmm2 = pmul(y, p4f_minus_cephes_DP2);
|
||||
xmm3 = pmul(y, p4f_minus_cephes_DP3);
|
||||
x = padd(x, xmm1);
|
||||
x = padd(x, xmm2);
|
||||
x = padd(x, xmm3);
|
||||
|
||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||
y = p4f_coscof_p0;
|
||||
Packet4f z = _mm_mul_ps(x,x);
|
||||
|
||||
y = pmadd(y, z, p4f_coscof_p1);
|
||||
y = pmadd(y, z, p4f_coscof_p2);
|
||||
y = pmul(y, z);
|
||||
y = pmul(y, z);
|
||||
Packet4f tmp = pmul(z, p4f_half);
|
||||
y = psub(y, tmp);
|
||||
y = padd(y, p4f_1);
|
||||
|
||||
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||
|
||||
Packet4f y2 = p4f_sincof_p0;
|
||||
y2 = pmadd(y2, z, p4f_sincof_p1);
|
||||
y2 = pmadd(y2, z, p4f_sincof_p2);
|
||||
y2 = pmul(y2, z);
|
||||
y2 = pmul(y2, x);
|
||||
y2 = padd(y2, x);
|
||||
|
||||
/* select the correct result from the two polynoms */
|
||||
y2 = _mm_and_ps(poly_mask, y2);
|
||||
y = _mm_andnot_ps(poly_mask, y);
|
||||
y = _mm_or_ps(y,y2);
|
||||
/* update the sign */
|
||||
return _mm_xor_ps(y, sign_bit);
|
||||
return psin_float(_x);
|
||||
}
|
||||
|
||||
/* almost the same as psin */
|
||||
|
@ -158,9 +158,22 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
|
||||
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
|
||||
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
|
||||
template<> struct unpacket_traits<Packet4f> {
|
||||
typedef float type;
|
||||
typedef Packet4f half;
|
||||
typedef Packet4i integer_packet;
|
||||
enum {size=4, alignment=Aligned16};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet2d> {
|
||||
typedef double type;
|
||||
typedef Packet2d half;
|
||||
enum {size=2, alignment=Aligned16};
|
||||
};
|
||||
template<> struct unpacket_traits<Packet4i> {
|
||||
typedef int type;
|
||||
typedef Packet4i half;
|
||||
enum {size=4, alignment=Aligned16};
|
||||
};
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX
|
||||
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
|
||||
@ -184,6 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int fro
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
|
||||
|
||||
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
|
||||
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
|
||||
@ -338,6 +352,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
|
||||
@ -354,6 +370,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pshiftleft<Packet4i>(const Packet4i& a, int n) { return _mm_slli_epi32(a,n); }
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
|
||||
|
@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
|
||||
return _mm_cvtps_pd(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
|
||||
return _mm_castps_si128(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
|
||||
return _mm_castsi128_ps(a);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user