mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-07 18:27:40 +08:00
Add Neon psqrt<Packet2d> and pexp<Packet2d>
This commit is contained in:
parent
5272106826
commit
85428a3440
@ -742,6 +742,12 @@ pfrexp_float(const Packet& a, Packet& exponent);
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pldexp_float(Packet a, Packet exponent);
|
||||
|
||||
/** Default implementation of pldexp for double.
|
||||
* It is expected to be called by implementers of template<> pldexp.
|
||||
*/
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pldexp_double(Packet a, Packet exponent);
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -39,6 +39,16 @@ pldexp_float(Packet a, Packet exponent)
|
||||
return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
|
||||
}
|
||||
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet
|
||||
pldexp_double(Packet a, Packet exponent)
|
||||
{
|
||||
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
||||
const Packet cst_1023 = pset1<Packet>(1023.0);
|
||||
// return a * 2^exponent
|
||||
PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_1023));
|
||||
return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
|
||||
}
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
|
@ -44,6 +44,15 @@ BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
|
||||
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
|
||||
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
|
||||
|
||||
|
||||
//---------- double ----------
|
||||
|
||||
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& x)
|
||||
{ return pexp_double(x); }
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -3580,8 +3580,8 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 0,
|
||||
HasExp = 0,
|
||||
HasSqrt = 0,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasTanh = 0,
|
||||
HasErf = 0
|
||||
};
|
||||
@ -3750,6 +3750,38 @@ ptranspose(PacketBlock<Packet2d, 2>& kernel)
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
|
||||
{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
|
||||
{ return pldexp_double(a, exponent); }
|
||||
|
||||
#if EIGEN_FAST_MATH
|
||||
|
||||
// Functions for sqrt support packet2d.
|
||||
// The EIGEN_FAST_MATH version uses the vrsqrte_f64 approximation and one step
|
||||
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
|
||||
// exact solution. It does not handle +inf, or denormalized numbers correctly.
|
||||
// The main advantage of this approach is not just speed, but also the fact that
|
||||
// it can be inlined and pipelined with other computations, further reducing its
|
||||
// effective latency. This is similar to Quake3's fast inverse square root.
|
||||
// For more details see: http://www.beyond3d.com/content/articles/8/
|
||||
template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){
|
||||
Packet2d half = vmulq_n_f64(_x, 0.5);
|
||||
Packet2ul denormal_mask = vandq_u64(vcgeq_f64(_x, vdupq_n_f64(0.0)),
|
||||
vcltq_f64(_x, pset1<Packet2d>((std::numeric_limits<double>::min)())));
|
||||
// Compute approximate reciprocal sqrt.
|
||||
Packet2d x = vrsqrteq_f64(_x);
|
||||
// Do a single step of Newton's iteration.
|
||||
//the number 1.5f was set reference to Quake3's fast inverse square root
|
||||
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
|
||||
// Do one more Newton's iteration to get more accurate result.
|
||||
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
|
||||
// Flush results for denormals to zero.
|
||||
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
|
||||
}
|
||||
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrt_f64(_x); }
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_ARCH_ARM64
|
||||
|
||||
} // end namespace internal
|
||||
|
Loading…
Reference in New Issue
Block a user