Add an additional step of Newton-Raphson for psqrt<double> on Arm, which otherwise has an error of ~1000 ulps.

This commit is contained in:
Rasmus Munk Larsen 2020-12-15 04:06:41 +00:00
parent cf0b5b0344
commit 6cee8d347e

View File

@ -3896,7 +3896,8 @@ template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
//the number 1.5f was set reference to Quake3's fast inverse square root //the number 1.5f was set reference to Quake3's fast inverse square root
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x)))); x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
// Do one more Newton's iteration to get more accurate result. // Do two more Newton's iteration to get a result accurate to 1 ulp.
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x)))); x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
// Flush results for denormals to zero. // Flush results for denormals to zero.
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask)); return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));