mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-01-30 17:40:05 +08:00
bug #195 - fix this once and for all: just never use _mm_load_sd on gcc/i386, because it generates redundant x87 ops (fldl/fstpl)
This commit is contained in:
parent
a8f5ef9388
commit
b3544ce2ae
@ -237,6 +237,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
|
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
|
||||||
#else
|
#else
|
||||||
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
|
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
|
||||||
// require pointer casting to incompatible pointer types and leads to invalid code
|
// require pointer casting to incompatible pointer types and leads to invalid code
|
||||||
@ -247,26 +248,44 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
|
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
|
||||||
{
|
{
|
||||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
|
#if defined(__GNUC__) && defined(__i386__)
|
||||||
|
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
|
||||||
|
return _mm_loadu_ps(from);
|
||||||
|
#else
|
||||||
__m128d res;
|
__m128d res;
|
||||||
res = _mm_load_sd((const double*)(from)) ;
|
res = _mm_load_sd((const double*)(from)) ;
|
||||||
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
|
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
|
||||||
return _mm_castpd_ps(res);
|
return _mm_castpd_ps(res);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
|
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
|
||||||
{
|
{
|
||||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
|
#if defined(__GNUC__) && defined(__i386__)
|
||||||
|
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
|
||||||
|
return _mm_loadu_pd(from);
|
||||||
|
#else
|
||||||
__m128d res;
|
__m128d res;
|
||||||
res = _mm_load_sd(from) ;
|
res = _mm_load_sd(from) ;
|
||||||
res = _mm_loadh_pd(res,from+1);
|
res = _mm_loadh_pd(res,from+1);
|
||||||
return res;
|
return res;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||||
|
{
|
||||||
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
|
#if defined(__GNUC__) && defined(__i386__)
|
||||||
|
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
|
||||||
|
return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
|
||||||
|
#else
|
||||||
|
__m128d res;
|
||||||
|
res = _mm_load_sd((const double*)(from)) ;
|
||||||
|
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
|
||||||
|
return _mm_castpd_si128(res);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// bug 195: we used to have an optimized ploadu using _mm_load_sd/_mm_loadh_pd but that gave wrong results when some 64bit value,
|
|
||||||
// interpreted as double, was a NaN
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
|
|
||||||
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
||||||
{
|
{
|
||||||
return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1);
|
return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1);
|
||||||
|
Loading…
Reference in New Issue
Block a user