mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-13 18:37:27 +08:00
Workaround alignment warnings
This commit is contained in:
parent
e497a27ddc
commit
1c0728043a
@ -87,15 +87,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<flo
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
|
||||
{
|
||||
// This should be optimized.
|
||||
__m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from);
|
||||
complex1 = _mm_movelh_ps(complex1, complex1);
|
||||
__m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1));
|
||||
complex2 = _mm_movelh_ps(complex2, complex2);
|
||||
__m256 result = _mm256_setzero_ps();
|
||||
result = _mm256_insertf128_ps(result, complex1, 0);
|
||||
result = _mm256_insertf128_ps(result, complex2, 1);
|
||||
return Packet4cf(result);
|
||||
// FIXME The following might be optimized using _mm256_movedup_pd
|
||||
Packet2cf a = ploaddup<Packet2cf>(from);
|
||||
Packet2cf b = ploaddup<Packet2cf>(from+1);
|
||||
return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
|
||||
@ -104,33 +99,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
|
||||
{
|
||||
return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
|
||||
std::imag(from[2*stride]), std::real(from[2*stride]),
|
||||
std::imag(from[1*stride]), std::real(from[1*stride]),
|
||||
std::imag(from[0*stride]), std::real(from[0*stride])));
|
||||
std::imag(from[2*stride]), std::real(from[2*stride]),
|
||||
std::imag(from[1*stride]), std::real(from[1*stride]),
|
||||
std::imag(from[0*stride]), std::real(from[0*stride])));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
|
||||
{
|
||||
__m128 low = _mm256_extractf128_ps(from.v, 0);
|
||||
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
|
||||
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
|
||||
|
||||
__m128 high = _mm256_extractf128_ps(from.v, 1);
|
||||
to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
|
||||
to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
|
||||
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
|
||||
{
|
||||
__m128 low = _mm256_extractf128_ps(a.v, 0);
|
||||
std::complex<float> res;
|
||||
_mm_storel_pi((__m64*)&res, low);
|
||||
return res;
|
||||
return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
|
||||
|
@ -173,6 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGE
|
||||
// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
|
||||
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
||||
{
|
||||
// FIXME we should only load the first 128bits
|
||||
Packet8f tmp = ploadu<Packet8f>(from);
|
||||
Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
|
||||
Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0));
|
||||
@ -181,6 +182,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
||||
// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1}
|
||||
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
|
||||
{
|
||||
// FIXME we should only load the first 128bits
|
||||
Packet4d tmp = ploadu<Packet4d>(from);
|
||||
Packet4d tmp1 = _mm256_permute_pd(tmp,0);
|
||||
Packet4d tmp2 = _mm256_permute_pd(tmp,3);
|
||||
@ -195,11 +197,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f&
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
|
||||
|
||||
// TODO: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
|
||||
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
|
||||
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
|
||||
{
|
||||
return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
|
||||
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
|
||||
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
|
||||
{
|
||||
|
@ -117,18 +117,15 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
|
||||
{
|
||||
return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
|
||||
std::imag(from[0*stride]), std::real(from[0*stride])));
|
||||
std::imag(from[0*stride]), std::real(from[0*stride])));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
|
||||
{
|
||||
/* for (int i = 0; i < 2; i+=2) {
|
||||
to[stride*i] = std::complex<float>(from.v[i], from.v[i+1]);
|
||||
}*/
|
||||
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
|
||||
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
|
Loading…
x
Reference in New Issue
Block a user