diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index dc1a1d6b0..247ee4efd 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -262,10 +262,6 @@ template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { retu template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } -template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } -template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } -template<> EIGEN_STRONG_INLINE Packet8i plset(const int& a) { return _mm256_add_epi32(_mm256_set1_epi32(a), _mm256_set_epi32(7,6,5,4,3,2,1,0)); } - template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { @@ -278,6 +274,10 @@ template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return padd(pset1(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return padd(pset1(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet8i plset(const int& a) { return padd(pset1(a), _mm256_set_epi32(7,6,5,4,3,2,1,0)); } + template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { @@ -300,7 +300,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) } template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) { - return _mm256_sub_epi32(_mm256_set1_epi32(0), a); + return psub(pzero(a), a); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } @@ -419,7 +419,13 @@ template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const #endif } template<> EIGEN_STRONG_INLINE Packet8i pmin(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 return _mm256_min_epi32(a, b); +#else + __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif } template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { @@ -445,7 +451,13 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const #endif } template<> EIGEN_STRONG_INLINE Packet8i pmax(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 return _mm256_max_epi32(a, b); +#else + __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif } // Add specializations for min/max with prescribed NaN progation. @@ -641,17 +653,25 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // then we can perform a consistent permutation on the global register to get everything in shape: return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); } -// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} +// Loads 2 doubles from memory a returns the packet {a0, a0, a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) { Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); return _mm256_permute_pd(tmp, 3<<2); } -// Loads 4 integers from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} +// Loads 4 integers from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8i ploaddup(const int* from) { - Packet8i a = _mm256_castsi128_si256(pload(from)); +#ifdef EIGEN_VECTORIZE_AVX2 + const Packet8i a = _mm256_castsi128_si256(pload(from)); return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3)); +#else + __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2))); +#endif } // Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} @@ -662,7 +682,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) } template<> EIGEN_STRONG_INLINE Packet8i ploadquad(const int* from) { - return _mm256_inserti128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1); + return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1); } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } @@ -723,13 +743,13 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet8i& from, Index stride) { - __m128i low = _mm256_extracti128_si256(from, 0); + __m128i low = _mm256_extractf128_si256(from, 0); to[stride*0] = _mm_extract_epi32(low, 0); to[stride*1] = _mm_extract_epi32(low, 1); to[stride*2] = _mm_extract_epi32(low, 2); to[stride*3] = _mm_extract_epi32(low, 3); - __m128i high = _mm256_extracti128_si256(from, 1); + __m128i high = _mm256_extractf128_si256(from, 1); to[stride*4] = _mm_extract_epi32(high, 0); to[stride*5] = _mm_extract_epi32(high, 1); to[stride*6] = _mm_extract_epi32(high, 2); @@ -803,7 +823,13 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) } template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 return _mm256_abs_epi32(a); +#else + __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0)); + __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif } template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { @@ -989,16 +1015,27 @@ ptranspose(PacketBlock& kernel) { #define MM256_SHUFFLE_EPI32(A, B, M) \ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M)) +#ifdef EIGEN_VECTORIZE_AVX2 +#define MM256_UNPACKLO_EPI32(A, B) \ + _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B))) +#define MM256_UNPACKHI_EPI32(A, B) \ + _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B))) +#else +#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_ps(A, B) +#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_ps(A, B) +#endif + + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m256i T0 = _mm256_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); - __m256i T1 = _mm256_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); - __m256i T2 = _mm256_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); - __m256i T3 = _mm256_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); - __m256i T4 = _mm256_unpacklo_epi32(kernel.packet[4], kernel.packet[5]); - __m256i T5 = _mm256_unpackhi_epi32(kernel.packet[4], kernel.packet[5]); - __m256i T6 = _mm256_unpacklo_epi32(kernel.packet[6], kernel.packet[7]); - __m256i T7 = _mm256_unpackhi_epi32(kernel.packet[6], kernel.packet[7]); + __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]); + __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]); + __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]); + __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]); __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0)); __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2)); __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0)); @@ -1019,10 +1056,10 @@ ptranspose(PacketBlock& kernel) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m256i T0 = _mm256_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); - __m256i T1 = _mm256_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); - __m256i T2 = _mm256_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); - __m256i T3 = _mm256_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]); __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0)); __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2)); diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 6ce15c677..0810f66ee 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -1028,7 +1028,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons // AVX512F does not define _mm512_extracti32x8_epi32 to extract _m256i from _m512i #define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \ - __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0) \ + __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \ __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1) #else #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ @@ -1037,7 +1037,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons _mm512_extractf32x4_ps(INPUT, 1), 1); \ __m256 OUTPUT##_1 = _mm256_insertf128_ps( \ _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \ - _mm512_extractf32x4_ps(INPUT, 3), 1); + _mm512_extractf32x4_ps(INPUT, 3), 1) #define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \ __m256i OUTPUT##_0 = _mm256_insertf128_si256( \ @@ -1045,7 +1045,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons _mm512_extracti32x4_epi32(INPUT, 1), 1); \ __m256i OUTPUT##_1 = _mm256_insertf128_si256( \ _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \ - _mm512_extracti32x4_epi32(INPUT, 3), 1); + _mm512_extracti32x4_epi32(INPUT, 3), 1) #endif #ifdef EIGEN_VECTORIZE_AVX512DQ