From fa5a8f055aebbf4f39fca26e857351103fab4d11 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 29 Apr 2016 13:30:13 -0700
Subject: [PATCH] Implemented palign_impl for AVX512

---
 Eigen/src/Core/arch/AVX512/PacketMath.h | 38 ++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 8392c4673..68e86ae7c 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -54,7 +54,6 @@ template<> struct packet_traits<float> : default_packet_traits
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 1,
-    HasDiv = 1,
 #if EIGEN_GNUC_AT_LEAST(5, 3)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,
@@ -63,6 +62,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasSqrt = 1,
     HasRsqrt = 1,
 #endif
+    HasDiv = 1
   };
 };
 template<> struct packet_traits<double> : default_packet_traits
@@ -997,9 +997,26 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
 
 template <int Offset>
 struct palign_impl<Offset, Packet16f> {
-  static EIGEN_STRONG_INLINE void run(Packet16f& first, const Packet16f& second) {
+  static EIGEN_STRONG_INLINE void run(Packet16f& first,
+                                      const Packet16f& second) {
     if (Offset != 0) {
-      assert(false && "To be implemented");
+      __m512i first_idx = _mm512_set_epi32(
+          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
+          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
+          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
+
+      __m512i second_idx =
+          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
+                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
+                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
+                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
+
+      unsigned short mask = 0xFFFF;
+      mask <<= (16 - Offset);
+
+      first = _mm512_permutexvar_ps(first_idx, first);
+      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
+      first = _mm512_mask_blend_ps(mask, first, tmp);
     }
   }
 };
@@ -1007,7 +1024,20 @@ template <int Offset>
 struct palign_impl<Offset, Packet8d> {
   static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
     if (Offset != 0) {
-      assert(false && "To be implemented");
+      __m512i first_idx = _mm512_set_epi32(
+          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
+          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
+
+      __m512i second_idx = _mm512_set_epi32(
+          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
+          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
+
+      unsigned char mask = 0xFF;
+      mask <<= (8 - Offset);
+
+      first = _mm512_permutexvar_pd(first_idx, first);
+      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
+      first = _mm512_mask_blend_pd(mask, first, tmp);
     }
   }
 };