From 29aebf96e62f4fb5e4b1f3fb475e299df2e7a02e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:18:44 -0700 Subject: [PATCH] Created the pblend packet primitive and implemented it using SSE and AVX instructions. --- Eigen/src/Core/GenericPacketMath.h | 14 ++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 15 ++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 8 +++++- Eigen/src/Core/arch/SSE/PacketMath.h | 41 ++++++++++++++++++++++++++-- test/packetmath.cpp | 16 +++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..0869dd49f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -429,6 +430,19 @@ ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template struct Selector { + bool select[N]; +}; + +template EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8b8307d75..688ff91e4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -59,6 +59,7 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 0, HasSqrt = 0 + HasBlend = 1, }; }; template<> struct packet_traits : default_packet_traits @@ -73,6 +74,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 0 + HasBlend = 1, }; }; @@ -557,6 +559,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 758183c18..0bc03cf9e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6912f3bc3..1124b24df 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -123,7 +124,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -809,6 +813,37 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..663ab886d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits::HasBlend) { + Packet thenPacket = internal::pload(data1); + Packet elsePacket = internal::pload(data2); + EIGEN_ALIGN_DEFAULT internal::Selector selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); + } + } } template void packetmath_real()