diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index dba1d5a42..11e4165a2 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -558,7 +558,21 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/** \internal \returns \a a with last coefficients replaced by the scalar b */ +/** \internal \returns \a a with the first coefficient replaced by the scalar b */ +template EIGEN_DEVICE_FUNC inline Packet +pinsertfirst(const Packet& a, typename unpacket_traits::type b) +{ + // Default implementation based on pblend. + // It must be specialized for higher performance. + Selector::size> mask; + mask.select[0] = true; + // This for loop should be optimized away by the compiler. + for(Index i=1; i::size; ++i) + mask.select[i] = false; + return pblend(mask, pset1(b), a); +} + +/** \internal \returns \a a with the last coefficient replaced by the scalar b */ template EIGEN_DEVICE_FUNC inline Packet pinsertlast(const Packet& a, typename unpacket_traits::type b) { diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index b16e0ddd4..99439c8aa 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -456,6 +456,26 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,(1<<7)|(1<<6))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,(1<<3)|(1<<2))); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 0efdde722..bfe025a81 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -604,6 +604,16 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); } +template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1(b),1); +} + +template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),1); +} + template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) { return _mm256_blend_ps(a,pset1(b),(1<<7)); diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index fd7f4d740..5607fe0ab 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -476,6 +476,26 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, co return Packet2cf(_mm_castpd_ps(result)); } +template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) +{ + return pset1(b); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) +{ + return pset1(b); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b519e3748..6f31cf12b 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -818,6 +818,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons #endif } +template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),1); +#else + return _mm_move_ss(a, _mm_load_ss(&b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),1); +#else + return _mm_move_sd(a, _mm_load_sd(&b)); +#endif +} + template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) { #ifdef EIGEN_VECTORIZE_SSE4_1 diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a53786ae5..43b62a03f 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -16,6 +16,12 @@ #endif // using namespace Eigen; +#ifdef EIGEN_VECTORIZE_SSE +const bool g_vectorize_sse = true; +#else +const bool g_vectorize_sse = false; +#endif + namespace Eigen { namespace internal { template T negate(const T& x) { return -x; } @@ -290,7 +296,17 @@ template void packetmath() } } - if (PacketTraits::HasBlend) { + if (PacketTraits::HasBlend || g_vectorize_sse) { + // pinsertfirst + for (int i=0; i(); + ref[0] = s; + internal::pstore(data2, internal::pinsertfirst(internal::pload(data1),s)); + VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertfirst"); + } + + if (PacketTraits::HasBlend || g_vectorize_sse) { // pinsertlast for (int i=0; i