diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 3bba02278..f1a8d5707 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -371,6 +371,15 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }
 
+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_load_available, void>::type
+pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
+
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
 { return ploadu<Packet>(from); }
 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 9d13895e3..5011b98ea 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -431,6 +431,14 @@
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+}
+
 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
 template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 094309eda..c822f9352 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -576,6 +576,11 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
       reinterpret_cast<__m512i*>(to), from);
 }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
+}
 
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 200670b8c..e704a53ea 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -118,11 +118,17 @@ struct packet_helper
   template<typename T>
   inline Packet load(const T* from) const { return internal::pload<Packet>(from); }
 
+  template<typename T>
+  inline Packet loadu(const T* from) const { return internal::ploadu<Packet>(from); }
+
   template<typename T>
   inline Packet load(const T* from, unsigned long long umask) const { return internal::ploadu<Packet>(from, umask); }
 
   template<typename T>
   inline void store(T* to, const Packet& x) const { internal::pstore(to,x); }
+
+  template<typename T>
+  inline void store(T* to, const Packet& x, unsigned long long umask) const { internal::pstoreu(to, x, umask); }
 };
 
 template<typename Packet>
@@ -131,11 +137,17 @@ struct packet_helper<false,Packet>
   template<typename T>
   inline T load(const T* from) const { return *from; }
 
+  template<typename T>
+  inline T loadu(const T* from) const { return *from; }
+
   template<typename T>
   inline T load(const T* from, unsigned long long) const { return *from; }
 
   template<typename T>
   inline void store(T* to, const T& x) const { *to = x; }
+
+  template<typename T>
+  inline void store(T* to, const T& x, unsigned long long) const { *to = x; }
 };
 
 #define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
@@ -203,18 +215,31 @@ template<typename Scalar> void packetmath()
 
   if (internal::unpacket_traits<Packet>::masked_load_available)
   {
+    packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
     unsigned long long max_umask = (0x1ull << PacketSize);
+
     for (int offset=0; offset<PacketSize; ++offset)
     {
      for (unsigned long long umask=0; umask<max_umask; ++umask)
      {
-      packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
       h.store(data2, h.load(data1+offset, umask));
       for (int k=0; k<PacketSize; ++k)
         data3[k] = ((umask & ( 0x1ull << k)) >> k) ? data1[k+offset] : Scalar(0);
       VERIFY(areApprox(data3, data2, PacketSize) && "internal::ploadu masked");
      }
    }
+
+    for (int offset=0; offset<PacketSize; ++offset)
+    {
+     for (unsigned long long umask=0; umask<max_umask; ++umask)
+     {
+      internal::pstore(data2, internal::pset1<Packet>(Scalar(0)));
+      h.store(data2, h.loadu(data1+offset), umask);
+      for (int k=0; k<PacketSize; ++k)
+        data3[k] = ((umask & ( 0x1ull << k)) >> k) ? data1[k+offset] : Scalar(0);
+      VERIFY(areApprox(data3, data2, PacketSize) && "internal::pstoreu masked");
+     }
+    }
   }
 
   for (int offset=0; offset<PacketSize; ++offset)
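Note on the AVX path: unlike AVX512, where _mm512_mask_storeu_ps takes the
__mmask16 bit mask directly (which is why that overload is a two-liner), AVX has
no mask-register type, so the new pstoreu<float>(float*, const Packet8f&, uint8_t)
overload must first expand the 8-bit lane mask into the per-lane vector mask that
_mm256_maskstore_ps expects. The sketch below replays that expansion with raw
intrinsics outside of Eigen; maskstore8 and the driver in main are illustrative
names, and it assumes AVX2 for the 256-bit integer OR/compare intrinsics (the
patch itself goes through Eigen's por<Packet8i>/pcmp_eq<Packet8i> wrappers, which
abstract over the instruction set).

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Store only the lanes of `from` whose bit is set in `umask` (bit k -> lane k).
static inline void maskstore8(float* to, __m256 from, uint8_t umask) {
  // Broadcast umask into every byte, so each 32-bit lane holds four copies of it.
  __m256i mask = _mm256_set1_epi8(static_cast<char>(umask));
  // Lane k of bit_mask has every bit set except bit k. After the OR, lane k is
  // all-ones iff bit k of umask is set (the extra umask copies within each lane
  // are absorbed by bits that bit_mask already sets).
  const __m256i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef,
                                            0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
  mask = _mm256_or_si256(mask, bit_mask);
  // Compare with all-ones: selected lanes become 0xffffffff, the rest 0.
  // _mm256_maskstore_ps keys off the sign bit of each lane.
  mask = _mm256_cmpeq_epi32(mask, _mm256_set1_epi32(0xffffffff));
  _mm256_maskstore_ps(to, mask, from);
}

int main() {
  float out[8] = {0};
  __m256 v = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);  // lane k holds k+1
  maskstore8(out, v, 0x0b);                           // bits 0, 1, 3 set
  for (int i = 0; i < 8; ++i) std::printf("%g ", out[i]);  // prints: 1 2 0 4 0 0 0 0
  std::printf("\n");
  return 0;
}

This mirrors what the test hunk verifies: lane k of the packet reaches memory iff
bit k of umask is set, while unselected destination lanes keep their previous
contents (the test zeroes data2 with pset1 first, so it can compare unselected
lanes against Scalar(0)).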