From 35d149e34caabc8ca77ef908d6024f32d84b7ff4 Mon Sep 17 00:00:00 2001 From: Pedro Caldeira Date: Fri, 21 Aug 2020 17:52:34 -0500 Subject: [PATCH] Add missing functions for Packet8bf in Altivec architecture. Including new tests for bfloat16 Packets. Fix prsqrt on GenericPacketMath. --- Eigen/src/Core/GenericPacketMath.h | 3 +- Eigen/src/Core/arch/AltiVec/PacketMath.h | 41 ++++++++- test/packetmath.cpp | 101 ++++++++++++++++++++++- test/packetmath_test_shared.h | 1 - 4 files changed, 141 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 5c23b4b71..d63c8aaf4 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -612,7 +612,8 @@ Packet psqrt(const Packet& a) { EIGEN_USING_STD_MATH(sqrt); return sqrt(a); } /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet prsqrt(const Packet& a) { - return pdiv(pset1(1), psqrt(a)); + typedef typename internal::unpacket_traits::type Scalar; + return pdiv(pset1(Scalar(1)), psqrt(a)); } /** \internal \returns the rounded value of \a a (coeff-wise) */ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index cfff6691e..c98954393 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -646,6 +646,11 @@ template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(from, stride); } +template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather(const bfloat16* from, Index stride) +{ + return pgather_size8(from, stride); +} + template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) { EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; @@ -724,6 +729,11 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter pscatter_size8(to, from, stride); } +template<> EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + template EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) { EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; @@ -1285,7 +1295,30 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); } - +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf psin (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcos (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf plog (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pfloor (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pfloor, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pceil (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pceil, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pround (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pround, a); +} template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { Packet4f a_even = Bf16ToF32Even(a); Packet4f a_odd = Bf16ToF32Odd(a); @@ -1325,6 +1358,12 @@ template<> EIGEN_STRONG_INLINE Packet8bf ploaddup(const bfloat16* return ploaddup(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3), + bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) }; + return padd(pset1(a), pload(countdown)); +} + template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { return pfrexp_float(a,exponent); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index c8ea3139e..3d8fbafc7 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -707,6 +707,62 @@ void packetmath_real() { } } +#define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \ + test::packet_helper h; \ + for (int i=0; i(data1[i]))); \ + h.store(data2, POP(h.load(data1))); \ + VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \ +} + +template <> +void packetmath_real::type>(){ + typedef internal::packet_traits PacketTraits; + typedef internal::packet_traits::type Packet; + + const int PacketSize = internal::unpacket_traits::size; + const int size = PacketSize * 4; + EIGEN_ALIGN_MAX bfloat16 data1[PacketSize * 4]; + EIGEN_ALIGN_MAX bfloat16 data2[PacketSize * 4]; + EIGEN_ALIGN_MAX bfloat16 ref[PacketSize * 4]; + + for (int i = 0; i < size; ++i) { + data1[i] = bfloat16(internal::random(0, 1) * std::pow(float(10), internal::random(-6, 6))); + data2[i] = bfloat16(internal::random(0, 1) * std::pow(float(10), internal::random(-6, 6))); + data1[i] = bfloat16(0); + } + + if (internal::random(0, 1) < 0.1f) data1[internal::random(0, PacketSize)] = bfloat16(0); + + CAST_CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog, bfloat16, float); + CAST_CHECK_CWISE1_IF(PacketTraits::HasRsqrt, float(1) / std::sqrt, internal::prsqrt, bfloat16, float); + + for (int i = 0; i < size; ++i) { + data1[i] = bfloat16(internal::random(-1, 1) * std::pow(float(10), internal::random(-3, 3))); + data2[i] = bfloat16(internal::random(-1, 1) * std::pow(float(10), internal::random(-3, 3))); + } + CAST_CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin, bfloat16, float); + CAST_CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos, bfloat16, float); + CAST_CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan, bfloat16, float); + + CAST_CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround, bfloat16, float); + CAST_CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil, bfloat16, float); + CAST_CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor, bfloat16, float); + + for (int i = 0; i < size; ++i) { + data1[i] = bfloat16(-1.5 + i); + data2[i] = bfloat16(-1.5 + i); + } + CAST_CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround, bfloat16, float); + + for (int i = 0; i < size; ++i) { + data1[i] = bfloat16(internal::random(-87, 88)); + data2[i] = bfloat16(internal::random(-87, 88)); + } + CAST_CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp, bfloat16, float); + +} + template void packetmath_notcomplex() { typedef internal::packet_traits PacketTraits; @@ -761,6 +817,47 @@ void packetmath_notcomplex() { } } +template <> +void packetmath_notcomplex::type>(){ + typedef bfloat16 Scalar; + typedef internal::packet_traits::type Packet; + typedef internal::packet_traits PacketTraits; + const int PacketSize = internal::unpacket_traits::size; + + EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4]; + EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4]; + EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4]; + Array::Map(data1, PacketSize * 4).setRandom(); + + ref[0] = data1[0]; + for (int i = 0; i < PacketSize; ++i) ref[0] = (std::min)(ref[0], data1[i]); + VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload(data1))) && "internal::predux_min"); + + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax); + + CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin); + CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax); + CHECK_CWISE1(numext::abs, internal::pabs); + CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff); + + ref[0] = data1[0]; + for (int i = 0; i < PacketSize; ++i) ref[0] = (std::max)(ref[0], data1[i]); + VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload(data1))) && "internal::predux_max"); + + { + unsigned char* data1_bits = reinterpret_cast(data1); + // predux_any + for (unsigned int i = 0; i < PacketSize * sizeof(Scalar); ++i) data1_bits[i] = 0x0; + VERIFY((!internal::predux_any(internal::pload(data1))) && "internal::predux_any(0000)"); + for (int k = 0; k < PacketSize; ++k) { + for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0xff; + VERIFY(internal::predux_any(internal::pload(data1)) && "internal::predux_any(0101)"); + for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0x00; + } + } +} + template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { const int PacketSize = internal::unpacket_traits::size; @@ -819,7 +916,7 @@ void packetmath_scatter_gather() { typedef typename NumTraits::Real RealScalar; const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX Scalar data1[PacketSize]; - RealScalar refvalue = 0; + RealScalar refvalue = RealScalar(0); for (int i = 0; i < PacketSize; ++i) { data1[i] = internal::random() / RealScalar(PacketSize); } @@ -900,7 +997,7 @@ EIGEN_DECLARE_TEST(packetmath) { CALL_SUBTEST_12(test::runner >::run()); CALL_SUBTEST_13((packetmath::type>())); CALL_SUBTEST_14((packetmath::type>())); - CALL_SUBTEST_15((packetmath::type>())); + CALL_SUBTEST_15(test::runner::run()); g_first_pass = false; } } diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h index 5be10997a..7b8caedcb 100644 --- a/test/packetmath_test_shared.h +++ b/test/packetmath_test_shared.h @@ -208,7 +208,6 @@ struct runner { static void run() { runall::run(); - runall::run(); } };