From e497a27ddc0d724bc0eadde1e211617b766dd100 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 30 Mar 2014 21:57:05 +0200 Subject: [PATCH] Optimize gebp kernel: 1 - increase peeling level along the depth dimention (+5% for large matrices, i.e., >1000) 2 - improve pipelining when dealing with latest rows of the lhs --- .../Core/products/GeneralBlockPanelKernel.h | 454 +++++++++++------- 1 file changed, 271 insertions(+), 183 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3ed1fc5a3..a9e42c8aa 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -95,6 +95,9 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) k = std::min(k, l1/kdiv); SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; if(_m @@ -328,6 +331,22 @@ protected: conj_helper cj; }; +template +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template +DoublePacket padd(const DoublePacket &a, const DoublePacket &b) +{ + DoublePacket res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + template class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { @@ -357,20 +376,16 @@ public: typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - struct DoublePacket - { - RealPacket first; - RealPacket second; - }; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; + typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef typename conditional::type AccPacket; + typedef typename conditional::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { p.first = pset1(RealScalar(0)); p.second = pset1(RealScalar(0)); @@ -383,7 +398,7 @@ public: } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); @@ -393,7 +408,7 @@ public: void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) { // FIXME not sure that's the best way to implement it! loadRhs(b+0, b0); @@ -419,7 +434,7 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -432,7 +447,7 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; @@ -571,6 +586,14 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + enum { Vectorizable = Traits::Vectorizable, @@ -591,6 +614,7 @@ void gebp_kernel Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; + SwappedTraits straits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; @@ -599,7 +623,9 @@ void gebp_kernel Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - const Index peeled_kc = (depth/4)*4; + enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) + const Index peeled_kc = depth & ~(pk-1); + const Index depth2 = depth & ~1; // loops on each micro vertical panel of rhs (depth x nr) // First pass using depth x 8 panels @@ -634,14 +660,14 @@ void gebp_kernel // uncomment for register prefetching // LhsPacket A1; // traits.loadLhs(blA, A0); - for(Index k=0; k EIGEN_GEBGP_ONESTEP8(1,A0,A1); EIGEN_GEBGP_ONESTEP8(2,A1,A0); EIGEN_GEBGP_ONESTEP8(3,A0,A1); + EIGEN_GEBGP_ONESTEP8(4,A1,A0); + EIGEN_GEBGP_ONESTEP8(5,A0,A1); + EIGEN_GEBGP_ONESTEP8(6,A1,A0); + EIGEN_GEBGP_ONESTEP8(7,A0,A1); - blB += 4*8*RhsProgress; - blA += 4*mr; + blB += pk*8*RhsProgress; + blA += pk*mr; } // process remaining peeled loop for(Index k=peeled_kc; k pstoreu(r0+5*resStride, R5); pstoreu(r0+6*resStride, R6); pstoreu(r0+7*resStride, R0); - } - for(Index i=peeled_mc; i SwappedTraits; - typedef typename SwappedTraits::ResScalar SResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - SwappedTraits straits; - - SAccPacket C0; - straits.initAcc(C0); - for(Index k=0; k(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); - } - else - { - // gets a 1 x 8 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0); - - for(Index k=0; k SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + SwappedTraits straits; + + Index rows2 = (rows & ~1); + for(Index i=peeled_mc; i(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(padd(C0,C1), alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + R = pgather(&res[j2*resStride + i + 1], resStride); + straits.acc(padd(C2,C3), alphav, R); + pscatter(&res[j2*resStride + i + 1], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 8"); + } + if(rows2!=rows) + { + Index i = rows-1; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; + + EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8"); + + SAccPacket C0,C1; + straits.initAcc(C0); // even + straits.initAcc(C1); // odd + + for(Index k=0; k(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(padd(C0,C1), alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + } + } + else + { + // Pure scalar path + for(Index i=peeled_mc; i=4) { for(Index j2=packet_cols8; j2 for(Index i=0; i // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*4]; LhsPacket A0; - // uncomment for register prefetching - // LhsPacket A1; - // traits.loadLhs(blA, A0); - for(Index k=0; k traits.broadcastRhs(&blB[2+4*K*RhsProgress], B_0, B1); \ traits.madd(A0, B_0,C2, B_0); \ traits.madd(A0, B1, C3, B1) - + EIGEN_GEBGP_ONESTEP4(0); EIGEN_GEBGP_ONESTEP4(1); EIGEN_GEBGP_ONESTEP4(2); EIGEN_GEBGP_ONESTEP4(3); + EIGEN_GEBGP_ONESTEP4(4); + EIGEN_GEBGP_ONESTEP4(5); + EIGEN_GEBGP_ONESTEP4(6); + EIGEN_GEBGP_ONESTEP4(7); - blB += 4*4*RhsProgress; - blA += 4*mr; + blB += pk*4*RhsProgress; + blA += pk*mr; } - // process remaining peeled loop + // process remaining of peeled loop for(Index k=peeled_kc; k for(Index i=peeled_mc; i SwappedTraits; - typedef typename SwappedTraits::ResScalar SResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - SwappedTraits straits; - - SAccPacket C0; - straits.initAcc(C0); - for(Index k=0; k(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); - } else { - // gets a 1 x 4 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0); + SLhsPacket A0; + straits.loadLhsUnaligned(blB, A0); + SRhsPacket B_0; + straits.loadRhs(&blA[k], B_0); + SRhsPacket T0; + straits.madd(A0,B_0,C0,T0); + blB += 4; + } + SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 1x4"); + } + else + { + // Pure scalar path + // gets a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); - for(Index k=0; k do the same but with nr==1 for(Index j2=packet_cols4; j2 traits.acc(C0, alphav, R0); pstoreu(r0, R0); } + // pure scalar path for(Index i=peeled_mc; i