From 11fbdcbc385917f44b7b01671e158d07a695eb00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 16:04:30 +0200 Subject: [PATCH] Fix and optimize mixed products --- .../Core/products/GeneralBlockPanelKernel.h | 166 ++++++++++-------- 1 file changed, 92 insertions(+), 74 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index df30fdd3e..dcc0b4a0d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -180,14 +180,15 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = 4,//NumberOfRegisters/4, + // register block size along the N direction must be 1 or 4 + nr = 4, // register block size along the M direction (currently, this one cannot be modified) #ifdef __FMA__ + // we assume 16 registers mr = 3*LhsPacketSize, #else - mr = 2*LhsPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #endif LhsProgress = LhsPacketSize, @@ -209,15 +210,15 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - pbroadcast2(b, b0, b1); - } +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) +// { +// pbroadcast4(b, b0, b1, b2, b3); +// } +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const @@ -290,8 +291,13 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/2, - mr = LhsPacketSize, + nr = 4, +#ifdef __FMA__ + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -332,15 +338,15 @@ public: dest = ploadu(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - pbroadcast2(b, b0, b1); - } +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) +// { +// pbroadcast4(b, b0, b1, b2, b3); +// } +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -566,7 +572,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters nr = 4, - mr = ResPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -593,19 +599,25 @@ public: } // linking error if instantiated without being optimized out: - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - // FIXME not sure that's the best way to implement it! - b0 = pload1(b+0); - b1 = pload1(b+1); - } +// void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! +// b0 = pload1(b+0); +// b1 = pload1(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { @@ -619,7 +631,13 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_VECTORIZE_FMA + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -956,7 +974,7 @@ void gebp_kernel for(Index k=0; k for(Index k=0; k EIGEN_GEBGP_ONESTEP(7); blB += pk*4*RhsProgress; - blA += pk*(1*Traits::LhsProgress); + blA += pk*1*LhsProgress; IACA_END } // process remaining peeled loop @@ -1169,7 +1187,7 @@ void gebp_kernel RhsPacket B_0, B1; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; - blA += 1*Traits::LhsProgress; + blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP @@ -1439,6 +1457,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? (rows/Pack2)*Pack2 : 0; // Pack 3 packets if(Pack1>=3*PacketSize) @@ -1496,16 +1516,20 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=Pack2) -// { -// if(PanelMode) count += Pack2*offset; -// for(Index k=0; k1) + { + for(Index i=peeled_mc1; i=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - int pack_packets = Pack1/PacketSize; + int pack = Pack1; Index i = 0; - while(pack_packets>0) + while(pack>0) { - Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/(pack_packets*PacketSize))*(pack_packets*PacketSize); -// std::cout << "pack_packets = " << pack_packets << " from " << i << " to " << peeled_mc << "\n"; - for(; i=PacketSize) { - for (Index m = 0; m < (pack_packets*PacketSize); m += PacketSize) + for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack_packets*PacketSize)*p, cj.pconj(kernel.packet[p])); + for (Index m = 0; m < pack; m += PacketSize) + { + Kernel kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; } - count += PacketSize*(pack_packets*PacketSize); } for(; k=Pack2) -// { -// if(PanelMode) count += Pack2*offset; -// for(Index k=0; k