From d1dc088ef045dcee5747b5c722f5f4f6bb58e2d1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Aug 2009 11:09:34 +0200 Subject: [PATCH] * implement a second level of micro blocking (faster for small sizes) * workaround GCC bad implementation of _mm_set1_p* --- Eigen/src/Core/arch/SSE/PacketMath.h | 15 ++ .../Core/products/GeneralBlockPanelKernel.h | 135 +++++++++++++++++- .../Core/products/SelfadjointMatrixMatrix.h | 50 ++++--- 3 files changed, 174 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3f1fd8ef5..3fd33afbf 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -74,8 +74,23 @@ template<> struct ei_unpacket_traits { typedef float type; enum {size template<> struct ei_unpacket_traits { typedef double type; enum {size=2}; }; template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; +#ifdef __GNUC__ +// Sometimes GCC implements _mm_set1_p* using multiple moves, +// that is inefficient :( +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { + Packet4f res = _mm_set_ss(from); + asm("shufps $0, %[x], %[x]" : [x] "+x" (res) : ); + return res; +} +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1(const double& from) { + Packet2d res = _mm_set_sd(from); + asm("unpcklpd %[x], %[x]" : [x] "+x" (res) : ); + return res; +} +#else template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { return _mm_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d ei_pset1(const double& from) { return _mm_set1_pd(from); } +#endif template<> EIGEN_STRONG_INLINE Packet4i ei_pset1(const int& from) { return _mm_set1_epi32(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 129fd86e7..fe1987bdd 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -39,11 +39,15 @@ struct ei_gebp_kernel if(strideB==-1) strideB = depth; Conj cj; int packet_cols = (cols/nr) * nr; - const int peeled_mc = (rows/mr)*mr; - // loops on each cache friendly block of the result/rhs + const int peeled_mc = (rows/mr)*mr; + const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); + const int peeled_kc = (depth/4)*4; + // loops on each micro vertical panel of rhs (depth x nr) for(int j2=0; j2 we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. for(int i=0; i=PacketSize) + { + int i = peeled_mc; + const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // gets res block as register + PacketType C0, C1, C2, C3; + C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + C1 = ei_ploadu(&res[(j2+1)*resStride + i]); + if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); + if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + + // performs "inner" product + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + for(int k=0; k=PacketSize) + { + int i = peeled_mc; + const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; + for(int k=0; k::size }; ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if::IsComplex && Conjugate> cj; ei_const_blas_data_mapper lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (rows/mr)*mr; + int peeled_mc = (rows/mr)*mr; for(int i=0; i=PacketSize) + { + if(PanelMode) count += PacketSize*offset; + for(int k=0; k struct ei_gemm_pack_rhs { + typedef typename ei_packet_traits::type Packet; enum { PacketSize = ei_packet_traits::size }; void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride=0, int offset=0) @@ -354,6 +474,7 @@ struct ei_gemm_pack_rhs // skip what we have after if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } + // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2 struct ei_symm_pack_lhs { + enum { PacketSize = ei_packet_traits::size }; + template inline + void pack(Scalar* blockA, const ei_const_blas_data_mapper& lhs, int cols, int i, int& count) + { + // normal copy + for(int k=0; k lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (rows/mr)*mr; + int peeled_mc = (rows/mr)*mr; for(int i=0; i(blockA, lhs, cols, i, count); + } + + if(rows-peeled_mc>=PacketSize) + { + pack(blockA, lhs, cols, peeled_mc, count); + peeled_mc += PacketSize; } // do the same with mr==1