diff --git a/Eigen/Core b/Eigen/Core
index 999bb96f2..cc05c308e 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -1,13 +1,18 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
 
+#define EIGEN_WIP_PRODUCT
+
 #ifndef EIGEN_DONT_VECTORIZE
 #if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>=4 && __GNUC_MINOR__>=2)))
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #include <emmintrin.h>
 #include <xmmintrin.h>
+// SSE3: #include <pmmintrin.h>
+// SSSE3:
+//#include <tmmintrin.h>
 #endif
 #ifdef __ALTIVEC__
 // There are zero chances of both __SSE2__ AND __ALTIVEC__ been defined
 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 247a46b40..00bb375a9 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,7 +71,7 @@ struct ei_traits >
                       || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
                     ? ~LargeBit
                     : ~(unsigned int)0,
-    Flags = MatrixType::Flags & DefaultLostFlagMask & FlagsMaskLargeBit,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
@@ -146,13 +146,13 @@ template class Block
     template PacketScalar _packetCoeff(int row, int col) const
     {
-      return m_matrix.packetCoeff(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.template packetCoeff(row + m_startRow.value(), col + m_startCol.value());
     }
 
     template void _writePacketCoeff(int row, int col, const PacketScalar& x)
     {
-      m_matrix.const_cast_derived().writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x);
+      m_matrix.const_cast_derived().template writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x);
     }
 
   protected:
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 266da0cfe..e9f65cd7a 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -47,7 +47,7 @@ struct ei_traits >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags & DefaultLostFlagMask,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit),
     CoeffReadCost = NumTraits::ReadCost
   };
 };
diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
index 59c143ede..267192482 100644
--- a/Eigen/src/Core/PacketMath.h
+++ b/Eigen/src/Core/PacketMath.h
@@ -71,10 +71,10 @@ template inline void ei_pstoreu(Scalar* to, const Scalar& from
 template inline Scalar ei_pfirst(const Scalar& a) { return a; }
 
 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-// template inline Scalar ei_predux(const Scalar* vecs) { return vecs[0]; }
+template inline Scalar ei_preduxp(const Scalar* vecs) { return vecs[0]; }
 
 /** \internal \returns the sum of the elements of \a a*/
-// template inline Scalar ei_predux(const Scalar& a) { return a; }
+template inline Scalar ei_predux(const Scalar& a) { return a; }
 
 #endif // EIGEN_PACKET_MATH_H
diff --git a/Eigen/src/Core/PacketMath_SSE.h b/Eigen/src/Core/PacketMath_SSE.h
index 1872affea..dd467f52a 100644
--- a/Eigen/src/Core/PacketMath_SSE.h
+++ b/Eigen/src/Core/PacketMath_SSE.h
@@ -102,14 +102,19 @@ inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
 
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
 }
+// SSSE3 version:
+// inline __m128i ei_preduxp(const __m128i* vecs)
+// {
+//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+// }
 
 inline float ei_predux(const __m128& a)
 {
@@ -118,6 +123,13 @@ inline float ei_predux(const __m128& a)
 }
 inline double ei_predux(const __m128d& a)
 { return ei_pfirst(_mm_hadd_pd(a, a)); }
+
+// SSSE3 version:
+// inline float ei_predux(const __m128i& a)
+// {
+//   __m128i tmp0 = _mm_hadd_epi32(a,a);
+//   return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0));
+// }
 #else
 // SSE2 versions
 inline float ei_predux(const __m128& a)
@@ -130,7 +142,7 @@ inline double ei_predux(const __m128d& a)
   return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
 }
 
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   __m128 tmp0, tmp1, tmp2;
   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
@@ -144,11 +156,31 @@ inline __m128 ei_predux(const __m128* vecs)
   return _mm_add_ps(tmp0, tmp2);
 }
 
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
 }
 #endif // SSE3
 
+inline int ei_predux(const __m128i& a)
+{
+  __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+
+inline __m128i ei_preduxp(const __m128i* vecs)
+{
+  __m128i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+
 #endif // EIGEN_PACKET_MATH_SSE_H
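Note on the renamed primitives above: ei_predux(packet) still returns the scalar sum of one packet, while the multi-packet overload is now spelled ei_preduxp and returns a packet of per-input sums. The following scalar sketch is a reading aid only and not part of the patch; Packet4f and the *_ref names are invented for the illustration, and 4-wide float packets are assumed.

#include <array>

// Made-up scalar model of the two reduction primitives.
using Packet4f = std::array<float, 4>;

// ei_predux: sum of the elements of one packet.
float predux_ref(const Packet4f& a)
{
  return a[0] + a[1] + a[2] + a[3];
}

// ei_preduxp: packet whose i-th element is the sum of the elements of vecs[i].
Packet4f preduxp_ref(const Packet4f* vecs)
{
  Packet4f res;
  for (int i = 0; i < 4; ++i)
    res[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3];
  return res;
}

The product kernel below relies on ei_preduxp to collapse several per-row accumulators into a single result packet, and falls back to ei_predux when it has to finish the leftover rows one at a time.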
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index c1b2f5457..66c6d4d5b 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -107,6 +107,12 @@ struct ei_packet_product_unroller
+struct ei_packet_product_unroller
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h
index 57bb899d6..a5fc9d298 100644
--- a/Eigen/src/Core/ProductWIP.h
+++ b/Eigen/src/Core/ProductWIP.h
@@ -73,7 +73,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packetCoeff(Index, col), res);
   }
 };
 
@@ -83,7 +83,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    res = ei_pmadd(lhs.template packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
@@ -92,7 +92,7 @@ struct ei_packet_product_unroller
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packetCoeff(0, col));
   }
 };
 
@@ -101,7 +101,7 @@ struct ei_packet_product_unroller
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };
 
@@ -111,6 +111,12 @@ struct ei_packet_product_unroller
+struct ei_packet_product_unroller
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
@@ -139,16 +145,53 @@ template struct ei_product_eval_mode
 {
   enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
               ? CacheOptimalProduct : NormalProduct };
 };
 
+template class ei_product_eval_to_column_major
+{
+    typedef typename ei_traits::Scalar _Scalar;
+    enum {_MaxRows = ei_traits::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits::MaxColsAtCompileTime,
+          _Flags = ei_traits::Flags
+    };
+
+  public:
+    typedef Matrix<_Scalar,
+              ei_traits::RowsAtCompileTime,
+              ei_traits::ColsAtCompileTime,
+              ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
+              ei_traits::MaxRowsAtCompileTime,
+              ei_traits::MaxColsAtCompileTime> type;
+};
+
+template struct ei_product_nested_rhs
+{
+  typedef typename ei_meta_if<
+    ei_is_temporary::ret && !(ei_traits::Flags & RowMajorBit),
+    T,
+    typename ei_meta_if<
+         (ei_traits::Flags & EvalBeforeNestingBit)
+      || (ei_traits::Flags & RowMajorBit)
+      || (!(ei_traits::Flags & ReferencableBit))
+      || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
+      typename ei_product_eval_to_column_major::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
 template struct ei_traits >
 {
   typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested::type LhsNested;
-  typedef typename ei_nested::type RhsNested;
+  // the cache friendly product evals lhs once only
+  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
+  typedef typename ei_nested::type LhsNested;
+  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
+  typedef typename ei_meta_if::type,
+      typename ei_nested::type>::ret RhsNested;
   typedef typename ei_unref::type _LhsNested;
   typedef typename ei_unref::type _RhsNested;
   enum {
@@ -160,16 +203,20 @@ struct ei_traits >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
+    // the vectorization flags are only used by the normal product,
+    // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits::size == 0),
-    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
+    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0,
     _RowMajor = (RhsFlags & RowMajorBit)
              && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
     _LostBits = DefaultLostFlagMask & ~(
                   (_RowMajor ? 0 : RowMajorBit)
                 | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
     Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
-//          | EvalBeforeAssigningBit //FIXME
+          #ifndef EIGEN_WIP_PRODUCT_DIRTY
+          | EvalBeforeAssigningBit //FIXME
+          #endif
           | EvalBeforeNestingBit
           | (_Vectorizable ? VectorizableBit : 0),
     CoeffReadCost
@@ -197,11 +244,16 @@ template class Product : ei_no_assignm
     PacketSize = ei_packet_traits::size,
     #if (defined __i386__)
     // i386 architectures provides only 8 xmmm register,
-    // so let's reduce the max number of rows processed at once
+    // so let's reduce the max number of rows processed at once.
+    // NOTE that so far the maximal supported value is 8.
     MaxBlockRows = 4,
+    MaxBlockRows_ClampingMask = 0xFFFFFC,
     #else
     MaxBlockRows = 8,
+    MaxBlockRows_ClampingMask = 0xFFFFF8,
     #endif
+    // maximal size of the blocks fitted in L2 cache
+    MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
   };
 
   Product(const Lhs& lhs, const Rhs& rhs)
@@ -214,16 +266,6 @@ template class Product : ei_no_assignm
     template
     void _cacheFriendlyEval(DestDerived& res) const;
 
-    /** \internal */
-    template
-    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
-
-    /** \internal */
-    template
-    void _cacheFriendlyEvalKernel(DestDerived& res,
-      int l2i, int l2j, int l2k, int l1i,
-      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
-
   private:
 
     int _rows() const { return m_lhs.rows(); }
@@ -255,7 +297,7 @@ template class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller
@@ -269,21 +311,30 @@ template class Product : ei_no_assignm
     PacketScalar _packetCoeffRowMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
+      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.template packetCoeff(0, col));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
+        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.template packetCoeff(i, col), res);
       return res;
     }
 
     PacketScalar _packetCoeffColumnMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
+      res = ei_pmul(m_lhs.template packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
+        res = ei_pmadd(m_lhs.template packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
     }
 
+    /** \internal */
+    template
+    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
+
+    /** \internal */
+    template
+    void _cacheFriendlyEvalKernel(DestDerived& res,
+      int l2i, int l2j, int l2k, int l1i,
+      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
 
   protected:
     const LhsNested m_lhs;
@@ -361,7 +412,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res,
   int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows;
   const Scalar* localB = &block[offsetblock];
 
-  int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+//   int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+  Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j));
 
   // don't worry, dst is a set of registers
   PacketScalar dst[BlockRows];
@@ -383,7 +435,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res,
 
     // unaligned loads are expensive, therefore let's preload the next element in advance
     if (RhsAlignment==UnAligned)
-      tmp1 = ei_ploadu(&m_rhs.derived().data()[l1jsize+l2k]);
+      //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]);
+      tmp1 = ei_ploadu(&rhsColumn[l2k]);
 
     for(int k=l2k; k
@@ ... @@ ::_cacheFriendlyEvalKernel(DestDerived& res,
       //PacketScalar tmp = m_rhs.template packetCoeff(k, l1j);
       if (RhsAlignment==Aligned)
       {
-        tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        //tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        tmp = ei_pload(&rhsColumn[k]);
       }
       else
       {
         tmp = tmp1;
         if (k+PacketSize
@@ ... @@ ::_cacheFriendlyEvalKernel(DestDerived& res,
     // we have up to 4 packets (for doubles: 8 rows / 2)
     if (PacketRows>=1) res.template writePacketCoeff(l1i, l1j,
-      ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(&(dst[0]))));
+      ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(&(dst[0]))));
     if (PacketRows>=2) res.template writePacketCoeff(l1i+PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize]))));
     if (PacketRows>=3) res.template writePacketCoeff(l1i+2*PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize]))));
     if (PacketRows>=4) res.template writePacketCoeff(l1i+3*PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize]))));
     // process the remaining rows one at a time
     if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]);
@@ -450,38 +505,37 @@ template
 template
 void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
 {
-  // allow direct access to data for benchmark purpose
-  const Scalar* __restrict__ a = m_lhs.derived().data();
-  const Scalar* __restrict__ b = m_rhs.derived().data();
-  Scalar* __restrict__ c = res.derived().data();
-
   // FIXME find a way to optimize: (an_xpr) + (a * b)
   // then we don't need to clear res and avoid and additional mat-mat sum
-//   res.setZero();
+  #ifndef EIGEN_WIP_PRODUCT_DIRTY
+//   std::cout << "wip product\n";
+  res.setZero();
+  #endif
 
-  const int bs = PacketSize * MaxBlockRows; // total number of elements treated at once
   const int rows = _rows();
   const int cols = _cols();
   const int remainingSize = m_lhs.cols()%PacketSize;
   const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2blocksize = 256 > _cols() ? _cols() : 256;
-  // FIXME use calloca ?? (allocation on the stack)
-  Scalar* __restrict__ block = new Scalar[l2blocksize*size];
+  const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+  //Scalar* __restrict__ block = new Scalar[l2blocksize*size];;
+  Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
 
   // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<_rows(); l2i+=l2blocksize)
+  for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows)
   {
-    const int l2blockRowEnd = std::min(l2i+l2blocksize, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & 0xFFFFF8;                  // end of the rows aligned to bw
-    const int l2blockRowRemaining = l2blockRowEnd - l2blockRowEndBW;       // number of remaining rows
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;      // number of remaining rows
 
     // build a cache friendly block
     int count = 0;
 
     // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
             block[count++] = m_lhs.coeff(i+w,k+s);
         }
       }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
      {
         for (int k=l2k; k
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
     #if 0
     for(int l1j=l2j; l1j
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
             dst[1] = dst[0];
             dst[2] = dst[0];
             dst[3] = dst[0];
-            if (bw==8)
+            if (MaxBlockRows==8)
             {
               dst[4] = dst[0];
               dst[5] = dst[0];
@@ -543,7 +597,7 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
           // TODO in unaligned mode, preload the next element
          // PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]);
           asm("#eigen begincore");
-          for(int k=l2k; k
 (k, l1j);
           // TODO make this branching compile time (costly for doubles)
@@ -559,10 +613,10 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
             dst[1] = ei_pmadd(tmp, b1, dst[1]);
             b1 = ei_pload(&(localB[k*bw+3*ps]));
             dst[2] = ei_pmadd(tmp, b0, dst[2]);
-            if (bw==8)
+            if (MaxBlockRows==8)
              b0 = ei_pload(&(localB[k*bw+4*ps]));
             dst[3] = ei_pmadd(tmp, b1, dst[3]);
-            if (bw==8)
+            if (MaxBlockRows==8)
             {
               b1 = ei_pload(&(localB[k*bw+5*ps]));
               dst[4] = ei_pmadd(tmp, b0, dst[4]);
@@ -576,19 +630,20 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
 
           // if (resIsAligned)
           {
-            res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(dst)));
-            if (ps==2)
-              res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_predux(&(dst[2]))));
-            if (bw==8)
+            res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(dst)));
+            if (PacketSize==2)
+              res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_preduxp(&(dst[2]))));
+            if (MaxBlockRows==8)
             {
-              res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_predux(&(dst[4]))));
-              if (ps==2)
-                res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_predux(&(dst[6]))));
+              res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_preduxp(&(dst[4]))));
+              if (PacketSize==2)
+                res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_preduxp(&(dst[6]))));
             }
           }
 //           else
 //           {
 //             // TODO uncommenting this code kill the perf, even though it is never called !!
+//             // this is because dst cannot be a set of registers only
 //             // TODO optimize this loop
 //             // TODO is it better to do one redux at once or packet reduxes + unaligned store ?
 //             for (int w = 0; w
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
         }
     #endif
       }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
       {
         // this is an attempt to build an array of kernels, but I did not manage to get it compiles
 //         typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*);
 //         Kernel kernels[8];
 //         kernels[0] = (Kernel)(&Product::template _cacheFriendlyEvalKernel);
-//         kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
+//         kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
 
-        switch(l2blockRowRemaining)
+        switch(l2blockRemainingRows)
         {
           case 1:_cacheFriendlyEvalKernel(
                    res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
@@ -627,43 +682,6 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
           default: ei_internal_assert(false && "internal error"); break;
         }
-
-#if 0
-        // TODO optimize this part using a generic templated function that processes N rows
-        // here we process the remaining l2blockRowRemaining rows
-        for(int l1j=l2j; l1j
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
              NormalProduct>(
            m_lhs.block(0,size, _rows(), remainingSize),
            m_rhs.block(size,0, remainingSize, _cols())).lazy();
+//     res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols()));
   }
 
-  delete[] block;
+//   delete[] block;
 }
 
 #endif // EIGEN_PRODUCT_H
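Aside (not part of the patch): the blocking arithmetic used by _cacheFriendlyEvalImpl above can be summarized in a few lines. The helper computeBlocking and the constants kTuneForL2CacheSize / kMaxBlockRows below are hypothetical stand-ins for EIGEN_TUNE_FOR_L2_CACHE_SIZE and the MaxBlockRows / MaxBlockRows_ClampingMask enums introduced by the patch.

#include <algorithm>
#include <cstddef>

enum { kTuneForL2CacheSize = 1024, kMaxBlockRows = 8 };

template <typename Scalar>
void computeBlocking(int rows, int cols, int depth)
{
  // maximal size of the blocks fitted in L2 cache, in elements
  const int maxL2BlockSize = int(kTuneForL2CacheSize / sizeof(Scalar));

  // clamp each block dimension to the actual problem size
  const int l2BlockRows = std::min(maxL2BlockSize, rows);
  const int l2BlockCols = std::min(maxL2BlockSize, cols);
  const int l2BlockSize = std::min(maxL2BlockSize, depth); // third dimension of the product

  // round the row-block end down to a multiple of kMaxBlockRows;
  // for the row counts involved, & ~(kMaxBlockRows-1) is equivalent to
  // the 0xFFFFF8 mask (or 0xFFFFFC with 4 rows on i386) used in the patch
  const int blockRowEndBW = l2BlockRows & ~(kMaxBlockRows - 1);
  const int remainingRows = l2BlockRows - blockRowEndBW; // handled by the per-count kernels

  (void)l2BlockCols; (void)l2BlockSize; (void)blockRowEndBW; (void)remainingRows;
}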
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index c74d8f87b..48498bfae 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -43,16 +43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients
 const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1
 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0
 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0
+const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int)
 
 enum { Upper=NullLowerBit, Lower=NullUpperBit };
 enum { Aligned=0, UnAligned=1 };
 
 // list of flags that are lost by default
-const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
+const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit
+                                         | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
 
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct };
+enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct};
 
 #endif // EIGEN_CONSTANTS_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 613ab9617..f703d159a 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -45,6 +45,13 @@
 #define EIGEN_DEFAULT_MATRIX_FLAGS EIGEN_DEFAULT_MATRIX_STORAGE_ORDER
 
+/** Defines a size hint used when dealing with large matrices and L2 cache friendliness.
+  * More precisely, its square value represents the number of bytes which can be assumed to stay in L2 cache.
+  */
+#ifndef EIGEN_TUNE_FOR_L2_CACHE_SIZE
+#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 1024
+#endif
+
 #define USING_PART_OF_NAMESPACE_EIGEN \
 EIGEN_USING_MATRIX_TYPEDEFS \
 using Eigen::Matrix; \
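Since the new macro is wrapped in #ifndef, it can be overridden per project before Eigen is included. A hypothetical usage sketch (not part of the patch), assuming the Eigen/Core header and the USING_PART_OF_NAMESPACE_EIGEN macro from this tree; 2048 squared bytes corresponds to assuming 4 MiB of L2 cache:

#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 2048
#include <Eigen/Core>

USING_PART_OF_NAMESPACE_EIGEN

int main()
{
  MatrixXf a(512, 512), b(512, 512);
  a.setZero();
  b.setZero();
  MatrixXf c = a * b; // assumed large enough to take the cache-friendly product path
  return c.rows() > 0 ? 0 : 1;
}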