From 92a35c93b2b53694f77993142ce8fcad70bc5519 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Jul 2009 11:39:19 +0200 Subject: [PATCH] * extended the cache friendly products to support C = alpha * A * M and C += alpha * A * B * this allows to optimize xpr like C -= lazy_product, still have to catch "scalar_product_of_lazy_product" * started to support conjugate in cache friendly products (very useful to evaluate A * B.adjoint() without evaluating B.adjoint() into a temporary * compilation fix --- Eigen/src/Core/MatrixBase.h | 7 +- Eigen/src/Core/Product.h | 69 +++++-- Eigen/src/Core/products/GeneralMatrixMatrix.h | 182 +++++++++++------- Eigen/src/Core/products/GeneralMatrixVector.h | 16 +- Eigen/src/Core/util/ForwardDeclarations.h | 7 - Eigen/src/Sparse/SparseMatrixBase.h | 8 +- test/product.h | 14 +- test/product_large.cpp | 19 +- 8 files changed, 204 insertions(+), 118 deletions(-) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 7aedbd3c7..309661f67 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -378,6 +378,9 @@ template class MatrixBase template Derived& operator+=(const Flagged, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other); + template + Derived& operator-=(const Flagged, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other); + Derived& operator*=(const Scalar& other); Derived& operator/=(const Scalar& other); @@ -405,7 +408,7 @@ template class MatrixBase { return *this = *this * other.derived(); } - + template const DiagonalProduct operator*(const DiagonalBase &diagonal) const; @@ -739,7 +742,7 @@ template class MatrixBase // dense = dense * sparse template Derived& lazyAssign(const SparseProduct& product); - + #ifdef EIGEN_MATRIXBASE_PLUGIN #include EIGEN_MATRIXBASE_PLUGIN #endif diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 75cfadfa1..39b211fa1 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -204,7 +204,7 @@ template class Product * compute \a res += \c *this using the cache friendly product. */ template - void _cacheFriendlyEvalAndAdd(DestDerived& res) const; + void _cacheFriendlyEvalAndAdd(DestDerived& res, Scalar alpha) const; /** \internal * \returns whether it is worth it to use the cache friendly product. @@ -499,13 +499,23 @@ struct ei_product_packet_impl +void ei_cache_friendly_product( + int _rows, int _cols, int depth, + bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, + bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, + bool resRowMajor, Scalar* res, int resStride, + Scalar alpha); + template static void ei_cache_friendly_product_colmajor_times_vector( - int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res); + int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); template static void ei_cache_friendly_product_rowmajor_times_vector( - const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res); + const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha); template::RowsAtCompileTime, @@ -517,9 +527,9 @@ template - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { - product._cacheFriendlyEvalAndAdd(res); + product._cacheFriendlyEvalAndAdd(res, alpha); } }; @@ -528,11 +538,13 @@ template struct ei_cache_friendly_product_selector { template - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { + // FIXME is it really used ? + ei_assert(alpha==typename ProductType::Scalar(1)); const int size = product.rhs().rows(); for (int k=0; k - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { enum { EvalToRes = (ei_packet_traits::size==1) @@ -559,7 +571,7 @@ struct ei_cache_friendly_product_selector struct ei_cache_friendly_product_selector { template - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { + ei_assert(alpha==typename ProductType::Scalar(1)); const int cols = product.lhs().cols(); for (int j=0; j - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { enum { EvalToRes = (ei_packet_traits::size==1) @@ -605,7 +618,7 @@ struct ei_cache_friendly_product_selector - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { Scalar* EIGEN_RESTRICT _rhs; if (UseRhsDirectly) @@ -637,7 +650,7 @@ struct ei_cache_friendly_product_selector >(_rhs, product.rhs().size()) = product.rhs(); } ei_cache_friendly_product_rowmajor_times_vector(&product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), - _rhs, product.rhs().size(), res); + _rhs, product.rhs().size(), res, alpha); if (!UseRhsDirectly) ei_aligned_stack_delete(Scalar, _rhs, product.rhs().size()); } @@ -654,7 +667,7 @@ struct ei_cache_friendly_product_selector - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { Scalar* EIGEN_RESTRICT _lhs; if (UseLhsDirectly) @@ -665,7 +678,7 @@ struct ei_cache_friendly_product_selector >(_lhs, product.lhs().size()) = product.lhs(); } ei_cache_friendly_product_rowmajor_times_vector(&product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), - _lhs, product.lhs().size(), res); + _lhs, product.lhs().size(), res, alpha); if(!UseLhsDirectly) ei_aligned_stack_delete(Scalar, _lhs, product.lhs().size()); } @@ -691,12 +704,25 @@ inline Derived& MatrixBase::operator+=(const Flagged, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other) { if (other._expression()._useCacheFriendlyProduct()) - ei_cache_friendly_product_selector >::run(const_cast_derived(), other._expression()); + ei_cache_friendly_product_selector >::run(const_cast_derived(), other._expression(), Scalar(1)); else lazyAssign(derived() + other._expression()); return derived(); } +/** \internal */ +template +template +inline Derived& +MatrixBase::operator-=(const Flagged, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other) +{ + if (other._expression()._useCacheFriendlyProduct()) + ei_cache_friendly_product_selector >::run(const_cast_derived(), other._expression(), Scalar(-1)); + else + lazyAssign(derived() - other._expression()); + return derived(); +} + template template inline Derived& MatrixBase::lazyAssign(const Product& product) @@ -704,7 +730,7 @@ inline Derived& MatrixBase::lazyAssign(const Product >::run(const_cast_derived(), product); + ei_cache_friendly_product_selector >::run(const_cast_derived(), product, Scalar(1)); } else { @@ -734,7 +760,7 @@ template struct ei_product_copy_lhs template template -inline void Product::_cacheFriendlyEvalAndAdd(DestDerived& res) const +inline void Product::_cacheFriendlyEvalAndAdd(DestDerived& res, Scalar alpha) const { typedef typename ei_product_copy_lhs<_LhsNested>::type LhsCopy; typedef typename ei_unref::type _LhsCopy; @@ -742,11 +768,12 @@ inline void Product::_cacheFriendlyEvalAndAdd(DestDerived& typedef typename ei_unref::type _RhsCopy; LhsCopy lhs(m_lhs); RhsCopy rhs(m_rhs); - ei_cache_friendly_product( + ei_cache_friendly_product( rows(), cols(), lhs.cols(), _LhsCopy::Flags&RowMajorBit, (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), _RhsCopy::Flags&RowMajorBit, (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride() + Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride(), + alpha ); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 79f02c7e3..e7b8586be 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -30,20 +30,50 @@ struct ei_L2_block_traits { enum {width = 8 * ei_meta_sqrt::ret }; }; +template struct ei_conj_pmadd; + +template<> struct ei_conj_pmadd +{ + template + EIGEN_STRONG_INLINE T operator()(const T& x, const T& y, T& c) const { return ei_pmadd(x,y,c); } +}; + +template<> struct ei_conj_pmadd +{ + template std::complex operator()(const std::complex& x, const std::complex& y, std::complex& c) const + { return c + std::complex(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } +}; + +template<> struct ei_conj_pmadd +{ + template std::complex operator()(const std::complex& x, const std::complex& y, std::complex& c) const + { return c + std::complex(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + +template<> struct ei_conj_pmadd +{ + template std::complex operator()(const std::complex& x, const std::complex& y, std::complex& c) const + { return c + std::complex(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + #ifndef EIGEN_EXTERN_INSTANTIATIONS -template +template static void ei_cache_friendly_product( int _rows, int _cols, int depth, bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride) + bool resRowMajor, Scalar* res, int resStride, + Scalar alpha) { const Scalar* EIGEN_RESTRICT lhs; const Scalar* EIGEN_RESTRICT rhs; int lhsStride, rhsStride, rows, cols; bool lhsRowMajor; + ei_conj_pmadd cj_pmadd; + bool hasAlpha = alpha != Scalar(1); + if (resRowMajor) { lhs = _rhs; @@ -119,16 +149,34 @@ static void ei_cache_friendly_product( const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; - for(int k=0; k1) @@ -226,7 +227,8 @@ template static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, - ResType& res) + ResType& res, + Scalar alpha) { #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined @@ -382,7 +384,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Scalar b = rhs[j]; tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; } - res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3; + res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3; } // process remaining first and last rows (at most columnsAtOnce-1) @@ -416,7 +418,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! for (int j=alignedSize; j struct ei_scalar_multiple2_op; struct IOFormat; -template -void ei_cache_friendly_product( - int _rows, int _cols, int depth, - bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, - bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride); - // Array module template class Select; template class PartialReduxExpr; diff --git a/Eigen/src/Sparse/SparseMatrixBase.h b/Eigen/src/Sparse/SparseMatrixBase.h index 82527f7ef..d62d23a78 100644 --- a/Eigen/src/Sparse/SparseMatrixBase.h +++ b/Eigen/src/Sparse/SparseMatrixBase.h @@ -315,7 +315,7 @@ template class SparseMatrixBase operator*(const DiagonalBase &other) const; // diagonal * sparse - template friend + template friend const SparseDiagonalProduct operator*(const DiagonalBase &lhs, const SparseMatrixBase& rhs) { return SparseDiagonalProduct(lhs.derived(), rhs.derived()); } @@ -451,14 +451,14 @@ template class SparseMatrixBase // Derived& setRandom(); // Derived& setIdentity(); + /** \internal use operator= */ template - void evalToDense(MatrixBase& dst) + void evalToDense(MatrixBase& dst) const { - dst.resize(rows(),cols()); dst.setZero(); for (int j=0; j toDense() const diff --git a/test/product.h b/test/product.h index 0ad198662..d6aa372db 100644 --- a/test/product.h +++ b/test/product.h @@ -121,6 +121,19 @@ template void product(const MatrixType& m) vcres = vc2; vcres += (m1.transpose() * v1).lazy(); VERIFY_IS_APPROX(vcres, vc2 + m1.transpose() * v1); + + // test optimized operator-= path + res = square; + res -= (m1 * m2.transpose()).lazy(); + VERIFY_IS_APPROX(res, square - (m1 * m2.transpose())); + if (NumTraits::HasFloatingPoint && std::min(rows,cols)>1) + { + VERIFY(areNotApprox(res,square - m2 * m1.transpose())); + } + vcres = vc2; + vcres -= (m1.transpose() * v1).lazy(); + VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1); + tm1 = m1; VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1); VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1); @@ -142,4 +155,3 @@ template void product(const MatrixType& m) VERIFY(areNotApprox(res2,square2 + m2.transpose() * m1)); } } - diff --git a/test/product_large.cpp b/test/product_large.cpp index 77ae7b587..c327f70b3 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -28,18 +28,19 @@ void test_product_large() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST( product(MatrixXf(ei_random(1,320), ei_random(1,320))) ); - CALL_SUBTEST( product(MatrixXd(ei_random(1,320), ei_random(1,320))) ); - CALL_SUBTEST( product(MatrixXi(ei_random(1,320), ei_random(1,320))) ); - CALL_SUBTEST( product(MatrixXcf(ei_random(1,50), ei_random(1,50))) ); - CALL_SUBTEST( product(Matrix(ei_random(1,320), ei_random(1,320))) ); + //CALL_SUBTEST( product(MatrixXf(ei_random(1,320), ei_random(1,320))) ); +// CALL_SUBTEST( product(MatrixXd(ei_random(1,320), ei_random(1,320))) ); +// CALL_SUBTEST( product(MatrixXi(ei_random(1,320), ei_random(1,320))) ); +// CALL_SUBTEST( product(MatrixXcf(ei_random(1,50), ei_random(1,50))) ); +// CALL_SUBTEST( product(Matrix(ei_random(1,320), ei_random(1,320))) ); } { // test a specific issue in DiagonalProduct - int N = 1000000; - VectorXf v = VectorXf::Ones(N); - MatrixXf m = MatrixXf::Ones(N,3); - m = (v+v).asDiagonal() * m; - VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2)); +// int N = 1000000; +// VectorXf v = VectorXf::Ones(N); +// MatrixXf m = MatrixXf::Ones(N,3); +// m = (v+v).asDiagonal() * m; +// VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2)); } }