From d428a199ab70bc08db7551457a1e9d8f65d9ebb9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Jul 2018 11:41:09 +0200 Subject: [PATCH] bug #1562: optimize evaluation of small products of the form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary. Measured speedup from 2x to 5x... --- Eigen/src/Core/ProductEvaluators.h | 28 ++++++++++++++++++- Eigen/src/Core/products/GeneralMatrixMatrix.h | 8 +++--- test/product.h | 11 ++++++++ test/product_large.cpp | 2 ++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 60de771b2..8072a1959 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -396,7 +396,7 @@ struct generic_product_impl // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } - + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { @@ -410,6 +410,32 @@ struct generic_product_impl // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } + + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. + // For them, this strategy is also faster than simply by-passing the heap allocation through + // stack allocation. + // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + } + + // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + // template // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index ed4d3182b..dc47cf7cf 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -431,10 +431,10 @@ struct generic_product_impl // to determine the following heuristic. // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, // unless it has been specialized by the user or for a given architecture. - // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. // I'm not sure it is still required. if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::evalTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -446,7 +446,7 @@ struct generic_product_impl static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::addTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -455,7 +455,7 @@ struct generic_product_impl static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())0) - lazyproduct::subTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } diff --git a/test/product.h b/test/product.h index 0425a929e..d26e8063d 100644 --- a/test/product.h +++ b/test/product.h @@ -111,6 +111,17 @@ template void product(const MatrixType& m) vcres.noalias() -= m1.transpose() * v1; VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1); + // test scaled products + res = square; + res.noalias() = s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() += s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose())); + res = square; + res.noalias() -= s1 * m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose())); + // test d ?= a+b*c rules res.noalias() = square + m1 * m2.transpose(); VERIFY_IS_APPROX(res, square + m1 * m2.transpose()); diff --git a/test/product_large.cpp b/test/product_large.cpp index 845cd40ca..14a4f739d 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -35,6 +35,8 @@ void test_product_large() for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_2( product(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_2( product(MatrixXd(internal::random(1,10), internal::random(1,10))) ); + CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );