From 8328caa618731fb2a5802daaf8088db4175567a2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 25 Jan 2016 22:06:42 +0100 Subject: [PATCH] bug #51: add block preallocation mechanism to selfadjoit*matrix product. --- .../Core/products/SelfadjointMatrixMatrix.h | 46 ++++++++++--------- blas/level3_impl.h | 23 ++++++---- test/nomalloc.cpp | 2 +- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index f84f54982..ba8ee1d53 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -291,7 +291,7 @@ struct product_selfadjoint_matrix& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -314,7 +314,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = rows; @@ -340,17 +340,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); - // kc must smaller than mc + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc kc = (std::min)(kc,mc); + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; @@ -410,7 +409,7 @@ struct product_selfadjoint_matrix& blocking); }; template & blocking) { Index size = cols; @@ -432,14 +431,12 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc, 1); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; @@ -498,6 +495,11 @@ struct selfadjoint_product_impl Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); + internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), @@ -509,7 +511,7 @@ struct selfadjoint_product_impl &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + actualAlpha, blocking // alpha ); } }; diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 835e53680..b2772b190 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -275,9 +275,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? (*m) : (*n); #if ISCOMPLEX // FIXME add support for symmetric complex matrix - int size = (SIDE(*side)==LEFT) ? (*m) : (*n); Matrix matA(size,size); if(UPLO(*uplo)==UP) { @@ -294,13 +294,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa else if(SIDE(*side)==RIGHT) matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA; #else + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; else if(SIDE(*side)==RIGHT) - if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); + else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; else return 0; @@ -488,20 +490,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa return 1; } + int size = (SIDE(*side)==LEFT) ? (*m) : (*n); + internal::gemm_blocking_space blocking(*m,*n,size,1,false); + if(SIDE(*side)==LEFT) { if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking); else return 0; } else if(SIDE(*side)==RIGHT) { if(UPLO(*uplo)==UP) matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView();/*internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/ + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/ else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix - ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking); else return 0; } else diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index d85e9e5bc..077ecae55 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -85,7 +85,7 @@ template void nomalloc(const MatrixType& m) m2.template selfadjointView().rankUpdate(m1); m2 += m2.template triangularView() * m1; m2.template triangularView() = m2 * m2; -// m1 += m1.template selfadjointView() * m2; + m1 += m1.template selfadjointView() * m2; VERIFY_IS_APPROX(m2,m2); }