diff --git a/Eigen/Core b/Eigen/Core index ee9bfe325..ea871dca3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -180,6 +180,7 @@ namespace Eigen { #include "src/Core/products/GeneralMatrixMatrix.h" #include "src/Core/products/GeneralMatrixVector.h" #include "src/Core/products/SelfadjointMatrixVector.h" +#include "src/Core/products/SelfadjointMatrixMatrix.h" #include "src/Core/TriangularMatrix.h" #include "src/Core/SelfAdjointView.h" #include "src/Core/SolveTriangular.h" diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index fe3e877e1..30fa4aa66 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -25,6 +25,15 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H +template +struct ei_gebp_kernel; + +template +struct ei_gemm_pack_rhs; + +template +struct ei_gemm_pack_lhs; + template struct ei_L2_block_traits { enum {width = 8 * ei_meta_sqrt::ret }; @@ -124,10 +133,6 @@ static void ei_cache_friendly_product( typedef typename ei_packet_traits::type PacketType; - - -#ifndef EIGEN_USE_ALT_PRODUCT - enum { PacketSize = sizeof(PacketType)/sizeof(Scalar), #if (defined __i386__) @@ -166,398 +171,346 @@ static void ei_cache_friendly_product( // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - { - int count = 0; - for(int j2=0; j2()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); // => GEPP_VAR1 for(int i2=0; i2()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); - // GEBP - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2 >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); } } ei_aligned_stack_delete(Scalar, blockA, kc*mc); ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); +} - - -#else // alternate product from cylmor - - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - // i386 architecture provides only 8 xmm registers, - // so let's reduce the max number of rows processed at once. - MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, - #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, - #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = ei_L2_block_traits::width - }; - - const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - - const int remainingSize = depth % PacketSize; - const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries - - const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128; - const int l2BlockSize = MaxL2BlockSize > size ? size : 256; - const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; - const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); - - Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size]; -// for(int i=0; i +struct ei_gebp_kernel +{ + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols) { - for(int l2i=0; l2i do the same but with nr==1 + for(int j2=packet_cols; j2 +struct ei_gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) + { + int count = 0; + const int peeled_mc = (actual_mc/mr)*mr; + if (lhsRowMajor) + { + for(int i=0; i +struct ei_gemm_pack_rhs +{ + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols) + { + int count = 0; + for(int j2=0; j2 +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H + +template +struct ei_symm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) + { + int count = 0; + const int peeled_mc = (actual_mc/mr)*mr; + if (lhsRowMajor) + { + for(int i=0; i +static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix( + int size, + const Scalar* lhs, int lhsStride, + const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols, + Scalar* res, int resStride, + Scalar alpha) +{ + typedef typename ei_packet_traits::type Packet; + + ei_conj_helper cj; + if (ConjugateRhs) + alpha = ei_conj(alpha); + bool hasAlpha = alpha != Scalar(1); + + typedef typename ei_packet_traits::type PacketType; + + const bool lhsRowMajor = (StorageOrder==RowMajor); + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + HalfRegisterCount = 4, + #else + HalfRegisterCount = 8, + #endif + + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = ei_L2_block_traits::width, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; + + int kc = std::min(Max_kc,size); // cache block size along the K direction + int mc = std::min(Max_mc,size); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (cols/nr)*nr; + + for(int k2=0; k2()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); + + // the select lhs's panel has to be split in three different parts: + // 1 - the transposed panel above the diagonal block => transposed packed copy + // 2 - the diagonal block => special packed copy + // 3 - the panel below the diagonal block => generic packed copy + for(int i2=0; i2()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2); + + ei_gebp_kernel >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + } + // the block diagonal + { + const int actual_mc = std::min(k2+kc,size)-k2; + // symmetric packed copy + ei_symm_pack_lhs()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2); + ei_gebp_kernel >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); + } + + for(int i2=k2+kc; i2()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); + ei_gebp_kernel >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + } + } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); +} + + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp index 29fbf11bf..814672696 100644 --- a/test/product_selfadjoint.cpp +++ b/test/product_selfadjoint.cpp @@ -46,9 +46,9 @@ template void product_selfadjoint(const MatrixType& m) Scalar s1 = ei_random(), s2 = ei_random(), s3 = ei_random(); - + m1 = m1.adjoint()*m1; - + // lower m2.setZero(); m2.template triangularView() = m1; @@ -68,7 +68,7 @@ template void product_selfadjoint(const MatrixType& m) m2 = m1.template triangularView(); m2.template selfadjointView().rank2update(v1,v2); VERIFY_IS_APPROX(m2, (m1 + v1 * v2.adjoint()+ v2 * v1.adjoint()).template triangularView().toDense()); - + m2 = m1.template triangularView(); m2.template selfadjointView().rank2update(-v1,s2*v2,s3); VERIFY_IS_APPROX(m2, (m1 + (-s2*s3) * (v1 * v2.adjoint()+ v2 * v1.adjoint())).template triangularView().toDense()); @@ -99,4 +99,24 @@ void test_product_selfadjoint() CALL_SUBTEST( product_selfadjoint(Matrix(17,17)) ); CALL_SUBTEST( product_selfadjoint(Matrix,Dynamic,Dynamic,RowMajor>(19, 19)) ); } + + for(int i = 0; i < g_repeat ; i++) + { + int size = ei_random(10,1024); + int cols = ei_random(10,320); + MatrixXf A = MatrixXf::Random(size,size); + MatrixXf B = MatrixXf::Random(size,cols); + MatrixXf C = MatrixXf::Random(size,cols); + MatrixXf R = MatrixXf::Random(size,cols); + A = (A+A.transpose()).eval(); + + R = C + (A * B).eval(); + + A.corner(TopRight,size-1,size-1).triangularView().setZero(); + + ei_product_selfadjoint_matrix + (size, A.data(), A.stride(), B.data(), B.stride(), false, B.cols(), C.data(), C.stride(), 1); +// std::cerr << A << "\n\n" << C << "\n\n" << R << "\n\n"; + VERIFY_IS_APPROX(C,R); + } }