Speedup PartialPivLU for small matrices by passing compile-time sizes when available.

This change set also makes a better use of Map<>+OuterStride and Ref<> yielding surprising speed up for small dynamic sizes as well.
The table below reports times in micro seconds for 10 random matrices:
           | ------ float --------- | ------- double ------- |
     size  | before   after  ratio  |  before   after  ratio |
fixed	  1	 | 0.34     0.11   2.93   |  0.35     0.11   3.06  |
fixed	  2	 | 0.81     0.24   3.38   |  0.91     0.25   3.60  |
fixed	  3	 | 1.49     0.49   3.04   |  1.68     0.55   3.01  |
fixed	  4	 | 2.31     0.70   3.28   |  2.45     1.08   2.27  |
fixed	  5	 | 3.49     1.11   3.13   |  3.84     2.24   1.71  |
fixed	  6	 | 4.76     1.64   2.88   |  4.87     2.84   1.71  |
dyn     1	 | 0.50     0.40   1.23   |  0.51     0.40   1.26  |
dyn     2	 | 1.08     0.85   1.27   |  1.04     0.69   1.49  |
dyn     3	 | 1.76     1.26   1.40   |  1.84     1.14   1.60  |
dyn     4	 | 2.57     1.75   1.46   |  2.67     1.66   1.60  |
dyn     5	 | 3.80     2.64   1.43   |  4.00     2.48   1.61  |
dyn     6	 | 5.06     3.43   1.47   |  5.15     3.21   1.60  |
This commit is contained in:
Gael Guennebaud 2019-02-11 13:58:24 +01:00
parent 21eb97d3e0
commit ab6e6edc32

View File

@ -331,17 +331,15 @@ PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
namespace internal { namespace internal {
/** \internal This is the blocked version of fullpivlu_unblocked() */ /** \internal This is the blocked version of fullpivlu_unblocked() */
template<typename Scalar, int StorageOrder, typename PivIndex> template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic>
struct partial_lu_impl struct partial_lu_impl
{ {
// FIXME add a stride to Map, so that the following mapping becomes easier, static const int UnBlockedBound = 16;
// another option would be to create an expression being able to automatically static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound;
// warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
// a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
// and Block. typedef Ref<MatrixType> MatrixTypeRef;
typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU; typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::RealScalar RealScalar;
/** \internal performs the LU decomposition in-place of the matrix \a lu /** \internal performs the LU decomposition in-place of the matrix \a lu
@ -354,7 +352,7 @@ struct partial_lu_impl
* *
* \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
*/ */
static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
{ {
typedef scalar_score_coeff_op<Scalar> Scoring; typedef scalar_score_coeff_op<Scalar> Scoring;
typedef typename Scoring::result_type Score; typedef typename Scoring::result_type Score;
@ -417,13 +415,12 @@ struct partial_lu_impl
*/ */
static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
{ {
MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride));
MatrixType lu(lu1,0,0,rows,cols);
const Index size = (std::min)(rows,cols); const Index size = (std::min)(rows,cols);
// if the matrix is too small, no blocking: // if the matrix is too small, no blocking:
if(size<=16) if(UnBlockedAtCompileTime || size<=UnBlockedBound)
{ {
return unblocked_lu(lu, row_transpositions, nb_transpositions); return unblocked_lu(lu, row_transpositions, nb_transpositions);
} }
@ -449,12 +446,12 @@ struct partial_lu_impl
// A00 | A01 | A02 // A00 | A01 | A02
// lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12
// A20 | A21 | A22 // A20 | A21 | A22
BlockType A_0(lu,0,0,rows,k); BlockType A_0 = lu.block(0,0,rows,k);
BlockType A_2(lu,0,k+bs,rows,tsize); BlockType A_2 = lu.block(0,k+bs,rows,tsize);
BlockType A11(lu,k,k,bs,bs); BlockType A11 = lu.block(k,k,bs,bs);
BlockType A12(lu,k,k+bs,bs,tsize); BlockType A12 = lu.block(k,k+bs,bs,tsize);
BlockType A21(lu,k+bs,k,trows,bs); BlockType A21 = lu.block(k+bs,k,trows,bs);
BlockType A22(lu,k+bs,k+bs,trows,tsize); BlockType A22 = lu.block(k+bs,k+bs,trows,tsize);
PivIndex nb_transpositions_in_panel; PivIndex nb_transpositions_in_panel;
// recursively call the blocked LU algorithm on [A11^T A21^T]^T // recursively call the blocked LU algorithm on [A11^T A21^T]^T
@ -497,7 +494,9 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t
eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
partial_lu_impl partial_lu_impl
<typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex> < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor,
typename TranspositionType::StorageIndex,
EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)>
::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
} }