Speed up Eigen matrix*vector and vector*matrix multiplication.

This change speeds up Eigen matrix * vector and vector * matrix multiplication for dynamic-size matrices when it is only known at runtime that one of the factors is a vector: instead of running the general GEMM path, such products are now forwarded to the specialized GEMV kernels.
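For illustration only (this snippet is not part of the commit; types and sizes are arbitrary), the case this targets looks like:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXf A = Eigen::MatrixXf::Random(512, 512);

      // When the factor is statically typed as a vector, Eigen already picks
      // the GEMV kernel at compile time.
      Eigen::VectorXf v = Eigen::VectorXf::Random(512);
      Eigen::VectorXf w = A * v;

      // When it is a dynamic matrix that merely happens to have one column,
      // the product is compiled through the general GEMM path; this change
      // detects the n-by-1 shape at runtime and forwards it to GEMV.
      Eigen::MatrixXf x = Eigen::MatrixXf::Random(512, 1);
      Eigen::MatrixXf y(512, 1);
      y.noalias() = A * x;
      return 0;
    }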

The benchmarks below test

    c.noalias() = n_by_n_matrix * n_by_1_matrix;
    c.noalias() = 1_by_n_matrix * n_by_n_matrix;

respectively.
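The BM_MatVec/BM_VecMat sources are not included in this commit; a rough sketch of what they presumably look like, assuming Google Benchmark and single-precision dynamic matrices, is:

    #include <benchmark/benchmark.h>
    #include <Eigen/Dense>

    // Hypothetical reconstruction: both factors are dynamic matrices, one of
    // which is a vector only at runtime.
    static void BM_MatVec(benchmark::State& state) {
      const int n = state.range(0);
      Eigen::MatrixXf a = Eigen::MatrixXf::Random(n, n);
      Eigen::MatrixXf b = Eigen::MatrixXf::Random(n, 1);  // n-by-1, but still MatrixXf
      Eigen::MatrixXf c(n, 1);
      for (auto _ : state) {
        c.noalias() = a * b;
        benchmark::DoNotOptimize(c.data());
      }
    }
    BENCHMARK(BM_MatVec)->RangeMultiplier(2)->Range(64, 4096);

    static void BM_VecMat(benchmark::State& state) {
      const int n = state.range(0);
      Eigen::MatrixXf a = Eigen::MatrixXf::Random(1, n);  // 1-by-n, but still MatrixXf
      Eigen::MatrixXf b = Eigen::MatrixXf::Random(n, n);
      Eigen::MatrixXf c(1, n);
      for (auto _ : state) {
        c.noalias() = a * b;
        benchmark::DoNotOptimize(c.data());
      }
    }
    BENCHMARK(BM_VecMat)->RangeMultiplier(2)->Range(32, 4096);

    BENCHMARK_MAIN();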

Benchmark measurements:

SSE:
Run on *** (72 X 2992 MHz CPUs); 2019-01-28T17:51:44.452697457-08:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_MatVec/64                            1096       312    +71.5%
BM_MatVec/128                           4581      1464    +68.0%
BM_MatVec/256                          18534      5710    +69.2%
BM_MatVec/512                         118083     24162    +79.5%
BM_MatVec/1k                          704106    173346    +75.4%
BM_MatVec/2k                         3080828    742728    +75.9%
BM_MatVec/4k                        25421512   4530117    +82.2%
BM_VecMat/32                             352       130    +63.1%
BM_VecMat/64                            1213       425    +65.0%
BM_VecMat/128                           4640      1564    +66.3%
BM_VecMat/256                          17902      5884    +67.1%
BM_VecMat/512                          70466     24000    +65.9%
BM_VecMat/1k                          340150    161263    +52.6%
BM_VecMat/2k                         1420590    645576    +54.6%
BM_VecMat/4k                         8083859   4364327    +46.0%

AVX2:
Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:45:11.508545307-08:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_MatVec/64                             619       120    +80.6%
BM_MatVec/128                           9693       752    +92.2%
BM_MatVec/256                          38356      2773    +92.8%
BM_MatVec/512                          69006     12803    +81.4%
BM_MatVec/1k                          443810    160378    +63.9%
BM_MatVec/2k                         2633553    646594    +75.4%
BM_MatVec/4k                        16211095   4327148    +73.3%
BM_VecMat/64                             925       227    +75.5%
BM_VecMat/128                           3438       830    +75.9%
BM_VecMat/256                          13427      2936    +78.1%
BM_VecMat/512                          53944     12473    +76.9%
BM_VecMat/1k                          302264    157076    +48.0%
BM_VecMat/2k                         1396811    675778    +51.6%
BM_VecMat/4k                         8962246   4459010    +50.2%

AVX512:
Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:35:17.239329863-08:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_MatVec/64                             401       111    +72.3%
BM_MatVec/128                           1846       513    +72.2%
BM_MatVec/256                          36739      1927    +94.8%
BM_MatVec/512                          54490      9227    +83.1%
BM_MatVec/1k                          487374    161457    +66.9%
BM_MatVec/2k                         2016270    643824    +68.1%
BM_MatVec/4k                        13204300   4077412    +69.1%
BM_VecMat/32                             324       106    +67.3%
BM_VecMat/64                            1034       246    +76.2%
BM_VecMat/128                           3576       802    +77.6%
BM_VecMat/256                          13411      2561    +80.9%
BM_VecMat/512                          58686     10037    +82.9%
BM_VecMat/1k                          320862    163750    +49.0%
BM_VecMat/2k                         1406719    651397    +53.7%
BM_VecMat/4k                         7785179   4124677    +47.0%
Author: Rasmus Munk Larsen
Date:   2019-01-31 14:24:08 -08:00
parent  7ef879f6bf
commit  4c0fa6ce0f

@@ -404,13 +404,13 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
namespace internal {
template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
{
template <typename Lhs, typename Rhs, typename Dest,
bool MultipleRowsAtCompileTime =
(Lhs::RowsAtCompileTime > 1 || Dest::RowsAtCompileTime > 1),
bool MultipleColsAtCompileTime =
(Rhs::ColsAtCompileTime > 1 || Dest::ColsAtCompileTime > 1)>
struct gemm_selector {
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
@@ -420,10 +420,130 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{
if (a_rhs.cols() != 1 && a_lhs.rows() != 1) {
gemm_selector<Lhs, Rhs, Dest, true, true>::run(dst, a_lhs, a_rhs, alpha);
} else if (a_rhs.cols() == 1) {
// matrix * vector.
internal::gemv_dense_selector<OnTheRight,
(int(ActualLhsTypeCleaned::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<ActualLhsTypeCleaned>::HasUsableDirectAccess)
>::run(a_lhs, a_rhs.col(0), dst, alpha);
} else {
// vector * matrix.
internal::gemv_dense_selector<OnTheLeft,
(int(ActualRhsTypeCleaned::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<ActualRhsTypeCleaned>::HasUsableDirectAccess)
>::run(a_lhs.row(0), a_rhs, dst, alpha);
}
}
};
template <typename Lhs, typename Rhs, typename Dest>
struct gemm_selector<Lhs, Rhs, Dest, true, false> {
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{
if (a_rhs.cols() != 1 && a_lhs.rows() != 1) {
gemm_selector<Lhs, Rhs, Dest, true, true>::run(dst, a_lhs, a_rhs, alpha);
} else {
// matrix * vector.
internal::gemv_dense_selector<OnTheRight,
(int(ActualLhsTypeCleaned::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<ActualLhsTypeCleaned>::HasUsableDirectAccess)
>::run(a_lhs, a_rhs.col(0), dst, alpha);
}
}
};
template <typename Lhs, typename Rhs, typename Dest>
struct gemm_selector<Lhs, Rhs, Dest, false, true> {
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{
if (a_rhs.cols() != 1 && a_lhs.rows() != 1) {
gemm_selector<Lhs, Rhs, Dest, true, true>::run(dst, a_lhs, a_rhs, alpha);
} else {
// vector * matrix.
internal::gemv_dense_selector<OnTheLeft,
(int(ActualRhsTypeCleaned::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<ActualRhsTypeCleaned>::HasUsableDirectAccess)
>::run(a_lhs.row(0), a_rhs, dst, alpha);
}
}
};
template <typename Lhs, typename Rhs, typename Dest>
struct gemm_selector<Lhs, Rhs, Dest, true, true> {
typedef typename Product<Lhs, Rhs>::Scalar Scalar;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef
typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef
typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(
Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime)
};
static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs,
const Scalar& alpha) {
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) *
RhsBlasTraits::extractScalarFactor(a_rhs);
typename internal::add_const_on_value_type<ActualLhsType>::type lhs =
LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs =
RhsBlasTraits::extract(a_rhs);
typedef internal::gemm_blocking_space<
(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar,
Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime,
MaxDepthAtCompileTime>
BlockingType;
typedef internal::gemm_functor<
Scalar, Index,
internal::general_matrix_matrix_product<
Index, LhsScalar,
(ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor,
bool(LhsBlasTraits::NeedToConjugate), RhsScalar,
(ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor,
bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType>
GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 ||
Dest::MaxRowsAtCompileTime == Dynamic)>(
GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(),
a_rhs.cols(), a_lhs.cols(), Dest::Flags & RowMajorBit);
}
};
template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
{
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
template<typename Dst>
@@ -450,7 +570,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
scaleAndAddTo(dst, lhs, rhs, Scalar(1));
}
template<typename Dst>
@@ -469,27 +589,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
return;
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
typedef internal::gemm_functor<
Scalar, Index,
internal::general_matrix_matrix_product<
Index,
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
gemm_selector<Lhs, Rhs, Dest>::run(dst, a_lhs, a_rhs, alpha);
}
};