more product refactoring

parent 6b2ab13ac5
commit 56d00779db

Eigen/Core: 14 lines changed
@@ -138,7 +138,7 @@ namespace Eigen {
#endif

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#include "src/Core/Functors.h"
@@ -180,18 +180,20 @@ namespace Eigen {
#include "src/Core/util/BlasUtil.h"
#include "src/Core/ProductBase.h"
#include "src/Core/Product.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/SolveTriangular.h"
#include "src/Core/products/GeneralUnrolled.h"
#include "src/Core/products/GeneralBlockPanelKernel.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/products/SelfadjointProduct.h"
#include "src/Core/products/SelfadjointRank2Update.h"
#include "src/Core/products/TriangularMatrixVector.h"
#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/products/TriangularMatrixMatrix.h"
#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/BandMatrix.h"

} // namespace Eigen
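The only functional change in the first hunk is the default EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD dropping from 16 to 8. Since the define is wrapped in #ifndef, a user translation unit can still pick its own value before including the header; a minimal sketch (the value 32 is arbitrary, for illustration only):

// Hypothetical user code: override the threshold before including Eigen.
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 32
#include <Eigen/Core>

int main() { return 0; }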
@@ -63,23 +63,22 @@ template<typename Lhs, typename Rhs> struct ei_product_type
    Cols  = Rhs::ColsAtCompileTime,
    Depth = EIGEN_ENUM_MIN(Lhs::ColsAtCompileTime,Rhs::RowsAtCompileTime),

    value = ei_product_type_selector<(Rows>8 ? Large : (Rows==1 ? 1 : Small)),
                                     (Cols>8 ? Large : (Cols==1 ? 1 : Small)),
                                     (Depth>8 ? Large : (Depth==1 ? 1 : Small))>::ret
    value = ei_product_type_selector<(Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small)),
                                     (Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small)),
                                     (Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 1 : Small))>::ret
  };
};

/* The following allows to select the kind of product at compile time
 * based on the three dimensions of the product.
 * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one.
template<int Rows, int Cols> struct ei_product_type_selector<Rows,Cols,1>       { enum { ret = OuterProduct }; };
template<int Depth>          struct ei_product_type_selector<1,1,Depth>         { enum { ret = InnerProduct }; };
template<>                   struct ei_product_type_selector<1,1,1>             { enum { ret = InnerProduct }; };
template<>                   struct ei_product_type_selector<Small,1,Small>     { enum { ret = UnrolledProduct }; };
template<>                   struct ei_product_type_selector<1,Small,Small>     { enum { ret = UnrolledProduct }; };
template<>                   struct ei_product_type_selector<Small,Small,Small> { enum { ret = UnrolledProduct }; };

// template<> struct ei_product_type_selector<Small,1,Small>     { enum { ret = GemvProduct }; };
// template<> struct ei_product_type_selector<1,Small,Small>     { enum { ret = GemvProduct }; };
// template<> struct ei_product_type_selector<Small,Small,Small> { enum { ret = GemmProduct }; };

template<> struct ei_product_type_selector<1,Large,Small> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<1,Large,Large> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<1,Small,Large> { enum { ret = GemvProduct }; };
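Read the selector as a two-step mapping: each of Rows, Cols and Depth is first collapsed to one of {1, Small, Large}, and the resulting triple then picks the product kind. A standalone sketch of that first step (illustrative only; ei_classify_dim and SizeCategory are invented names, 8 is the new default threshold):

// Illustration only, not part of the diff.
enum SizeCategory { OneDim = 1, SmallDim, LargeDim };

inline SizeCategory ei_classify_dim(int dim, int threshold = 8)
{
  // mirrors (Dim >= threshold ? Large : (Dim == 1 ? 1 : Small))
  return dim >= threshold ? LargeDim : (dim == 1 ? OneDim : SmallDim);
}
// e.g. a 4x4 * 4x4 product is (Small,Small,Small) -> UnrolledProduct,
// while a 1x100 * 100x100 product is (1,Large,Large) -> GemvProduct.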
@@ -129,48 +128,6 @@ struct ProductReturnType<Lhs,Rhs,UnrolledProduct>
};


/***********************************************************************
* Implementation of General Matrix Matrix Product
***********************************************************************/

template<typename Lhs, typename Rhs>
struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};

template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
  public:
    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

    template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
    {
      ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

      const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
      const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                                 * RhsBlasTraits::extractScalarFactor(m_rhs);

      ei_general_matrix_matrix_product<
        Scalar,
        (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate),
        (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate),
        (Dest::Flags&RowMajorBit)?RowMajor:ColMajor>
      ::run(
        this->rows(), this->cols(), lhs.cols(),
        (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
        (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
        (Scalar*)&(dst.coeffRef(0,0)), dst.stride(),
        actualAlpha);
    }
};
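For context, a usage sketch (not part of the diff): a plain dynamic-size product like the one below is classified as GemmProduct, so evaluating it ends up in the addTo() above, which forwards to ei_general_matrix_matrix_product::run() with the extracted scalar factor as alpha. Sizes are arbitrary.

// Usage sketch only.
#include <Eigen/Core>

int main()
{
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(64, 64);
  Eigen::MatrixXf B = Eigen::MatrixXf::Random(64, 64);
  Eigen::MatrixXf C = A * B; // routed through the blocked GEMM path
  return 0;
}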

/***********************************************************************
* Implementation of Inner Vector Vector Product
***********************************************************************/
@@ -403,362 +360,6 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,false>
  }
};

/***********************************************************************
* Implementation of products with small fixed sizes
***********************************************************************/

/* Since all the dimensions of the product are small, here we can rely
 * on the generic Assign mechanism to evaluate the product per coeff (or packet).
 *
 * Note that here the inner loops should always be unrolled.
 */

template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl;

template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;

template<typename LhsNested, typename RhsNested>
struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> >
{
  typedef typename ei_cleantype<LhsNested>::type _LhsNested;
  typedef typename ei_cleantype<RhsNested>::type _RhsNested;
  typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;

  enum {
    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,

    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                    && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                    && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs),

    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
          | EvalBeforeAssigningBit
          | EvalBeforeNestingBit
          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0)
          | (LhsFlags & RhsFlags & AlignedBit),

    CoeffReadCost = InnerSize == Dynamic ? Dynamic
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
     */
    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                      && (InnerSize % ei_packet_traits<Scalar>::size == 0)
  };
};

template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct>
  : ei_no_assignment_operator,
    public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> >
{
  public:

    EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct)

  private:

    typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested;
    typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested;

    enum {
      PacketSize = ei_packet_traits<Scalar>::size,
      InnerSize  = ei_traits<GeneralProduct>::InnerSize,
      Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
      CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner
    };

    typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
                                  Unroll ? InnerSize-1 : Dynamic,
                                  _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

  public:

    template<typename Lhs, typename Rhs>
    inline GeneralProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
      // We still allow to mix T and complex<T>.
      EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      ei_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

    EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); }
    EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); }

    EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const
    {
      Scalar res;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
    EIGEN_STRONG_INLINE const Scalar coeff(int index) const
    {
      Scalar res;
      const int row = RowsAtCompileTime == 1 ? 0 : index;
      const int col = RowsAtCompileTime == 1 ? index : 0;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const
    {
      PacketScalar res;
      ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                             Unroll ? InnerSize-1 : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar, LoadMode>
        ::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

  protected:
    const LhsNested m_lhs;
    const RhsNested m_rhs;
};
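For orientation, GeneralProduct<...,UnrolledProduct> is a lazy expression: coeff(row,col) computes a single entry of the product on demand through ScalarCoeffImpl. A plain-scalar equivalent of what one such call produces (illustration only, hypothetical helper name):

// Illustration only, not Eigen code.
template<typename Lhs, typename Rhs>
typename Lhs::Scalar product_coeff_reference(const Lhs& lhs, const Rhs& rhs, int row, int col)
{
  typename Lhs::Scalar res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  for (int k = 1; k < lhs.cols(); ++k)
    res += lhs.coeff(row, k) * rhs.coeff(k, col); // short dot product, nothing is cached
  return res;
}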

/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/

/**************************************
*** Scalar path - no vectorization ***
**************************************/

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
    for(int i = 1; i < lhs.cols(); ++i)
      res += lhs.coeff(row, i) * rhs.coeff(i, col);
  }
};

// prevent buggy user code from causing an infinite recursion
template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {}
};
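The NoVectorization specializations above are standard compile-time recursion on Index. The same pattern on a raw array, as a self-contained illustration (invented names, not Eigen code):

// Illustration only.
template<int Index>
struct unrolled_dot
{
  static double run(const double* a, const double* b)
  {
    return unrolled_dot<Index-1>::run(a, b) + a[Index] * b[Index];
  }
};

template<>
struct unrolled_dot<0>
{
  static double run(const double* a, const double* b) { return a[0] * b[0]; }
};
// unrolled_dot<3>::run(a, b) expands at compile time to
// a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3], with no runtime loop.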

/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller
{
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) ));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
  }
};

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::PacketScalar PacketScalar;
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    PacketScalar pres;
    ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
    res = ei_predux(pres);
  }
};

template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct ei_product_coeff_vectorized_dyn_selector
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col));
  }
};

// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col));
  }
};

template<typename Lhs, typename Rhs, int LhsRows>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
  EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs);
  }
};

template<typename Lhs, typename Rhs>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs, rhs);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
  }
};

/*******************
*** Packet path ***
*******************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
  }
};

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
  }
};

/***************************************************************************
* Implementation of matrix base methods
***************************************************************************/
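A scalar rendering (illustration only) of one step of the ColMajor packet path above: ei_pset1 broadcasts a single rhs coefficient and ei_pmadd multiply-accumulates it against a packet of lhs entries.

// Illustration only; the real code operates on SIMD packets, not loops.
void packet_column_update_reference(const float* lhsColumn, float rhsCoeff,
                                    float* acc, int packetSize)
{
  for (int k = 0; k < packetSize; ++k)
    acc[k] += lhsColumn[k] * rhsCoeff; // acc = pmadd(lhsColumn, pset1(rhsCoeff), acc)
}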
@@ -196,101 +196,6 @@ struct ei_triangular_assignment_selector<Derived1, Derived2, SelfAdjoint, Dynami
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_vector
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet");
    ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
      (
        lhs.rows(),                                     // size
        &lhs.coeff(0,0), lhs.stride(),                  // lhs info
        &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info
        &dst.coeffRef(0),                               // result info
        actualAlpha                                     // scale factor
      );
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_matrix
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit),
    LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit,
    RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit),
    RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit
  };

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_product_selfadjoint_matrix<Scalar,
      EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,
                        ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,
                        ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)),
      ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(),           // sizes
        &lhs.coeff(0,0), lhs.stride(),    // lhs info
        &rhs.coeff(0,0), rhs.stride(),    // rhs info
        &dst.coeffRef(0,0), dst.stride(), // result info
        actualAlpha                       // alpha
      );
  }
};
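Usage sketch only, written with the present-day Eigen 3 spelling of the API (which may differ from the tree at this commit): expressions like these create SelfadjointProductMatrix nodes whose addTo() forwards to the packed selfadjoint kernels wrapped above.

// Usage sketch only.
#include <Eigen/Core>

int main()
{
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(32, 32);
  Eigen::VectorXd v = Eigen::VectorXd::Random(32);
  Eigen::VectorXd y = M.selfadjointView<Eigen::Lower>() * v; // selfadjoint * vector wrapper
  Eigen::MatrixXd C = M.selfadjointView<Eigen::Lower>() * M; // selfadjoint * matrix wrapper
  return 0;
}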

/***************************************************************************
* Implementation of MatrixBase methods
***************************************************************************/
Eigen/src/Core/products/GeneralBlockPanelKernel.h (new file, 443 lines)
@@ -0,0 +1,443 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

#ifndef EIGEN_EXTERN_INSTANTIATIONS

// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
  {
    typedef typename ei_packet_traits<Scalar>::type PacketType;
    enum { PacketSize = ei_packet_traits<Scalar>::size };
    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    Conj cj;
    int packet_cols = (cols/nr) * nr;
    const int peeled_mc = (rows/mr)*mr;
    // loops on each cache friendly block of the result/rhs
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // loops on each register blocking of lhs/res
      for(int i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // TODO move the res loads to the stores

        // gets res block as register
        PacketType C0, C1, C2, C3, C4, C5, C6, C7;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);

        // performs "inner" product
        // TODO let's check whether the following peeled loop could not be
        // optimized via optimal prefetching from one loop to the other
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
        const int peeled_kc = (depth/4)*4;
        for(int k=0; k<peeled_kc; k+=4)
        {
          PacketType B0, B1, B2, B3, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          B1 = ei_pload(&blB[1*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
          C4 = cj.pmadd(A1, B0, C4);
          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
          B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[2*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[3*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[4*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[5*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);

          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[6*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[7*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);

          blB += 4*nr*PacketSize;
          blA += 4*mr;
        }
        // process remaining peeled loop
        for(int k=peeled_kc; k<depth; k++)
        {
          PacketType B0, B1, B2, B3, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          B1 = ei_pload(&blB[1*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
          C4 = cj.pmadd(A1, B0, C4);
          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);

          blB += nr*PacketSize;
          blA += mr;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
        ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
      }
      for(int i=peeled_mc; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // gets a 1 x nr res block as registers
        Scalar C0(0), C1(0), C2(0), C3(0);
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
        for(int k=0; k<depth; k++)
        {
          Scalar B0, B1, B2, B3, A0;

          A0 = blA[k];
          B0 = blB[0*PacketSize];
          B1 = blB[1*PacketSize];
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = blB[2*PacketSize];
          if(nr==4) B3 = blB[3*PacketSize];
          C1 = cj.pmadd(A0, B1, C1);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);

          blB += nr*PacketSize;
        }
        res[(j2+0)*resStride + i] += C0;
        res[(j2+1)*resStride + i] += C1;
        if(nr==4) res[(j2+2)*resStride + i] += C2;
        if(nr==4) res[(j2+3)*resStride + i] += C3;
      }
    }

    // process remaining rhs/res columns one at a time
    // => do the same but with nr==1
    for(int j2=packet_cols; j2<cols; j2++)
    {
      for(int i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // TODO move the res loads to the stores

        // gets res block as register
        PacketType C0, C4;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);

        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
        for(int k=0; k<depth; k++)
        {
          PacketType B0, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);

          blB += PacketSize;
          blA += mr;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
      }
      for(int i=peeled_mc; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // gets a 1 x 1 res block as registers
        Scalar C0(0);
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
        for(int k=0; k<depth; k++)
          C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
        res[(j2+0)*resStride + i] += C0;
      }
    }
  }
};
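To make the kernel's contract explicit, a scalar reference of what it computes (illustration only): res += A*B for one rows x depth block of the lhs against a depth x cols panel of the rhs, accumulated into a column-major res with stride resStride. The packed buffers in the real kernel use a different memory layout (mr-wide slivers for blockA, coefficient-expanded packets for blockB); this sketch only states the arithmetic.

// Reference semantics only; A is rows x depth, B is depth x cols, both column-major.
template<typename Scalar>
void gebp_reference(Scalar* res, int resStride,
                    const Scalar* A, const Scalar* B,
                    int rows, int depth, int cols)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < rows; ++i)
    {
      Scalar acc = Scalar(0);
      for (int k = 0; k < depth; ++k)
        acc += A[k*rows + i] * B[j*depth + k];
      res[j*resStride + i] += acc;
    }
}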

// pack a block of the lhs
// The traversal order is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (rows/mr)*mr;
    for(int i=0; i<peeled_mc; i+=mr)
    {
      if(PanelMode) count += mr * offset;
      for(int k=0; k<depth; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += mr * (stride-offset-depth);
    }
    for(int i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(int k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
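A small demo (illustration only) of the traversal order documented above: packing a 6 x 3 lhs with mr == 4 visits the first four rows in mr-wide, column-by-column slivers and then the two leftover rows one at a time.

// Illustration only; prints the source coordinate written at each packed position.
#include <cstdio>

int main()
{
  const int rows = 6, depth = 3, mr = 4;
  int count = 0;
  const int peeled_mc = (rows/mr)*mr;      // = 4: one full mr-wide sliver
  for (int i = 0; i < peeled_mc; i += mr)
    for (int k = 0; k < depth; ++k)
      for (int w = 0; w < mr; ++w)
        std::printf("blockA[%d] = lhs(%d,%d)\n", count++, i+w, k);
  for (int i = peeled_mc; i < rows; ++i)   // leftover rows, one at a time
    for (int k = 0; k < depth; ++k)
      std::printf("blockA[%d] = lhs(%d,%d)\n", count++, i, k);
  return 0;
}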

// copy a complete panel of the rhs while expanding each coefficient into a packet form
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
          count += PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(b0[k]));
          count += PacketSize;
        }
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

// this version is optimized for row major matrices
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
        count += PacketSize;
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

#endif // EIGEN_EXTERN_INSTANTIATIONS

#endif // EIGEN_GENERAL_BLOCK_PANEL_H
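A plain-array analogue (illustration only) of the rhs packing for the nr == 1 tail: every coefficient is pre-scaled by alpha and duplicated PacketSize times, so the kernel can load it as an already-broadcast packet instead of calling ei_pset1 in the hot loop.

// Illustration only; the real code writes SIMD packets with ei_pstore/ei_pset1.
void pack_rhs_column_reference(float* blockB, const float* rhsColumn,
                               float alpha, int depth, int packetSize)
{
  int count = 0;
  for (int k = 0; k < depth; ++k)
    for (int p = 0; p < packetSize; ++p)
      blockB[count++] = alpha * rhsColumn[k];
}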
@@ -27,6 +27,7 @@

#ifndef EIGEN_EXTERN_INSTANTIATIONS

/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
@@ -51,6 +52,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
  }
};

/* Specialization for a col-major destination matrix
 * => Blocking algorithm following Goto's paper */
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
@@ -78,23 +81,30 @@ static void run(int rows, int cols, int depth,
    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

    // => GEMM_VAR1
    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
    // (==GEMM_VAR1)
    for(int k2=0; k2<depth; k2+=kc)
    {
      const int actual_kc = std::min(k2+kc,depth)-k2;

      // we have selected one row panel of rhs and one column panel of lhs
      // pack rhs's panel into a sequential chunk of memory
      // and expand each coeff to a constant packet for further reuse
      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
      // Note that this panel will be read as many times as the number of blocks in the lhs's
      // vertical panel which is, in practice, a very low number.
      ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

      // => GEPP_VAR1
      // For each mc x kc block of the lhs's vertical panel...
      // (==GEPP_VAR1)
      for(int i2=0; i2<rows; i2+=mc)
      {
        const int actual_mc = std::min(i2+mc,rows)-i2;

        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
        // Note that this block will be read a very high number of times, which is equal to the number of
        // micro vertical panels of the large rhs's panel (e.g., cols/4 times).
        ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

        // Everything is packed, we can now call the block * panel kernel:
        ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
          (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
      }
@@ -106,417 +116,49 @@ static void run(int rows, int cols, int depth,

};
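The control structure of run() above, stripped of the packing and kernel details (illustration only; kc and mc are the cache-blocking sizes used above):

// Skeleton only; the bodies are sketched as comments.
void gemm_blocked_outline(int rows, int cols, int depth, int kc, int mc)
{
  (void)cols; // only used by the packing/kernel steps sketched below
  for (int k2 = 0; k2 < depth; k2 += kc)      // GEMM_VAR1: one horizontal rhs panel
  {
    // pack rhs(k2..k2+kc, 0..cols) into blockB          (kept hot in L2)
    for (int i2 = 0; i2 < rows; i2 += mc)     // GEPP_VAR1: one mc x kc lhs block
    {
      // pack lhs(i2..i2+mc, k2..k2+kc) into blockA      (kept hot in L1)
      // gebp kernel: res(i2.., 0..cols) += blockA * blockB
    }
  }
}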
|
||||
// optimized GEneral packed Block * packed Panel product kernel
|
||||
template<typename Scalar, int mr, int nr, typename Conj>
|
||||
struct ei_gebp_kernel
|
||||
{
|
||||
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
|
||||
{
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
if(strideA==-1) strideA = depth;
|
||||
if(strideB==-1) strideB = depth;
|
||||
Conj cj;
|
||||
int packet_cols = (cols/nr) * nr;
|
||||
const int peeled_mc = (rows/mr)*mr;
|
||||
// loops on each cache friendly block of the result/rhs
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// loops on each register blocking of lhs/res
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
|
||||
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
|
||||
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
|
||||
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
|
||||
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
|
||||
|
||||
// performs "inner" product
|
||||
// TODO let's check wether the flowing peeled loop could not be
|
||||
// optimized via optimal prefetching from one loop to the other
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
const int peeled_kc = (depth/4)*4;
|
||||
for(int k=0; k<peeled_kc; k+=4)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
|
||||
blB += 4*nr*PacketSize;
|
||||
blA += 4*mr;
|
||||
}
|
||||
// process remaining peeled loop
|
||||
for(int k=peeled_kc; k<depth; k++)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
|
||||
blB += nr*PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i], C0);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i], C1);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
|
||||
}
|
||||
for(int i=peeled_mc; i<rows; i++)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// gets a 1 x nr res block as registers
|
||||
Scalar C0(0), C1(0), C2(0), C3(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
Scalar B0, B1, B2, B3, A0;
|
||||
|
||||
A0 = blA[k];
|
||||
B0 = blB[0*PacketSize];
|
||||
B1 = blB[1*PacketSize];
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = blB[2*PacketSize];
|
||||
if(nr==4) B3 = blB[3*PacketSize];
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
|
||||
blB += nr*PacketSize;
|
||||
}
|
||||
res[(j2+0)*resStride + i] += C0;
|
||||
res[(j2+1)*resStride + i] += C1;
|
||||
if(nr==4) res[(j2+2)*resStride + i] += C2;
|
||||
if(nr==4) res[(j2+3)*resStride + i] += C3;
|
||||
}
|
||||
}
|
||||
|
||||
// process remaining rhs/res columns one at a time
|
||||
// => do the same but with nr==1
|
||||
for(int j2=packet_cols; j2<cols; j2++)
|
||||
{
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C4;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
PacketType B0, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
|
||||
blB += PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i], C0);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
|
||||
}
|
||||
for(int i=peeled_mc; i<rows; i++)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// gets a 1 x 1 res block as registers
|
||||
Scalar C0(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
for(int k=0; k<depth; k++)
|
||||
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
|
||||
res[(j2+0)*resStride + i] += C0;
|
||||
}
|
||||
}
|
||||
}
|
||||
};

// pack a block of the lhs
// The traversal is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (rows/mr)*mr;
    for(int i=0; i<peeled_mc; i+=mr)
    {
      if(PanelMode) count += mr * offset;
      for(int k=0; k<depth; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += mr * (stride-offset-depth);
    }
    for(int i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(int k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
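
As a plain C++ sketch of the traversal pictured above (illustrative only: column-major input assumed, no conjugation, no panel mode), packing the lhs amounts to interleaving mr consecutive rows so the kernel can walk the packed block linearly:

// Pack a rows x depth column-major matrix into mr-row panels, matching the
// ordering in the comment above (for mr==4: 0 4 8 12 / 1 5 9 13 / ...).
void pack_lhs_sketch(float* blockA, const float* lhs, int lhsStride,
                     int depth, int rows, int mr)
{
  int count = 0;
  const int peeled_mc = (rows/mr)*mr;
  for (int i = 0; i < peeled_mc; i += mr)       // full mr-row panels
    for (int k = 0; k < depth; ++k)
      for (int w = 0; w < mr; ++w)
        blockA[count++] = lhs[k*lhsStride + i + w];
  for (int i = peeled_mc; i < rows; ++i)        // leftover rows, one at a time
    for (int k = 0; k < depth; ++k)
      blockA[count++] = lhs[k*lhsStride + i];
}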

// copy a complete panel of the rhs while expanding each coefficient into a packet form
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
          count += PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(b0[k]));
          count += PacketSize;
        }
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};
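
The "expansion into packet form" done above can be pictured with this minimal scalar sketch (not Eigen code): every rhs coefficient, optionally pre-scaled by alpha, is replicated PacketSize times so that the kernel can fetch it back with one aligned packet load instead of a runtime broadcast.

// Scalar stand-in for ei_pset1 + ei_pstore: broadcast one rhs column into
// packet-sized slots of blockB (single column case, i.e. the nr==1 tail).
void pack_rhs_column_sketch(float* blockB, const float* rhs,
                            int depth, int PacketSize, float alpha)
{
  int count = 0;
  for (int k = 0; k < depth; ++k)
  {
    for (int p = 0; p < PacketSize; ++p)   // replicate the coefficient
      blockB[count + p] = alpha * rhs[k];
    count += PacketSize;
  }
}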

// this version is optimized for row major matrices
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
        count += PacketSize;
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

#endif // EIGEN_EXTERN_INSTANTIATIONS

/*********************************************************************************
* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
* implementation of the high level wrapper to ei_general_matrix_matrix_product
**********************************************************************************/

template<typename Lhs, typename Rhs>
struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};

template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
  public:
    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

    template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
    {
      ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

      const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
      const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                                 * RhsBlasTraits::extractScalarFactor(m_rhs);

      ei_general_matrix_matrix_product<
        Scalar,
        (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate),
        (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate),
        (Dest::Flags&RowMajorBit)?RowMajor:ColMajor>
      ::run(
          this->rows(), this->cols(), lhs.cols(),
          (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
          (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
          (Scalar*)&(dst.coeffRef(0,0)), dst.stride(),
          actualAlpha);
    }
};
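
At the user level this wrapper is reached transparently. A hedged usage sketch (sizes are arbitrary; the scalar factor on the operand is what the BlasTraits extractScalarFactor calls above fold into actualAlpha):

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  MatrixXf a = MatrixXf::Random(64, 64), b = MatrixXf::Random(64, 64);
  MatrixXf c = MatrixXf::Zero(64, 64);
  // Large dynamic-size product: expected to dispatch to the GemmProduct path,
  // with the factor 2 extracted from the lhs expression rather than evaluated separately.
  c += (2.0f * a) * b;
  return 0;
}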

#endif // EIGEN_GENERAL_MATRIX_MATRIX_H

385
Eigen/src/Core/products/GeneralUnrolled.h
Normal file
@ -0,0 +1,385 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_UNROLLED_PRODUCT_H
#define EIGEN_GENERAL_UNROLLED_PRODUCT_H

/*********************************************************************************
* Specialization of GeneralProduct<> for products with small fixed sizes
*********************************************************************************/

/* Since all the dimensions of the product are small, here we can rely
 * on the generic Assign mechanism to evaluate the product per coeff (or packet).
 *
 * Note that here the inner-loops should always be unrolled.
 */

template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl;

template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;

template<typename LhsNested, typename RhsNested>
struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> >
{
  typedef typename ei_cleantype<LhsNested>::type _LhsNested;
  typedef typename ei_cleantype<RhsNested>::type _RhsNested;
  typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;

  enum {
    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,

    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                    && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                    && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs),

    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
          | EvalBeforeAssigningBit
          | EvalBeforeNestingBit
          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0)
          | (LhsFlags & RhsFlags & AlignedBit),

    CoeffReadCost = InnerSize == Dynamic ? Dynamic
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, which doesn't affect the ABI.
     */
    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                      && (InnerSize % ei_packet_traits<Scalar>::size == 0)
  };
};

template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct>
  : ei_no_assignment_operator,
    public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> >
{
  public:

    EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct)

  private:

    typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested;
    typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested;

    enum {
      PacketSize = ei_packet_traits<Scalar>::size,
      InnerSize = ei_traits<GeneralProduct>::InnerSize,
      Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
      CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner
    };

    typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
                                  Unroll ? InnerSize-1 : Dynamic,
                                  _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

  public:

    template<typename Lhs, typename Rhs>
    inline GeneralProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
      // We still allow mixing T and complex<T>.
      EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      ei_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

    EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); }
    EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); }

    EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const
    {
      Scalar res;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
    EIGEN_STRONG_INLINE const Scalar coeff(int index) const
    {
      Scalar res;
      const int row = RowsAtCompileTime == 1 ? 0 : index;
      const int col = RowsAtCompileTime == 1 ? index : 0;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const
    {
      PacketScalar res;
      ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                             Unroll ? InnerSize-1 : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar, LoadMode>
        ::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

  protected:
    const LhsNested m_lhs;
    const RhsNested m_rhs;
};
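
Because this specialization exposes coeff()/packet() instead of a single run() routine, a small fixed-size product is evaluated by the ordinary assignment loop, one coefficient (or packet) at a time. A hedged usage sketch (assuming a 3x3 product falls below the Small/Large threshold so the UnrolledProduct path is selected):

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  Matrix3f a = Matrix3f::Random(), b = Matrix3f::Random();
  // All three dimensions are small and fixed, so the product is expected to be
  // evaluated coefficient-wise through the unrollers defined below.
  Matrix3f c = a * b;
  return (c(0,0) == c(0,0)) ? 0 : 1; // trivial use of the result
}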

/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/

/**************************************
*** Scalar path - no vectorization ***
**************************************/

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
    for(int i = 1; i < lhs.cols(); ++i)
      res += lhs.coeff(row, i) * rhs.coeff(i, col);
  }
};

// prevent buggy user code from causing an infinite recursion
template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {}
};
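
The Index-based recursion above is ordinary template meta-unrolling; stripped of Eigen's expression types it boils down to this self-contained sketch (illustrative names only):

// Compile-time unrolled dot product: unrolled_dot<3>::run(a, b) expands to
// a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3] with no runtime loop,
// mirroring the ei_product_coeff_impl<NoVectorization, Index, ...> recursion.
template<int Index>
struct unrolled_dot
{
  static float run(const float* a, const float* b)
  { return unrolled_dot<Index-1>::run(a, b) + a[Index]*b[Index]; }
};

template<>
struct unrolled_dot<0>
{
  static float run(const float* a, const float* b) { return a[0]*b[0]; }
};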

/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller
{
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) ));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
  }
};

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::PacketScalar PacketScalar;
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    PacketScalar pres;
    ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
    res = ei_predux(pres);
  }
};

template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct ei_product_coeff_vectorized_dyn_selector
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col));
  }
};

// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col));
  }
};

template<typename Lhs, typename Rhs, int LhsRows>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
  EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs);
  }
};

template<typename Lhs, typename Rhs>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs, rhs);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
  }
};

/*******************
*** Packet path ***
*******************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
  }
};

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
  }
};

#endif // EIGEN_GENERAL_UNROLLED_PRODUCT_H

@ -339,4 +339,56 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_matrix
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit),
    LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit,
    RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit),
    RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit
  };

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_product_selfadjoint_matrix<Scalar,
      EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,
                        ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,
                        ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)),
      ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
    ::run(
      lhs.rows(), rhs.cols(),           // sizes
      &lhs.coeff(0,0), lhs.stride(),    // lhs info
      &rhs.coeff(0,0), rhs.stride(),    // rhs info
      &dst.coeffRef(0,0), dst.stride(), // result info
      actualAlpha                       // alpha
    );
  }
};

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H

@ -154,5 +154,48 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
  ei_aligned_stack_delete(Scalar, const_cast<Scalar*>(rhs), size);
}

/***************************************************************************
* Wrapper to ei_product_selfadjoint_vector
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet");
    ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
    (
      lhs.rows(),                                 // size
      &lhs.coeff(0,0), lhs.stride(),              // lhs info
      &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info
      &dst.coeffRef(0),                           // result info
      actualAlpha                                 // scale factor
    );
  }
};


#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H

@ -139,7 +139,7 @@ struct ei_product_blocking_traits
    // register block size along the N direction (must be either 2 or 4)
    nr = HalfRegisterCount/2,

    // register block size along the M direction (this cannot be modified)
    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * PacketSize,

    // max cache block size along the K direction
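
As a rough illustration of what these constants mean for the kernel's register tile (the concrete numbers below are assumptions for single-precision SSE, not something this hunk states):

#include <cstdio>

int main()
{
  const int PacketSize = 4;            // assumed: 4 floats per SSE register
  const int HalfRegisterCount = 8;     // assumed: half of 16 XMM registers on x86-64
  const int nr = HalfRegisterCount/2;  // register block along N -> 4
  const int mr = 2*PacketSize;         // register block along M -> 8
  std::printf("the kernel accumulates a %dx%d block of res in registers\n", mr, nr);
  return 0;
}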

@ -144,7 +144,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
  // const Scalar dst_var = dst_demean.rowwise().squaredNorm().sum() * one_over_n;

  // Eq. (38)
  const MatrixType sigma = (dst_demean * src_demean.transpose() * one_over_n).lazy();
  const MatrixType sigma = (one_over_n * dst_demean * src_demean.transpose()).lazy();

  SVD<MatrixType> svd(sigma);

@ -148,7 +148,6 @@ template<typename Scalar, int Mode> void transformations(void)
  Transform3 tmat3(mat3), tmat4(mat4);
  if(Mode!=int(AffineCompact))
    tmat4.matrix()(3,3) = Scalar(1);
  std::cerr << tmat3.matrix() << "\n\n" << tmat4.matrix() << "\n\n";
  VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());

  Scalar a3 = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));

@ -65,7 +65,12 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
  VERIFY_IS_APPROX((m1+m2)*s1, s1*m1+s1*m2);
  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
  VERIFY_IS_APPROX(m1.cwise() * m1.block(0,0,rows,cols), m1.cwise() * m1);
  VERIFY_IS_APPROX((m1*m1.transpose())*m2, m1*(m1.transpose()*m2));
  if (MatrixType::RowsAtCompileTime<EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD) {
    // If the matrices are too large, it is better to use the optimized GEMM
    // routines, which allocate temporaries. However, on some platforms
    // these temporaries are allocated on the stack using alloca.
    VERIFY_IS_APPROX((m1*m1.transpose())*m2, m1*(m1.transpose()*m2));
  }
}

void test_nomalloc()