mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-31 19:00:35 +08:00
Optimization: added super efficient rowmajor * vector product (and vector * colmajor).
It basically performs 4 dot products at once reducing loads of the vector and improving instructions scheduling. With 3 cache friendly algorithms, we now handle all product configurations with outstanding perf for large matrices.
This commit is contained in:
parent
51e6ee39f0
commit
99a625243f
@ -460,7 +460,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
|
||||
break;
|
||||
default:
|
||||
for (int j = peeledSize; j<alignedSize; j+=PacketSize)
|
||||
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(u,u,u,);
|
||||
break;
|
||||
}
|
||||
@ -487,10 +487,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
|
||||
// process aligned result's coeffs
|
||||
if ((iN0 % PacketSize) == 0)
|
||||
for (int j = 0;j<alignedSize;j+=PacketSize)
|
||||
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
|
||||
ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
|
||||
else
|
||||
for (int j = 0;j<alignedSize;j+=PacketSize)
|
||||
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
|
||||
ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
|
||||
|
||||
// process remaining scalars
|
||||
@ -511,4 +511,157 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
#undef _EIGEN_ACCUMULATE_PACKETS
|
||||
}
|
||||
|
||||
template<typename Scalar, typename ResType>
|
||||
EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
|
||||
const Scalar* lhs, int lhsStride,
|
||||
const Scalar* rhs, int rhsSize,
|
||||
ResType& res)
|
||||
{
|
||||
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
||||
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
||||
#endif
|
||||
|
||||
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\
|
||||
Packet b = ei_pload(&rhs[j]); \
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload##A0 (&lhs[j+iN0]))); \
|
||||
ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload##A13(&lhs[j+iN1]))); \
|
||||
ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload##A2 (&lhs[j+iN2]))); \
|
||||
ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload##A13(&lhs[j+iN3]))); }
|
||||
|
||||
asm("#begin matrix_vector_product");
|
||||
typedef typename ei_packet_traits<Scalar>::type Packet;
|
||||
const int PacketSize = sizeof(Packet)/sizeof(Scalar);
|
||||
|
||||
enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
|
||||
const int rowsAtOnce = 4;
|
||||
const int peels = 2;
|
||||
const int PacketAlignedMask = PacketSize-1;
|
||||
const int PeelAlignedMask = PacketSize*peels-1;
|
||||
const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
|
||||
const int size = rhsSize;
|
||||
|
||||
// How many coeffs of the result do we have to skip to be aligned.
|
||||
// Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
|
||||
const int alignedStart = Vectorized
|
||||
? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
|
||||
: 0;
|
||||
const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
|
||||
const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
|
||||
|
||||
const int alignmentStep = lhsStride % PacketSize;
|
||||
int alignmentPattern = alignmentStep==0 ? AllAligned
|
||||
: alignmentStep==2 ? EvenAligned
|
||||
: FirstAligned;
|
||||
|
||||
// find how many rows do we have to skip to be aligned with rhs (if possible)
|
||||
int skipRows=0;
|
||||
for (; skipRows<PacketSize && alignedStart != alignmentStep*skipRows; ++skipRows)
|
||||
{}
|
||||
if (skipRows==PacketSize)
|
||||
{
|
||||
// nothing can be aligned, no need to skip any column
|
||||
alignmentPattern = NoneAligned;
|
||||
skipRows = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
skipRows = std::min(skipRows,res.size());
|
||||
// note that the skiped columns are processed later.
|
||||
}
|
||||
|
||||
int rowBound = (res.size()/rowsAtOnce)*rowsAtOnce;
|
||||
for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
|
||||
{
|
||||
Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
|
||||
Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
|
||||
int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride;
|
||||
|
||||
// process initial unaligned coeffs
|
||||
for (int j=0; j<alignedStart; j++)
|
||||
{
|
||||
Scalar b = rhs[j];
|
||||
tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3];
|
||||
}
|
||||
|
||||
if (alignedSize>alignedStart)
|
||||
{
|
||||
switch(alignmentPattern)
|
||||
{
|
||||
case AllAligned:
|
||||
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(,,,);
|
||||
break;
|
||||
case EvenAligned:
|
||||
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(,u,,);
|
||||
break;
|
||||
case FirstAligned:
|
||||
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
|
||||
break;
|
||||
default:
|
||||
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(u,u,u,);
|
||||
break;
|
||||
}
|
||||
tmp0 += ei_predux(ptmp0);
|
||||
tmp1 += ei_predux(ptmp1);
|
||||
tmp2 += ei_predux(ptmp2);
|
||||
tmp3 += ei_predux(ptmp3);
|
||||
}
|
||||
|
||||
// process remaining coeffs
|
||||
for (int j=alignedSize; j<size; j++)
|
||||
{
|
||||
Scalar b = rhs[j];
|
||||
tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3];
|
||||
}
|
||||
res[i] = tmp0; res[i+1] = tmp1; res[i+2] = tmp2; res[i+3] = tmp3;
|
||||
}
|
||||
|
||||
// process remaining first and last rows (at most columnsAtOnce-1)
|
||||
int end = res.size();
|
||||
int start = rowBound;
|
||||
do
|
||||
{
|
||||
for (int i=rowBound; i<end; i++)
|
||||
{
|
||||
Scalar tmp0 = Scalar(0);
|
||||
Packet ptmp0 = ei_pset1(tmp0);
|
||||
int iN0 = i*lhsStride;
|
||||
// process first unaligned result's coeffs
|
||||
for (int j=0; j<alignedStart; j++)
|
||||
tmp0 += rhs[j] * lhs[j+iN0];
|
||||
|
||||
if (alignedSize>alignedStart)
|
||||
{
|
||||
// process aligned rhs coeffs
|
||||
if (iN0 % PacketSize==0)
|
||||
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_pload(&lhs[j+iN0])));
|
||||
else
|
||||
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_ploadu(&lhs[j+iN0])));
|
||||
tmp0 += ei_predux(ptmp0);
|
||||
}
|
||||
|
||||
// process remaining scalars
|
||||
for (int j=alignedSize; j<size; j++)
|
||||
tmp0 += rhs[j] * lhs[j+iN0];
|
||||
res[i] += tmp0;
|
||||
}
|
||||
if (skipRows)
|
||||
{
|
||||
start = 0;
|
||||
end = skipRows;
|
||||
skipRows = 0;
|
||||
}
|
||||
else
|
||||
break;
|
||||
} while(true);
|
||||
asm("#end matrix_vector_product");
|
||||
|
||||
#undef _EIGEN_ACCUMULATE_PACKETS
|
||||
}
|
||||
|
||||
#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
|
||||
|
@ -125,7 +125,7 @@ template<typename T, int Size, int _Cols> class ei_matrix_storage<T, Size, Dynam
|
||||
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
|
||||
inline int rows(void) const {return m_rows;}
|
||||
inline int cols(void) const {return _Cols;}
|
||||
inline void resize(int size, int rows, int)
|
||||
inline void resize(int /*size*/, int rows, int)
|
||||
{
|
||||
m_rows = rows;
|
||||
}
|
||||
|
@ -85,16 +85,19 @@ struct ProductReturnType<Lhs,Rhs,CacheFriendlyProduct>
|
||||
*/
|
||||
template<typename Lhs, typename Rhs> struct ei_product_mode
|
||||
{
|
||||
enum{ value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal)
|
||||
? DiagonalProduct
|
||||
: (Rhs::Flags & Lhs::Flags & SparseBit)
|
||||
? SparseProduct
|
||||
: Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
&& ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| ((Rhs::Flags&RowMajorBit) && Lhs::IsVectorAtCompileTime))
|
||||
&& ( Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| ((!(Lhs::Flags&RowMajorBit)) && Rhs::IsVectorAtCompileTime))
|
||||
? CacheFriendlyProduct : NormalProduct };
|
||||
enum{
|
||||
|
||||
value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal)
|
||||
? DiagonalProduct
|
||||
: (Rhs::Flags & Lhs::Flags & SparseBit)
|
||||
? SparseProduct
|
||||
: Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
&& ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD )
|
||||
&& (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit) && (!Lhs::Flags&DirectAccessBit)))
|
||||
&& (!(Lhs::IsVectorAtCompileTime && (!Rhs::Flags&RowMajorBit) && (!Rhs::Flags&DirectAccessBit)))
|
||||
? CacheFriendlyProduct
|
||||
: NormalProduct };
|
||||
};
|
||||
|
||||
/** \class Product
|
||||
@ -210,11 +213,9 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
||||
*/
|
||||
inline bool _useCacheFriendlyProduct() const
|
||||
{
|
||||
return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
&& (rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| ((_RhsNested::Flags&RowMajorBit) && _LhsNested::IsVectorAtCompileTime))
|
||||
&& (cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| ((!(_LhsNested::Flags&RowMajorBit)) && _RhsNested::IsVectorAtCompileTime));
|
||||
return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
&& ( rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
||||
|| cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD);
|
||||
}
|
||||
|
||||
inline int rows() const { return m_lhs.rows(); }
|
||||
@ -365,7 +366,6 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE the following specializations are because taking .col(0) on a vector is a bit slower
|
||||
template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
|
||||
struct ei_product_coeff_vectorized_dyn_selector
|
||||
{
|
||||
@ -378,6 +378,7 @@ struct ei_product_coeff_vectorized_dyn_selector
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE the 2 following specializations are because taking .col(0) on a vector is a bit slower
|
||||
template<typename Lhs, typename Rhs, int RhsCols>
|
||||
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
|
||||
{
|
||||
@ -483,6 +484,10 @@ template<typename Scalar, typename RhsType>
|
||||
static void ei_cache_friendly_product_colmajor_times_vector(
|
||||
int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
|
||||
|
||||
template<typename Scalar, typename ResType>
|
||||
static void ei_cache_friendly_product_rowmajor_times_vector(
|
||||
const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res);
|
||||
|
||||
template<typename ProductType,
|
||||
int LhsRows = ei_traits<ProductType>::RowsAtCompileTime,
|
||||
int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
|
||||
@ -538,7 +543,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect
|
||||
product.rhs(), _res);
|
||||
|
||||
if (!EvalToRes)
|
||||
res = Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size());
|
||||
res = Map<Matrix<Scalar,DestDerived::SizeAtCompileTime,1> >(_res, res.size());
|
||||
}
|
||||
};
|
||||
|
||||
@ -574,17 +579,82 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
|
||||
else
|
||||
{
|
||||
_res = (Scalar*)alloca(sizeof(Scalar)*res.size());
|
||||
Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
|
||||
Map<Matrix<Scalar,DestDerived::SizeAtCompileTime,1> >(_res, res.size()) = res;
|
||||
}
|
||||
ei_cache_friendly_product_colmajor_times_vector(res.size(),
|
||||
&product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
|
||||
product.lhs().transpose(), _res);
|
||||
|
||||
if (!EvalToRes)
|
||||
res = Map<Matrix<Scalar,1,DestDerived::ColsAtCompileTime> >(_res, res.size());
|
||||
res = Map<Matrix<Scalar,DestDerived::SizeAtCompileTime,1> >(_res, res.size());
|
||||
}
|
||||
};
|
||||
|
||||
// optimized rowmajor - vector product
|
||||
template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
|
||||
struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,HasDirectAccess,1,RhsOrder,RhsAccess>
|
||||
{
|
||||
typedef typename ProductType::Scalar Scalar;
|
||||
typedef typename ei_traits<ProductType>::_RhsNested Rhs;
|
||||
enum {
|
||||
UseRhsDirectly = ((ei_packet_traits<Scalar>::size==1) || (Rhs::Flags&ActualPacketAccessBit))
|
||||
&& (!(Rhs::Flags & RowMajorBit)) };
|
||||
|
||||
template<typename DestDerived>
|
||||
inline static void run(DestDerived& res, const ProductType& product)
|
||||
{
|
||||
Scalar* __restrict__ _rhs;
|
||||
if (UseRhsDirectly)
|
||||
_rhs = &product.rhs().const_cast_derived().coeffRef(0);
|
||||
else
|
||||
{
|
||||
_rhs = (Scalar*)alloca(sizeof(Scalar)*product.rhs().size());
|
||||
Map<Matrix<Scalar,Rhs::SizeAtCompileTime,1> >(_rhs, product.rhs().size()) = product.rhs();
|
||||
}
|
||||
ei_cache_friendly_product_rowmajor_times_vector(&product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
|
||||
_rhs, product.rhs().size(), res);
|
||||
}
|
||||
};
|
||||
|
||||
// optimized vector - colmajor product
|
||||
template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
|
||||
struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,ColMajor,HasDirectAccess>
|
||||
{
|
||||
typedef typename ProductType::Scalar Scalar;
|
||||
typedef typename ei_traits<ProductType>::_LhsNested Lhs;
|
||||
enum {
|
||||
UseLhsDirectly = ((ei_packet_traits<Scalar>::size==1) || (Lhs::Flags&ActualPacketAccessBit))
|
||||
&& (!(Lhs::Flags & RowMajorBit)) };
|
||||
|
||||
template<typename DestDerived>
|
||||
inline static void run(DestDerived& res, const ProductType& product)
|
||||
{
|
||||
Scalar* __restrict__ _lhs;
|
||||
if (UseLhsDirectly)
|
||||
_lhs = &product.lhs().const_cast_derived().coeffRef(0);
|
||||
else
|
||||
{
|
||||
_lhs = (Scalar*)alloca(sizeof(Scalar)*product.lhs().size());
|
||||
Map<Matrix<Scalar,Lhs::SizeAtCompileTime,1> >(_lhs, product.lhs().size()) = product.lhs();
|
||||
}
|
||||
ei_cache_friendly_product_rowmajor_times_vector(&product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
|
||||
_lhs, product.lhs().size(), res);
|
||||
}
|
||||
};
|
||||
|
||||
// discard this case which has to be handled by the default path
|
||||
// (we keep it to be sure to hit a compilation error if this is not the case)
|
||||
template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
|
||||
struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,NoDirectAccess,1,RhsOrder,RhsAccess>
|
||||
{};
|
||||
|
||||
// discard this case which has to be handled by the default path
|
||||
// (we keep it to be sure to hit a compilation error if this is not the case)
|
||||
template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
|
||||
struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,ColMajor,NoDirectAccess>
|
||||
{};
|
||||
|
||||
|
||||
/** \internal */
|
||||
template<typename Derived>
|
||||
template<typename Lhs,typename Rhs>
|
||||
|
@ -5,7 +5,6 @@ mtl4 ; with lines lc rgbcolor "#74B973" lt 1
|
||||
blitz ; with lines lc rgbcolor "#38F5F5" lt 1
|
||||
ATLAS ; with lines lc rgbcolor "green" lt 1
|
||||
INTEL_MKL ; with lines lc rgbcolor "yellow" lt 2
|
||||
MKL_INTEL ; with lines lc rgbcolor "yellow" lt 2
|
||||
ublas ; with lines lc rgbcolor "red" lt 1
|
||||
F77 ; with lines lc rgbcolor "#9A6B36" lt 1
|
||||
C ; with lines lc rgbcolor "#7DF4FF" lt 1
|
||||
|
Loading…
x
Reference in New Issue
Block a user