more refactoring in the level3 products
This commit is contained in:
parent d6627d540e
commit d6475ea390
@@ -176,6 +176,7 @@ namespace Eigen {
#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
#include "src/Core/util/BlasUtil.h"
#include "src/Core/Product.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/GeneralMatrixVector.h"
@@ -577,22 +577,6 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod

// Forward declarations

template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs>
static void ei_cache_friendly_product(
  int _rows, int _cols, int depth,
  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
  bool resRowMajor, Scalar* res, int resStride,
  Scalar alpha);

template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename RhsType>
static void ei_cache_friendly_product_colmajor_times_vector(
  int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);

template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename ResType>
static void ei_cache_friendly_product_rowmajor_times_vector(
  const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha);

// This helper class aims to determine which optimized product to call,
// and how to call it. We have to distinguish three major cases:
//  1 - matrix-matrix
@@ -926,16 +910,18 @@ inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived&
  typedef typename ei_unref<RhsCopy>::type _RhsCopy;
  LhsCopy lhs(actualLhs);
  RhsCopy rhs(actualRhs);
  ei_cache_friendly_product<Scalar,
    ((int(Flags)&RowMajorBit) ? bool(RhsProductTraits::NeedToConjugate) : bool(LhsProductTraits::NeedToConjugate)),
    ((int(Flags)&RowMajorBit) ? bool(LhsProductTraits::NeedToConjugate) : bool(RhsProductTraits::NeedToConjugate))>
    (
      rows(), cols(), lhs.cols(),
      _LhsCopy::Flags&RowMajorBit, (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
      _RhsCopy::Flags&RowMajorBit, (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
      Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride(),
      actualAlpha
    );

  ei_general_matrix_matrix_product<
    Scalar,
    (_LhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsProductTraits::NeedToConjugate),
    (_RhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsProductTraits::NeedToConjugate),
    (DestDerived::Flags&RowMajorBit)?RowMajor:ColMajor>
  ::run(
    rows(), cols(), lhs.cols(),
    (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
    (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
    (Scalar*)&(res.coeffRef(0,0)), res.stride(),
    actualAlpha);
}

#endif // EIGEN_PRODUCT_H
@@ -25,145 +25,63 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H

template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj>
struct ei_gebp_kernel;

template<typename Scalar, int PacketSize, int nr>
struct ei_gemm_pack_rhs;

template<typename Scalar, int mr>
struct ei_gemm_pack_lhs;

template <int L2MemorySize,typename Scalar>
struct ei_L2_block_traits {
  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
};

template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper;

template<> struct ei_conj_helper<false,false>
{
  template<typename T>
  EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); }
  template<typename T>
  EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); }
};

template<> struct ei_conj_helper<false,true>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); }
};

template<> struct ei_conj_helper<true,false>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
};

template<> struct ei_conj_helper<true,true>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
};
#ifndef EIGEN_EXTERN_INSTANTIATIONS

/** \warning you should never call this function directly,
  * this is because the ConjugateLhs/ConjugateRhs have to
  * be flipped if resRowMajor==true */
template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs>
static void ei_cache_friendly_product(
  int _rows, int _cols, int depth,
  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
  bool resRowMajor, Scalar* res, int resStride,
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
  int RhsStorageOrder, bool ConjugateRhs>
struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,RowMajor>
{
  static EIGEN_STRONG_INLINE void run(
    int rows, int cols, int depth,
    const Scalar* lhs, int lhsStride,
    const Scalar* rhs, int rhsStride,
    Scalar* res, int resStride,
    Scalar alpha)
  {
    // transpose the product such that the result is column major
    ei_general_matrix_matrix_product<Scalar,
      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateRhs,
      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateLhs,
      ColMajor>
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
  }
};
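As a side note (not part of the commit): the specialization above relies on the identity C = alpha*A*B being equivalent to C^T = alpha*B^T*A^T, together with the fact that a row-major result occupies the same memory as its column-major transpose. That is why run() swaps lhs and rhs, flips both storage orders, carries the two conjugation flags over with their swapped operands, and forwards everything to the ColMajor specialization below.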
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
  int RhsStorageOrder, bool ConjugateRhs>
struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor>
{
  static void run(int rows, int cols, int depth,
    const Scalar* _lhs, int lhsStride,
    const Scalar* _rhs, int rhsStride,
    Scalar* res, int resStride,
    Scalar alpha)
  {
    const Scalar* EIGEN_RESTRICT lhs;
    const Scalar* EIGEN_RESTRICT rhs;
    int lhsStride, rhsStride, rows, cols;
    bool lhsRowMajor;
    ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
    ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);

    ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
    if (ConjugateRhs)
      alpha = ei_conj(alpha);
    bool hasAlpha = alpha != Scalar(1);

    if (resRowMajor)
    {
      // return ei_cache_friendly_product<Scalar,ConjugateRhs,ConjugateLhs>(_cols,_rows,depth,
      //   !_rhsRowMajor, _rhs, _rhsStride,
      //   !_lhsRowMajor, _lhs, _lhsStride,
      //   false, res, resStride,
      //   alpha);

      lhs = _rhs;
      rhs = _lhs;
      lhsStride = _rhsStride;
      rhsStride = _lhsStride;
      cols = _rows;
      rows = _cols;
      lhsRowMajor = !_rhsRowMajor;
      ei_assert(_lhsRowMajor);
    }
    else
    {
      lhs = _lhs;
      rhs = _rhs;
      lhsStride = _lhsStride;
      rhsStride = _rhsStride;
      rows = _rows;
      cols = _cols;
      lhsRowMajor = _lhsRowMajor;
      ei_assert(!_rhsRowMajor);
    }

    typedef typename ei_packet_traits<Scalar>::type PacketType;
    typedef ei_product_blocking_traits<Scalar> Blocking;

    enum {
      PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
      HalfRegisterCount = 4,
#else
      HalfRegisterCount = 8,
#endif

      // register block size along the N direction
      nr = HalfRegisterCount/2,

      // register block size along the M direction
      mr = 2 * PacketSize,

      // max cache block size along the K direction
      Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,

      // max cache block size along the M direction
      Max_mc = 2*Max_kc
    };

    int kc = std::min<int>(Max_kc,depth); // cache block size along the K direction
    int mc = std::min<int>(Max_mc,rows);  // cache block size along the M direction
    int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
    int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
    Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

    // number of columns which can be processed by packet of nr columns
    int packet_cols = (cols/nr)*nr;
    int packet_cols = (cols/Blocking::nr) * Blocking::nr;

    // GEMM_VAR1
    // => GEMM_VAR1
    for(int k2=0; k2<depth; k2+=kc)
    {
      const int actual_kc = std::min(k2+kc,depth)-k2;
@@ -171,24 +89,26 @@ static void ei_cache_friendly_product(
      // we have selected one row panel of rhs and one column panel of lhs
      // pack rhs's panel into a sequential chunk of memory
      // and expand each coeff to a constant packet for further reuse
      ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
      ei_gemm_pack_rhs<Scalar, Blocking::PacketSize, Blocking::nr>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);

      // => GEPP_VAR1
      for(int i2=0; i2<rows; i2+=mc)
      {
        const int actual_mc = std::min(i2+mc,rows)-i2;

        ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

        ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);

        ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
        ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
          (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
      }
    }

    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
    ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
    ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
  }

};
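For orientation, here is a minimal, self-contained sketch (not Eigen code, and not part of the commit) of the cache-blocking structure used above, without the packing and register-kernel machinery; the kc/mc defaults are arbitrary stand-ins for Blocking::Max_kc / Blocking::Max_mc:

// col-major C(rows x cols) += alpha * A(rows x depth) * B(depth x cols)
#include <algorithm>
#include <vector>
#include <cstdio>

void blocked_gemm(int rows, int cols, int depth,
                  const double* A, int lda,
                  const double* B, int ldb,
                  double* C, int ldc, double alpha,
                  int kc = 256, int mc = 512)
{
  for (int k2 = 0; k2 < depth; k2 += kc) {
    const int actual_kc = std::min(k2 + kc, depth) - k2;   // current k-panel (GEMM_VAR1)
    for (int i2 = 0; i2 < rows; i2 += mc) {
      const int actual_mc = std::min(i2 + mc, rows) - i2;  // current m-block (GEPP_VAR1)
      // "gebp" step: multiply the (actual_mc x actual_kc) block of A by the
      // (actual_kc x cols) panel of B and accumulate into C
      for (int j = 0; j < cols; ++j)
        for (int k = 0; k < actual_kc; ++k) {
          const double b = alpha * B[(k2 + k) + j * ldb];
          for (int i = 0; i < actual_mc; ++i)
            C[(i2 + i) + j * ldc] += A[(i2 + i) + (k2 + k) * lda] * b;
        }
    }
  }
}

int main()
{
  const int n = 64;
  std::vector<double> A(n * n, 1.0), B(n * n, 2.0), C(n * n, 0.0);
  blocked_gemm(n, n, n, A.data(), n, B.data(), n, C.data(), n, 1.0);
  std::printf("C(0,0) = %g\n", C[0]);  // expect 128 with these fill values
  return 0;
}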
// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj>
struct ei_gebp_kernel
@@ -356,8 +276,7 @@ struct ei_gebp_kernel
          if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
        }
      }
      // remaining rhs/res columns (<nr)

      // process remaining rhs/res columns one at a time
      // => do the same but with nr==1
      for(int j2=packet_cols; j2<cols; j2++)
@@ -413,35 +332,22 @@ struct ei_gebp_kernel
};

// pack a block of the lhs
template<typename Scalar, int mr>
template<typename Scalar, int mr, int StorageOrder>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
  void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
  {
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (actual_mc/mr)*mr;
    if (lhsRowMajor)
    for(int i=0; i<peeled_mc; i+=mr)
      for(int k=0; k<actual_kc; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = lhs(i+w, k);
    for(int i=peeled_mc; i<actual_mc; i++)
    {
      for(int i=0; i<peeled_mc; i+=mr)
        for(int k=0; k<actual_kc; k++)
          for(int w=0; w<mr; w++)
            blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
      for(int i=peeled_mc; i<actual_mc; i++)
      {
        const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
        for(int k=0; k<actual_kc; k++)
          blockA[count++] = llhs[k];
      }
    }
    else
    {
      for(int i=0; i<peeled_mc; i+=mr)
        for(int k=0; k<actual_kc; k++)
          for(int w=0; w<mr; w++)
            blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
      for(int i=peeled_mc; i<actual_mc; i++)
        for(int k=0; k<actual_kc; k++)
          blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
      for(int k=0; k<actual_kc; k++)
        blockA[count++] = lhs(i, k);
    }
  }
};
@@ -450,15 +356,16 @@ struct ei_gemm_pack_lhs
template<typename Scalar, int PacketSize, int nr>
struct ei_gemm_pack_rhs
{
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols)
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols)
  {
    bool hasAlpha = alpha != Scalar(1);
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<actual_kc; k++)
@@ -491,7 +398,7 @@ struct ei_gemm_pack_rhs
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<actual_kc; k++)
@@ -25,61 +25,44 @@
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H

template<typename Scalar, int mr>
// pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, int mr, int StorageOrder>
struct ei_symm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2)
  void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc)
  {
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (actual_mc/mr)*mr;
    if (lhsRowMajor)
    for(int i=0; i<peeled_mc; i+=mr)
    {
      for(int i=0; i<peeled_mc; i+=mr)
        for(int k=0; k<actual_kc; k++)
          for(int w=0; w<mr; w++)
            blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
      for(int i=peeled_mc; i<actual_mc; i++)
      for(int k=0; k<i; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = lhs(i+w,k);
      // symmetric copy
      int h = 0;
      for(int k=i; k<i+mr; k++)
      {
        const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
        for(int k=0; k<actual_kc; k++)
          blockA[count++] = llhs[k];
        // transposed copy
        for(int w=0; w<h; w++)
          blockA[count++] = lhs(k, i+w);
        for(int w=h; w<mr; w++)
          blockA[count++] = lhs(i+w, k);
        ++h;
      }
      // transposed copy
      for(int k=i+mr; k<actual_kc; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = lhs(k, i+w);
    }
    else
    // do the same with mr==1
    for(int i=peeled_mc; i<actual_mc; i++)
    {
      for(int i=0; i<peeled_mc; i+=mr)
      {
        for(int k=0; k<i; k++)
          for(int w=0; w<mr; w++)
            blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];

        // symmetric copy
        int h = 0;
        for(int k=i; k<i+mr; k++)
        {
          // transposed copy
          for(int w=0; w<h; w++)
            blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
          for(int w=h; w<mr; w++)
            blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
          ++h;
        }

        // transposed copy
        for(int k=i+mr; k<actual_kc; k++)
          for(int w=0; w<mr; w++)
            blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
      }
      // do the same with mr==1
      for(int i=peeled_mc; i<actual_mc; i++)
      {
        for(int k=0; k<=i; k++)
          blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];

        // transposed copy
        for(int k=i+1; k<actual_kc; k++)
          blockA[count++] = lhs[(k2+k) + (i2+i)*lhsStride];
      }
      for(int k=0; k<=i; k++)
        blockA[count++] = lhs(i, k);
      // transposed copy
      for(int k=i+1; k<actual_kc; k++)
        blockA[count++] = lhs(k, i);
    }
  }
};
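A minimal, self-contained sketch (not Eigen code, not part of the commit) of the idea behind the mr==1 path of the new ei_symm_pack_lhs: when only one triangle of a self-adjoint block is stored, coefficients at or below the diagonal are read directly as lhs(i,k) while coefficients above it are mirrored from the transposed position lhs(k,i).

#include <cstdio>

int main()
{
  const int n = 4;
  // column-major storage; only the lower triangle is filled
  double lhs[n * n] = {};
  for (int j = 0; j < n; ++j)
    for (int i = j; i < n; ++i)
      lhs[i + j * n] = 10 * i + j;            // lhs(i,j) for i >= j

  auto at = [&](int i, int j) { return lhs[i + j * n]; };

  double blockA[n * n];
  int count = 0;
  for (int i = 0; i < n; ++i) {
    for (int k = 0; k <= i; ++k) blockA[count++] = at(i, k);     // direct copy
    for (int k = i + 1; k < n; ++k) blockA[count++] = at(k, i);  // transposed (mirrored) copy
  }

  for (int idx = 0; idx < count; ++idx) std::printf("%g ", blockA[idx]);
  std::printf("\n");
  return 0;
}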
@@ -90,51 +73,36 @@ struct ei_symm_pack_lhs
template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
  int size,
  const Scalar* lhs, int lhsStride,
  const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols,
  const Scalar* _lhs, int lhsStride,
  const Scalar* _rhs, int rhsStride, bool rhsRowMajor, int cols,
  Scalar* res, int resStride,
  Scalar alpha)
{
  typedef typename ei_packet_traits<Scalar>::type Packet;

  ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
  ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
  ei_const_blas_data_mapper<Scalar, ColMajor> rhs(_rhs,rhsStride);

  if (ConjugateRhs)
    alpha = ei_conj(alpha);
  bool hasAlpha = alpha != Scalar(1);

  typedef typename ei_packet_traits<Scalar>::type PacketType;

  const bool lhsRowMajor = (StorageOrder==RowMajor);

  enum {
    PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
    HalfRegisterCount = 4,
#else
    HalfRegisterCount = 8,
#endif
  typedef ei_product_blocking_traits<Scalar> Blocking;

    // register block size along the N direction
    nr = HalfRegisterCount/2,

    // register block size along the M direction
    mr = 2 * PacketSize,

    // max cache block size along the K direction
    Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,

    // max cache block size along the M direction
    Max_mc = 2*Max_kc
  };

  int kc = std::min<int>(Max_kc,size); // cache block size along the K direction
  int mc = std::min<int>(Max_mc,size); // cache block size along the M direction
  int kc = std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction
  int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction

  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

  // number of columns which can be processed by packet of nr columns
  int packet_cols = (cols/nr)*nr;
  int packet_cols = (cols/Blocking::nr)*Blocking::nr;

  ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize,
    Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;

  for(int k2=0; k2<size; k2+=kc)
  {
@@ -143,7 +111,8 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
    // we have selected one row panel of rhs and one column panel of lhs
    // pack rhs's panel into a sequential chunk of memory
    // and expand each coeff to a constant packet for further reuse
    ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols);
    ei_gemm_pack_rhs<Scalar,Blocking::PacketSize,Blocking::nr>()
      (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols);

    // the selected lhs's panel has to be split in three different parts:
    //  1 - the transposed panel above the diagonal block => transposed packed copy
@@ -153,32 +122,31 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix(
    {
      const int actual_mc = std::min(i2+mc,k2)-i2;
      // transposed packed copy
      ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2);
      ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder==RowMajor?ColMajor:RowMajor>()
        (blockA, &lhs(k2,i2), lhsStride, actual_kc, actual_mc);

      ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
        (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
      gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
    }
    // the block diagonal
    {
      const int actual_mc = std::min(k2+kc,size)-k2;
      // symmetric packed copy
      ei_symm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2);
      ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
        (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
      ei_symm_pack_lhs<Scalar,Blocking::mr,StorageOrder>()
        (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
      gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols);
    }

    for(int i2=k2+kc; i2<size; i2+=mc)
    {
      const int actual_mc = std::min(i2+mc,size)-i2;
      ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2);
      ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
        (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
      ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder>()
        (blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
      gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols);
    }
  }

  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
  ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
}

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
Eigen/src/Core/util/BlasUtil.h (new file, 157 lines)
@@ -0,0 +1,157 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_BLASUTIL_H
#define EIGEN_BLASUTIL_H

// This file contains many lightweight helper classes used to
// implement and control fast level 2 and level 3 BLAS-like routines.

// forward declarations
template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj>
struct ei_gebp_kernel;

template<typename Scalar, int PacketSize, int nr>
struct ei_gemm_pack_rhs;

template<typename Scalar, int mr, int StorageOrder>
struct ei_gemm_pack_lhs;

template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
  int RhsStorageOrder, bool ConjugateRhs,
  int ResStorageOrder>
struct ei_general_matrix_matrix_product;

template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename RhsType>
static void ei_cache_friendly_product_colmajor_times_vector(
  int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);

template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename ResType>
static void ei_cache_friendly_product_rowmajor_times_vector(
  const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha);

// Provides scalar/packet-wise product and product with accumulation
// with optional conjugation of the arguments.
template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper;

template<> struct ei_conj_helper<false,false>
{
  template<typename T>
  EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); }
  template<typename T>
  EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); }
};

template<> struct ei_conj_helper<false,true>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); }
};

template<> struct ei_conj_helper<true,false>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
};

template<> struct ei_conj_helper<true,true>
{
  template<typename T> std::complex<T>
  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
  { return c + pmul(x,y); }

  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
  { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
};
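A minimal check (not part of the commit) of what the specializations above compute for std::complex: ei_conj_helper<ConjLhs,ConjRhs>::pmul(x,y) is meant to equal (ConjLhs ? conj(x) : x) * (ConjRhs ? conj(y) : y). The snippet below verifies the <false,true> case by hand:

#include <complex>
#include <cstdio>

int main()
{
  const std::complex<double> x(1.0, 2.0), y(3.0, -4.0);

  // <false,true>: real = re(x)re(y) + im(x)im(y), imag = im(x)re(y) - re(x)im(y)
  std::complex<double> manual(x.real()*y.real() + x.imag()*y.imag(),
                              x.imag()*y.real() - x.real()*y.imag());
  std::complex<double> expected = x * std::conj(y);

  std::printf("manual   = (%g, %g)\n", manual.real(), manual.imag());    // (-5, 10)
  std::printf("expected = (%g, %g)\n", expected.real(), expected.imag()); // (-5, 10)
  return 0;
}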
// lightweight helper class to access matrix coefficients
template<typename Scalar, int StorageOrder>
class ei_blas_data_mapper
{
  public:
    ei_blas_data_mapper(Scalar* data, int stride) : m_data(data), m_stride(stride) {}
    EIGEN_STRONG_INLINE Scalar& operator()(int i, int j)
    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
  protected:
    Scalar* EIGEN_RESTRICT m_data;
    int m_stride;
};

// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, int StorageOrder>
class ei_const_blas_data_mapper
{
  public:
    ei_const_blas_data_mapper(const Scalar* data, int stride) : m_data(data), m_stride(stride) {}
    EIGEN_STRONG_INLINE const Scalar& operator()(int i, int j) const
    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
  protected:
    const Scalar* EIGEN_RESTRICT m_data;
    int m_stride;
};
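A minimal sketch (not Eigen code, not part of the commit) of the indexing rule both mappers use: (i,j) resolves to j + i*stride for row-major storage and to i + j*stride for column-major storage. The enum names below are illustrative stand-ins for Eigen's RowMajor/ColMajor constants.

#include <cstdio>

enum { MyColMajor = 0, MyRowMajor = 1 };   // stand-ins for Eigen's constants

template<int StorageOrder>
int linear_index(int i, int j, int stride)
{
  return StorageOrder == MyRowMajor ? j + i * stride : i + j * stride;
}

int main()
{
  // a 3x4 matrix: stride is 4 (row length) if row-major, 3 (column length) if column-major
  std::printf("row-major (1,2): %d\n", linear_index<MyRowMajor>(1, 2, 4));  // 6
  std::printf("col-major (1,2): %d\n", linear_index<MyColMajor>(1, 2, 3));  // 7
  return 0;
}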
//
// template <int L2MemorySize,typename Scalar>
// struct ei_L2_block_traits {
//   enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
// };

// Defines various constants controlling level 3 blocking
template<typename Scalar>
struct ei_product_blocking_traits
{
  typedef typename ei_packet_traits<Scalar>::type PacketType;
  enum {
    PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
    HalfRegisterCount = 4,
#else
    HalfRegisterCount = 8,
#endif

    // register block size along the N direction
    nr = HalfRegisterCount/2,

    // register block size along the M direction
    mr = 2 * PacketSize,

    // max cache block size along the K direction
    Max_kc = 8 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,

    // max cache block size along the M direction
    Max_mc = 2*Max_kc
  };
};
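A minimal sketch (not Eigen code, not part of the commit) of the numbers ei_product_blocking_traits produces for float, assuming 16-byte SIMD packets and a 1 MB EIGEN_TUNE_FOR_CPU_CACHE_SIZE; both values are illustrative assumptions, not taken from the commit.

#include <cmath>
#include <cstdio>

int main()
{
  const int cache_size   = 1 << 20;   // assumed L2 size in bytes
  const int packet_bytes = 16;        // assumed SIMD width

  const int packet_size = packet_bytes / (int)sizeof(float);                   // 4 floats
  const int nr     = 8 / 2;                                                    // HalfRegisterCount/2 on x86-64
  const int mr     = 2 * packet_size;
  const int max_kc = 8 * (int)std::sqrt((double)cache_size / (64 * sizeof(float)));
  const int max_mc = 2 * max_kc;

  std::printf("PacketSize=%d nr=%d mr=%d Max_kc=%d Max_mc=%d\n",
              packet_size, nr, mr, max_kc, max_mc);   // 4 4 8 512 1024
  return 0;
}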
#endif // EIGEN_BLASUTIL_H