mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-09 07:00:27 +08:00
Add a proof-of-concept API to configure the blocking parameters at runtime.
After validation of the final API I'll update the other products to use it.
This commit is contained in:
parent
7726cc8a29
commit
88cd6885be
@ -31,10 +31,10 @@
|
||||
|
||||
#ifndef EIGEN_HAS_FUSE_CJMADD
|
||||
#define EIGEN_HAS_FUSE_CJMADD 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*256*256
|
||||
#endif
|
||||
|
||||
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
|
||||
@ -153,7 +153,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) {
|
||||
return vc;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) {
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) {
|
||||
int EIGEN_ALIGN16 ai[4];
|
||||
ai[0] = from;
|
||||
Packet4i vc = vec_ld(0, ai);
|
||||
|
@ -52,7 +52,7 @@
|
||||
* Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes
|
||||
*/
|
||||
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*512*512)
|
||||
#endif
|
||||
|
||||
/** Defines the maximal width of the blocks used in the triangular product and solver
|
||||
|
@ -32,7 +32,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96
|
||||
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*192*192
|
||||
#endif
|
||||
|
||||
// FIXME NEON has 16 quad registers, but since the current register allocator
|
||||
|
@ -25,11 +25,100 @@
|
||||
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
#define EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
|
||||
/** \internal */
|
||||
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, int scalar_size = 0)
|
||||
{
|
||||
const int nbScalarSizes = 12;
|
||||
static std::ptrdiff_t m_maxK[nbScalarSizes];
|
||||
static std::ptrdiff_t m_maxM[nbScalarSizes];
|
||||
static std::ptrdiff_t m_cpuCacheSize = 0;
|
||||
if(m_cpuCacheSize==0)
|
||||
{
|
||||
// initialization
|
||||
m_cpuCacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
|
||||
ei_manage_caching_sizes(SetAction,&m_cpuCacheSize);
|
||||
}
|
||||
|
||||
if(action==SetAction && scalar_size==0)
|
||||
{
|
||||
// set the cpu cache size and cache all block sizes from a global cache size in byte
|
||||
ei_internal_assert(a!=0 && b==0);
|
||||
m_cpuCacheSize = *a;
|
||||
int ss = 4;
|
||||
for(int i=0; i<nbScalarSizes;++i,ss+=4)
|
||||
{
|
||||
m_maxK[i] = 4 * std::ptrdiff_t(std::sqrt(std::ptrdiff_t(m_cpuCacheSize/(64*ss))));
|
||||
m_maxM[i] = 2 * m_maxK[i];
|
||||
}
|
||||
}
|
||||
else if(action==SetAction && scalar_size!=0)
|
||||
{
|
||||
// set the block sizes for the given scalar type (represented as its size)
|
||||
ei_internal_assert(a!=0 && b!=0);
|
||||
int i = std::max((scalar_size>>2)-1,0);
|
||||
if(i<nbScalarSizes)
|
||||
{
|
||||
m_maxK[i] = *a;
|
||||
m_maxM[i] = *b;
|
||||
}
|
||||
}
|
||||
else if(action==GetAction && scalar_size==0)
|
||||
{
|
||||
ei_internal_assert(a!=0 && b==0);
|
||||
*a = m_cpuCacheSize;
|
||||
}
|
||||
else if(action==GetAction && scalar_size!=0)
|
||||
{
|
||||
ei_internal_assert(a!=0 && b!=0);
|
||||
int i = std::min(std::max((scalar_size>>2),1),nbScalarSizes)-1;
|
||||
*a = m_maxK[i];
|
||||
*b = m_maxM[i];
|
||||
}
|
||||
else
|
||||
{
|
||||
ei_internal_assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
/** \returns the currently set cpu cache size (in bytes) used to estimate the ideal blocking size parameters */
|
||||
std::ptrdiff_t ei_cpuCacheSize()
|
||||
{
|
||||
std::ptrdiff_t ret;
|
||||
ei_manage_caching_sizes(GetAction, &ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Set the cpu cache size (in bytes) for blocking.
|
||||
* This function also automatically set the blocking size parameters for each scalar type using the following formula:
|
||||
* \code
|
||||
* max_k = 4 * sqrt(cache_size/(64*sizeof(Scalar)));
|
||||
* max_m = 2 * k;
|
||||
* \endcode
|
||||
* overwriting custom values set using the ei_setBlockingSizes function.
|
||||
* \sa ei_setBlockingSizes */
|
||||
void ei_setCpuCacheSize(std::ptrdiff_t cache_size) { ei_manage_caching_sizes(SetAction,&cache_size); }
|
||||
|
||||
/** Set the blocking size parameters \a maxK and \a maxM for the scalar type \a Scalar.
|
||||
* Note that in practice there is no distinction between scalar types of same size.
|
||||
* \sa ei_setCpuCacheSize */
|
||||
template<typename Scalar>
|
||||
void ei_setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM)
|
||||
{
|
||||
ei_manage_caching_sizes(SetAction,&maxK,&maxM,sizeof(Scalar));
|
||||
}
|
||||
|
||||
/** \returns in \a makK, \a maxM the blocking size parameters for the scalar type \a Scalar.
|
||||
* \sa ei_setBlockingSizes */
|
||||
template<typename Scalar>
|
||||
void ei_getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM)
|
||||
{
|
||||
ei_manage_caching_sizes(GetAction,&maxK,&maxM,sizeof(Scalar));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_FUSE_CJMADD
|
||||
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
|
||||
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
|
||||
#else
|
||||
#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
|
||||
// #define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
|
||||
#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
|
||||
#endif
|
||||
|
||||
// optimized GEneral packed Block * packed Panel product kernel
|
||||
|
@ -75,8 +75,11 @@ static void run(Index rows, Index cols, Index depth,
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||
|
||||
Index kc = std::min<Index>(Blocking::Max_kc,depth); // cache block size along the K direction
|
||||
Index mc = std::min<Index>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
Index kc; // cache block size along the K direction
|
||||
Index mc; // cache block size along the M direction
|
||||
ei_getBlockingSizes<Scalar>(kc, mc);
|
||||
kc = std::min<Index>(kc,depth);
|
||||
mc = std::min<Index>(mc,rows);
|
||||
|
||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
|
||||
@ -235,7 +238,9 @@ struct ei_gemm_functor
|
||||
|
||||
/** \returns min(max K block size for Scalar, m_rhs.rows()) * m_rhs.cols(),
  * using the blocking sizes configured at runtime. */
Index sharedBlockBSize() const
{
  // ei_getBlockingSizes() takes std::ptrdiff_t references; int lvalues cannot bind
  // to them when int and std::ptrdiff_t are distinct types, so use the exact type.
  std::ptrdiff_t maxKc = 0;
  std::ptrdiff_t maxMc = 0; // required by the getter, unused here
  ei_getBlockingSizes<Scalar>(maxKc,maxMc);
  return std::min<Index>(maxKc,m_rhs.rows()) * m_rhs.cols();
}
|
||||
|
||||
protected:
|
||||
|
@ -139,7 +139,7 @@ struct ei_product_blocking_traits
|
||||
mr = 2 * PacketSize,
|
||||
|
||||
// max cache block size along the K direction
|
||||
Max_kc = 8 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,
|
||||
Max_kc = 4 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,
|
||||
|
||||
// max cache block size along the M direction
|
||||
Max_mc = 2*Max_kc
|
||||
|
@ -269,6 +269,8 @@ namespace Architecture
|
||||
|
||||
enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
|
||||
|
||||
enum Action {GetAction, SetAction};
|
||||
|
||||
/** The type used to identify a dense storage. */
|
||||
struct Dense {};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user