diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 0a7b07645..2feca365a 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -31,10 +31,10 @@ #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD 1 -#endif +#endif #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE -#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128 +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*256*256) /* parenthesized so the value expands safely inside larger expressions */ #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 @@ -153,7 +153,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1(const int& from) { +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; Packet4i vc = vec_ld(0, ai); diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h index 1ab2877b6..150c4bdc7 100644 --- a/Eigen/src/Core/arch/Default/Settings.h +++ b/Eigen/src/Core/arch/Default/Settings.h @@ -52,7 +52,7 @@ * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes */ #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE -#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256) +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*512*512) #endif /** Defines the maximal width of the blocks used in the triangular product and solver diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 96c75101c..d4dd33322 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -32,7 +32,7 @@ #endif #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE -#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96 +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (4*192*192) /* parenthesized so the value expands safely inside larger expressions */ #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h 
b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ca3e4eaf3..dc6c2ebf3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,11 +25,100 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H +/** \internal */ +inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, int scalar_size = 0) +{ + const int nbScalarSizes = 12; + static std::ptrdiff_t m_maxK[nbScalarSizes]; + static std::ptrdiff_t m_maxM[nbScalarSizes]; + static std::ptrdiff_t m_cpuCacheSize = 0; + if(m_cpuCacheSize==0) + { + // initialization + m_cpuCacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE; + ei_manage_caching_sizes(SetAction,&m_cpuCacheSize); + } + + if(action==SetAction && scalar_size==0) + { + // set the cpu cache size and cache all block sizes from a global cache size in byte + ei_internal_assert(a!=0 && b==0); + m_cpuCacheSize = *a; + int ss = 4; + for(int i=0; i>2)-1,0); + if(i>2),1),nbScalarSizes)-1; + *a = m_maxK[i]; + *b = m_maxM[i]; + } + else + { + ei_internal_assert(false); + } +} + +/** \returns the currently set cpu cache size (in bytes) used to estimate the ideal blocking size parameters. Declared inline because it is defined in a header: a non-inline definition would be emitted by every translation unit including it. */ +inline std::ptrdiff_t ei_cpuCacheSize() +{ + std::ptrdiff_t ret; + ei_manage_caching_sizes(GetAction, &ret); + return ret; +} + +/** Set the cpu cache size (in bytes) for blocking. + * This function also automatically sets the blocking size parameters for each scalar type using the following formula: + * \code + * max_k = 4 * sqrt(cache_size/(64*sizeof(Scalar))); + * max_m = 2 * max_k; + * \endcode + * overwriting custom values set using the ei_setBlockingSizes function. + * \sa ei_setBlockingSizes */ +inline void ei_setCpuCacheSize(std::ptrdiff_t cache_size) { ei_manage_caching_sizes(SetAction,&cache_size); } + +/** Set the blocking size parameters \a maxK and \a maxM for the scalar type \a Scalar. + * Note that in practice there is no distinction between scalar types of same size. 
+ * \sa ei_setCpuCacheSize */ +template +void ei_setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM) +{ + ei_manage_caching_sizes(SetAction,&maxK,&maxM,sizeof(Scalar)); +} + +/** \returns in \a maxK, \a maxM the blocking size parameters for the scalar type \a Scalar. + * \sa ei_setBlockingSizes */ +template +void ei_getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM) +{ + ei_manage_caching_sizes(GetAction,&maxK,&maxM,sizeof(Scalar)); +} + #ifdef EIGEN_HAS_FUSE_CJMADD -#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); + #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #else -#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); -// #define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); + #define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 457173382..3086616f8 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -75,8 +75,11 @@ static void run(Index rows, Index cols, Index depth, typedef typename ei_packet_traits::type PacketType; typedef ei_product_blocking_traits Blocking; - Index kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction - Index mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction + Index kc; // cache block size along the K direction + Index mc; // cache block size along the M direction + ei_getBlockingSizes(kc, mc); + kc = std::min(kc,depth); + mc = std::min(mc,rows); ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs; @@ -235,7 +238,9 @@ struct ei_gemm_functor Index sharedBlockBSize() const { - return std::min(ei_product_blocking_traits::Max_kc,m_rhs.rows()) * m_rhs.cols(); + std::ptrdiff_t maxKc, maxMc; /* same type as the reference parameters of ei_getBlockingSizes; an int cannot bind to std::ptrdiff_t& */ + ei_getBlockingSizes(maxKc,maxMc); + return std::min(maxKc,m_rhs.rows()) * m_rhs.cols(); } protected: diff --git 
a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 89c094d31..24d27bce2 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -139,7 +139,7 @@ struct ei_product_blocking_traits mr = 2 * PacketSize, // max cache block size along the K direction - Max_kc = 8 * ei_meta_sqrt::ret, + Max_kc = 4 * ei_meta_sqrt::ret, // max cache block size along the M direction Max_mc = 2*Max_kc diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index a586f2a5d..a36e7d05b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -269,6 +269,8 @@ namespace Architecture enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct }; +enum Action {GetAction, SetAction}; /* selector passed to ei_manage_caching_sizes: GetAction reads the stored cache size / blocking sizes, SetAction overwrites them */ + /** The type used to identify a dense storage. */ struct Dense {};