slightly optimize computeProductBlockingSizes by explicitly precomputing what is known at compile time

This commit is contained in:
Gael Guennebaud 2010-06-22 11:10:38 +02:00
parent 3ae0eee0b8
commit fd9a9fa0ae
2 changed files with 18 additions and 9 deletions

View File

@ -93,7 +93,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
* \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
*
* Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
* this function computes the blocking size parameters along the respective dimensions
* this function computes the blocking size parameters along the respective dimensions
* for matrix products and related algorithms. The blocking sizes depends on various
* parameters:
* - the L1 and L2 cache sizes,
@ -112,11 +112,18 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
// i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
// stay in L1 cache.
std::ptrdiff_t l1, l2;
enum {
kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
mr = ei_product_blocking_traits<LhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
};
ei_manage_caching_sizes(GetAction, &l1, &l2);
k = std::min<std::ptrdiff_t>(k, l1/(2 * ei_product_blocking_traits<RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar)));
std::ptrdiff_t _m = l2/(4 * k * sizeof(LhsScalar));
if(_m<m) m = (_m/ei_product_blocking_traits<LhsScalar>::mr) * ei_product_blocking_traits<LhsScalar>::mr;
k = std::min<std::ptrdiff_t>(k, l1/kdiv);
std::ptrdiff_t _m = l2/(4 * sizeof(LhsScalar) * k);
if(_m<m) m = _m & mr_mask;
n = n;
}

View File

@ -2,8 +2,8 @@
// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
#include <Eigen/Core>
#include <iostream>
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace std;
@ -70,8 +70,6 @@ int main(int argc, char ** argv)
std::cout << "L1 cache size = " << (l1>0 ? l1/1024 : -1) << " KB\n";
std::cout << "L2/L3 cache size = " << (l2>0 ? l2/1024 : -1) << " KB\n";
setCpuCacheSizes(ei_queryL1CacheSize()/1,ei_queryTopLevelCacheSize()/2);
int rep = 1; // number of repetitions per try
int tries = 2; // number of tries, we keep the best
@ -85,13 +83,17 @@ int main(int argc, char ** argv)
s = atoi(argv[i]+1);
else if(argv[i][0]=='c')
cache_size = atoi(argv[i]+1);
else if(argv[i][0]=='t')
tries = atoi(argv[i]+1);
else if(argv[i][0]=='p')
rep = atoi(argv[i]+1);
else
need_help = true;
}
if(need_help)
{
std::cout << argv[0] << " s<matrix size> c<cache size> \n";
std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
return 1;
}