mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-02-23 18:20:47 +08:00
implement a smarter parallelization strategy for gemm avoiding multiple
paking of the same data
This commit is contained in:
parent
a1e1103328
commit
3ac2b96a2f
@ -37,7 +37,8 @@
|
||||
template<typename Scalar, int mr, int nr, typename Conj>
|
||||
struct ei_gebp_kernel
|
||||
{
|
||||
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
|
||||
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols,
|
||||
int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0, Scalar* unpackedB = 0)
|
||||
{
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
@ -49,7 +50,8 @@ struct ei_gebp_kernel
|
||||
const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
|
||||
const int peeled_kc = (depth/4)*4;
|
||||
|
||||
Scalar* unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
|
||||
if(unpackedB==0)
|
||||
unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
|
||||
|
||||
// loops on each micro vertical panel of rhs (depth x nr)
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
|
@ -39,7 +39,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
|
||||
const Scalar* lhs, int lhsStride,
|
||||
const Scalar* rhs, int rhsStride,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
Scalar alpha,
|
||||
GemmParallelInfo* info = 0)
|
||||
{
|
||||
// transpose the product such that the result is column major
|
||||
ei_general_matrix_matrix_product<Scalar,
|
||||
@ -48,7 +49,7 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
|
||||
LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
|
||||
ConjugateLhs,
|
||||
ColMajor>
|
||||
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
|
||||
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
|
||||
}
|
||||
};
|
||||
|
||||
@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth,
|
||||
const Scalar* _lhs, int lhsStride,
|
||||
const Scalar* _rhs, int rhsStride,
|
||||
Scalar* res, int resStride,
|
||||
Scalar alpha)
|
||||
Scalar alpha,
|
||||
GemmParallelInfo* info = 0)
|
||||
{
|
||||
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
@ -75,9 +77,76 @@ static void run(int rows, int cols, int depth,
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||
|
||||
int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
|
||||
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
// int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
|
||||
// int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
|
||||
|
||||
int kc = std::min<int>(256,depth); // cache block size along the K direction
|
||||
int mc = std::min<int>(512,rows); // cache block size along the M direction
|
||||
|
||||
ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs;
|
||||
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs;
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
|
||||
|
||||
if(info)
|
||||
{
|
||||
// this is the parallel version!
|
||||
int tid = omp_get_thread_num();
|
||||
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
|
||||
Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
|
||||
Scalar* blockB = (Scalar*)info[tid].blockB;
|
||||
|
||||
// if you have the GOTO blas library you can try our parallelization strategy
|
||||
// using GOTO's optimized routines.
|
||||
// #define USEGOTOROUTINES
|
||||
#ifdef USEGOTOROUTINES
|
||||
void* u = alloca(4096+sizeW);
|
||||
#endif
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
// (==GEMM_VAR1)
|
||||
for(int k=0; k<depth; k+=kc)
|
||||
{
|
||||
#pragma omp barrier
|
||||
const int actual_kc = std::min(k+kc,depth)-k;
|
||||
|
||||
// pack B_k to B' in parallel fashion,
|
||||
// each thread packs the B_k,j sub block to B'_j where j is the thread id
|
||||
#ifndef USEGOTOROUTINES
|
||||
pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
|
||||
#else
|
||||
sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc);
|
||||
#endif
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
for(int i=0; i<rows; i+=mc)
|
||||
{
|
||||
const int actual_mc = std::min(i+mc,rows)-i;
|
||||
|
||||
// pack A_i,k to A'
|
||||
#ifndef USEGOTOROUTINES
|
||||
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
|
||||
#else
|
||||
sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA);
|
||||
#endif
|
||||
|
||||
// C_i += A' * B'
|
||||
#ifndef USEGOTOROUTINES
|
||||
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
|
||||
#else
|
||||
sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, w, sizeW);
|
||||
}
|
||||
else
|
||||
{
|
||||
// this is the sequential version!
|
||||
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
|
||||
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
|
||||
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
|
||||
@ -93,7 +162,8 @@ static void run(int rows, int cols, int depth,
|
||||
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the lhs's
|
||||
// vertical panel which is, in practice, a very low number.
|
||||
ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
|
||||
|
||||
|
||||
// For each mc x kc block of the lhs's vertical panel...
|
||||
// (==GEPP_VAR1)
|
||||
@ -104,18 +174,17 @@ static void run(int rows, int cols, int depth,
|
||||
// We pack the lhs's block into a sequential chunk of memory (L1 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro vertical panel of the large rhs's panel (e.g., cols/4 times).
|
||||
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
// Everything is packed, we can now call the block * panel kernel:
|
||||
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
|
||||
(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
|
||||
|
||||
// sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride);
|
||||
}
|
||||
}
|
||||
|
||||
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
|
||||
ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
@ -139,15 +208,16 @@ struct ei_gemm_functor
|
||||
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha)
|
||||
{}
|
||||
|
||||
void operator() (int col, int cols, int row=0, int rows=-1) const
|
||||
void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo* info=0) const
|
||||
{
|
||||
if(rows==-1)
|
||||
rows = m_lhs.rows();
|
||||
if(cols==-1)
|
||||
cols = m_rhs.cols();
|
||||
Gemm::run(rows, cols, m_lhs.cols(),
|
||||
(const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.stride(),
|
||||
(const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.stride(),
|
||||
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.stride(),
|
||||
m_actualAlpha);
|
||||
m_actualAlpha,
|
||||
info);
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -191,7 +261,9 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
|
||||
_ActualRhsType,
|
||||
Dest> Functor;
|
||||
|
||||
ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows());
|
||||
// ei_run_parallel_1d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows());
|
||||
// ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
|
||||
ei_run_parallel_gemm<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -25,6 +25,13 @@
|
||||
#ifndef EIGEN_PARALLELIZER_H
|
||||
#define EIGEN_PARALLELIZER_H
|
||||
|
||||
struct GemmParallelInfo
|
||||
{
|
||||
int rhs_start;
|
||||
int rhs_length;
|
||||
float* blockB;
|
||||
};
|
||||
|
||||
template<bool Parallelize,typename Functor>
|
||||
void ei_run_parallel_1d(const Functor& func, int size)
|
||||
{
|
||||
@ -53,14 +60,17 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)
|
||||
#ifndef EIGEN_HAS_OPENMP
|
||||
func(0,size1, 0,size2);
|
||||
#else
|
||||
if(!Parallelize)
|
||||
|
||||
int threads = omp_get_max_threads();
|
||||
if((!Parallelize)||(threads==1))
|
||||
return func(0,size1, 0,size2);
|
||||
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
||||
static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 11, 4, 13, 7, 5, 4};
|
||||
static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 1, 2, 3, 4};
|
||||
static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 1, 4, 1, 7, 5, 4};
|
||||
static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 11, 3, 13, 2, 3, 4};
|
||||
|
||||
|
||||
|
||||
int threads = omp_get_num_procs();
|
||||
ei_assert(threads<=16 && "too many threads !");
|
||||
int blockSize1 = size1 / divide1[threads];
|
||||
int blockSize2 = size2 / divide2[threads];
|
||||
@ -87,4 +97,44 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||
template<bool Parallelize,typename Functor>
|
||||
void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
|
||||
{
|
||||
#ifndef EIGEN_HAS_OPENMP
|
||||
func(0,size1, 0,size2);
|
||||
#else
|
||||
|
||||
int threads = omp_get_max_threads();
|
||||
if((!Parallelize)||(threads==1))
|
||||
return func(0,rows, 0,cols);
|
||||
|
||||
|
||||
int blockCols = (cols / threads) & ~0x3;
|
||||
int blockRows = (rows / threads) & ~0x7;
|
||||
|
||||
float* sharedBlockB = new float[2048*2048*4];
|
||||
|
||||
GemmParallelInfo* info = new GemmParallelInfo[threads];
|
||||
|
||||
#pragma omp parallel for schedule(static,1)
|
||||
for(int i=0; i<threads; ++i)
|
||||
{
|
||||
int r0 = i*blockRows;
|
||||
int actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
|
||||
|
||||
int c0 = i*blockCols;
|
||||
int actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
|
||||
|
||||
info[i].rhs_start = c0;
|
||||
info[i].rhs_length = actualBlockCols;
|
||||
info[i].blockB = sharedBlockB;
|
||||
|
||||
func(r0, actualBlockRows, 0,cols, info);
|
||||
}
|
||||
|
||||
delete[] sharedBlockB;
|
||||
delete[] info;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // EIGEN_PARALLELIZER_H
|
||||
|
@ -15,6 +15,52 @@ using namespace Eigen;
|
||||
typedef SCALAR Scalar;
|
||||
typedef Matrix<Scalar,Dynamic,Dynamic> M;
|
||||
|
||||
#ifdef HAVE_BLAS
|
||||
|
||||
extern "C" {
|
||||
#include <bench/btl/libs/C_BLAS/blas.h>
|
||||
|
||||
void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha,
|
||||
float* blockA, float* blockB, float* res, int resStride);
|
||||
void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
|
||||
void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
|
||||
}
|
||||
|
||||
static float fone = 1;
|
||||
static float fzero = 0;
|
||||
static double done = 1;
|
||||
static double szero = 0;
|
||||
static char notrans = 'N';
|
||||
static char trans = 'T';
|
||||
static char nonunit = 'N';
|
||||
static char lower = 'L';
|
||||
static char right = 'R';
|
||||
static int intone = 1;
|
||||
|
||||
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
|
||||
{
|
||||
int M = c.rows(); int N = c.cols(); int K = a.cols();
|
||||
int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
|
||||
|
||||
sgemm_(¬rans,¬rans,&M,&N,&K,&fone,
|
||||
const_cast<float*>(a.data()),&lda,
|
||||
const_cast<float*>(b.data()),&ldb,&fone,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
|
||||
{
|
||||
int M = c.rows(); int N = c.cols(); int K = a.cols();
|
||||
int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
|
||||
|
||||
dgemm_(¬rans,¬rans,&M,&N,&K,&done,
|
||||
const_cast<double*>(a.data()),&lda,
|
||||
const_cast<double*>(b.data()),&ldb,&done,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void gemm(const M& a, const M& b, M& c)
|
||||
{
|
||||
c.noalias() += a * b;
|
||||
@ -22,21 +68,42 @@ void gemm(const M& a, const M& b, M& c)
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
int rep = 1;
|
||||
int rep = 1; // number of repetitions per try
|
||||
int tries = 5; // number of tries, we keep the best
|
||||
|
||||
int s = 2048;
|
||||
int m = s;
|
||||
int n = s;
|
||||
int p = s;
|
||||
M a(m,n); a.setOnes();
|
||||
M b(n,p); b.setOnes();
|
||||
M a(m,n); a.setRandom();
|
||||
M b(n,p); b.setRandom();
|
||||
M c(m,p); c.setOnes();
|
||||
|
||||
BenchTimer t;
|
||||
|
||||
BENCH(t, 5, rep, gemm(a,b,c));
|
||||
M r = c;
|
||||
|
||||
std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
|
||||
std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
|
||||
// check the parallel product is correct
|
||||
#ifdef HAVE_BLAS
|
||||
blas_gemm(a,b,r);
|
||||
#else
|
||||
int procs = omp_get_max_threads();
|
||||
omp_set_num_threads(1);
|
||||
r.noalias() += a * b;
|
||||
omp_set_num_threads(procs);
|
||||
#endif
|
||||
c.noalias() += a * b;
|
||||
if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
|
||||
|
||||
#ifdef HAVE_BLAS
|
||||
BENCH(t, tries, rep, blas_gemm(a,b,c));
|
||||
std::cerr << "blas cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
|
||||
std::cerr << "blas real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
|
||||
#endif
|
||||
|
||||
BENCH(t, tries, rep, gemm(a,b,c));
|
||||
std::cerr << "eigen cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
|
||||
std::cerr << "eigen real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,83 +0,0 @@
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <bench/BenchTimer.h>
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include <bench/btl/libs/C_BLAS/blas.h>
|
||||
#include <cblas.h>
|
||||
}
|
||||
|
||||
using namespace std;
|
||||
using namespace Eigen;
|
||||
|
||||
#ifndef SCALAR
|
||||
#define SCALAR float
|
||||
#endif
|
||||
|
||||
typedef SCALAR Scalar;
|
||||
typedef Matrix<Scalar,Dynamic,Dynamic> M;
|
||||
|
||||
static float fone = 1;
|
||||
static float fzero = 0;
|
||||
static double done = 1;
|
||||
static double szero = 0;
|
||||
static char notrans = 'N';
|
||||
static char trans = 'T';
|
||||
static char nonunit = 'N';
|
||||
static char lower = 'L';
|
||||
static char right = 'R';
|
||||
static int intone = 1;
|
||||
|
||||
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
|
||||
{
|
||||
int M = c.rows();
|
||||
int N = c.cols();
|
||||
int K = a.cols();
|
||||
|
||||
int lda = a.rows();
|
||||
int ldb = b.rows();
|
||||
int ldc = c.rows();
|
||||
|
||||
sgemm_(¬rans,¬rans,&M,&N,&K,&fone,
|
||||
const_cast<float*>(a.data()),&lda,
|
||||
const_cast<float*>(b.data()),&ldb,&fone,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
|
||||
{
|
||||
int M = c.rows();
|
||||
int N = c.cols();
|
||||
int K = a.cols();
|
||||
|
||||
int lda = a.rows();
|
||||
int ldb = b.rows();
|
||||
int ldc = c.rows();
|
||||
|
||||
dgemm_(¬rans,¬rans,&M,&N,&K,&done,
|
||||
const_cast<double*>(a.data()),&lda,
|
||||
const_cast<double*>(b.data()),&ldb,&done,
|
||||
c.data(),&ldc);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int rep = 1;
|
||||
int s = 2048;
|
||||
int m = s;
|
||||
int n = s;
|
||||
int p = s;
|
||||
M a(m,n); a.setOnes();
|
||||
M b(n,p); b.setOnes();
|
||||
M c(m,p); c.setOnes();
|
||||
|
||||
BenchTimer t;
|
||||
|
||||
BENCH(t, 5, rep, blas_gemm(a,b,c));
|
||||
|
||||
std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
|
||||
std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user