* basic support for multicore CPU via a .evalOMP() which

internaly uses OpenMP if enabled at compile time.
 * added a bench/ folder with a couple benchmarks and benchmark tools.
This commit is contained in:
Gael Guennebaud 2008-03-09 16:13:47 +00:00
parent f64311e07d
commit 9d9d81ad71
18 changed files with 514 additions and 2 deletions

View File

@ -40,6 +40,7 @@ namespace Eigen {
#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
#include "src/Core/EvalOMP.h"
} // namespace Eigen

118
Eigen/src/Core/EvalOMP.h Normal file
View File

@ -0,0 +1,118 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_EVAL_OMP_H
#define EIGEN_EVAL_OMP_H
/** \class EvalOMP
*
* \brief Parallel evaluation of an expression using OpenMP
*
* The template parameter Expression is the type of the expression that we are evaluating.
*
* This class is the return type of MatrixBase::evalOMP() and most of the time this is the
* only way it is used.
*
* Note that if OpenMP is not enabled, then this class is equivalent to Eval.
*
* \sa MatrixBase::evalOMP(), class Eval, MatrixBase::eval()
*/
template<typename ExpressionType> class EvalOMP : NoOperatorEquals,
public Matrix< typename ExpressionType::Scalar,
ExpressionType::Traits::RowsAtCompileTime,
ExpressionType::Traits::ColsAtCompileTime,
EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
ExpressionType::Traits::MaxRowsAtCompileTime,
ExpressionType::Traits::MaxColsAtCompileTime>
{
public:
typedef typename ExpressionType::Scalar Scalar;
/** The actual matrix type to evaluate to. This type can be used independently
* of the rest of this class to get the actual matrix type to evaluate and store
* the value of an expression.
*/
typedef Matrix<Scalar,
ExpressionType::Traits::RowsAtCompileTime,
ExpressionType::Traits::ColsAtCompileTime,
EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
ExpressionType::Traits::MaxRowsAtCompileTime,
ExpressionType::Traits::MaxColsAtCompileTime> MatrixType;
#ifdef _OPENMP
explicit EvalOMP(const ExpressionType& other)
: MatrixType(other.rows(), other.cols())
{
#ifdef __INTEL_COMPILER
#pragma omp parallel default(none) shared(other)
#else
#pragma omp parallel default(none)
#endif
{
if (this->cols()>this->rows())
{
#pragma omp for
for(int j = 0; j < this->cols(); j++)
for(int i = 0; i < this->rows(); i++)
this->coeffRef(i, j) = other.coeff(i, j);
}
else
{
#pragma omp for
for(int i = 0; i < this->rows(); i++)
for(int j = 0; j < this->cols(); j++)
this->coeffRef(i, j) = other.coeff(i, j);
}
}
}
#else
explicit EvalOMP(const ExpressionType& other) : MatrixType(other) {}
#endif
};
/** Evaluates *this in a parallel fashion using OpenMP and returns the obtained matrix.
*
* Of course, it only makes sense to call this function for complex expressions, and/or
* large matrices (>32x32), \b and if there is no outer loop which can be parallelized.
*
* It is the responsibility of the user manage the OpenMP parameters, for instance:
* \code
* #include <omp.h>
* // ...
* omp_set_num_threads(omp_get_num_procs());
* \endcode
* You also need to enable OpenMP on your compiler (e.g., -fopenmp) during both compilation and linking.
*
* Note that if OpenMP is not enabled, then evalOMP() is equivalent to eval().
*
* \sa class EvalOMP, eval()
*/
template<typename Scalar, typename Derived>
const EvalOMP<Derived> MatrixBase<Scalar, Derived>::evalOMP() const
{
return EvalOMP<Derived>(*static_cast<const Derived*>(this));
}
#endif // EIGEN_EVAL_OMP_H

View File

@ -44,6 +44,7 @@ template<typename MatrixType> class DiagonalCoeffs;
template<typename MatrixType> class Identity;
template<typename MatrixType> class Map;
template<typename Derived> class Eval;
template<typename Derived> class EvalOMP;
struct ScalarProductOp;
struct ScalarQuotientOp;

View File

@ -359,6 +359,7 @@ template<typename Scalar, typename Derived> class MatrixBase
/// \name special functions
//@{
const Eval<Derived> eval() const EIGEN_ALWAYS_INLINE;
const EvalOMP<Derived> evalOMP() const EIGEN_ALWAYS_INLINE;
template<typename CustomUnaryOp>
const CwiseUnaryOp<CustomUnaryOp, Derived> cwise(const CustomUnaryOp& func = CustomUnaryOp()) const;

75
bench/BenchTimer.h Normal file
View File

@ -0,0 +1,75 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_BENCH_TIMER_H
#define EIGEN_BENCH_TIMER_H
#include <sys/time.h>
#include <unistd.h>
#include <cstdlib>
namespace Eigen
{
/** Elapsed time timer keeping the best try.
*/
class BenchTimer
{
public:
BenchTimer() : m_best(1e99) {}
~BenchTimer() {}
inline void start(void) {m_start = getTime();}
inline void stop(void)
{
m_best = std::min(m_best, getTime() - m_start);
}
/** Return the best elapsed time.
*/
inline double value(void)
{
return m_best;
}
static inline double getTime(void)
{
struct timeval tv;
struct timezone tz;
gettimeofday(&tv, &tz);
return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
}
protected:
double m_best, m_start;
};
}
#endif // EIGEN_BENCH_TIMER_H

28
bench/BenchUtil.h Normal file
View File

@ -0,0 +1,28 @@
#include <Eigen/Core>
#include "BenchTimer.h"
using namespace std;
USING_PART_OF_NAMESPACE_EIGEN
#include <boost/preprocessor/repetition/enum_params.hpp>
#include <boost/preprocessor/repetition.hpp>
#include <boost/preprocessor/seq.hpp>
#include <boost/preprocessor/array.hpp>
#include <boost/preprocessor/arithmetic.hpp>
#include <boost/preprocessor/comparison.hpp>
#include <boost/preprocessor/punctuation.hpp>
#include <boost/preprocessor/punctuation/comma.hpp>
#include <boost/preprocessor/stringize.hpp>
template<typename MatrixType> void initMatrix_random(MatrixType& mat) __attribute__((noinline));
template<typename MatrixType> void initMatrix_random(MatrixType& mat)
{
mat.setRandom();// = MatrixType::random(mat.rows(), mat.cols());
}
template<typename MatrixType> void initMatrix_identity(MatrixType& mat) __attribute__((noinline));
template<typename MatrixType> void initMatrix_identity(MatrixType& mat)
{
mat.setIdentity();
}

55
bench/README.txt Normal file
View File

@ -0,0 +1,55 @@
This folder contains a couple of benchmark utities and Eigen benchmarks.
****************************
* bench_multi_compilers.sh *
****************************
This script allows to run a benchmark on a set of different compilers/compiler options.
It takes two arguments:
- a file defining the list of the compilers with their options
- the .cpp file of the benchmark
Examples:
$ ./bench_multi_compilers.sh basicbench.cxxlist basicbenchmark.cpp
g++-4.1 -O3 -DNDEBUG -finline-limit=10000
3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
0.271102 0.131416 0.422322 0.198633
0.201658 0.102436 0.397566 0.207282
g++-4.2 -O3 -DNDEBUG -finline-limit=10000
3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
0.107805 0.0890579 0.30265 0.161843
0.127157 0.0712581 0.278341 0.191029
g++-4.3 -O3 -DNDEBUG -finline-limit=10000
3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
0.134318 0.105291 0.3704 0.180966
0.137703 0.0732472 0.31225 0.202204
icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size
3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
0.226145 0.0941319 0.371873 0.159433
0.109302 0.0837538 0.328102 0.173891
$ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp
g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp
double, fixed-size 4x4: 0.00165105s 0.0778739s
double, 32x32: 0.0654769s 0.075289s => x0.869674 (2)
double, 128x128: 0.054148s 0.0419669s => x1.29025 (2)
double, 512x512: 0.913799s 0.428533s => x2.13239 (2)
double, 1024x1024: 14.5972s 9.3542s => x1.5605 (2)
icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp
double, fixed-size 4x4: 0.000589848s 0.019949s
double, 32x32: 0.0682781s 0.0449722s => x1.51823 (2)
double, 128x128: 0.0547509s 0.0435519s => x1.25714 (2)
double, 512x512: 0.829436s 0.424438s => x1.9542 (2)
double, 1024x1024: 14.5243s 10.7735s => x1.34815 (2)

12
bench/basicbench.cxxlist Normal file
View File

@ -0,0 +1,12 @@
#!/bin/bash
CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG"
CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG -finline-limit=20000"
CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG"
CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000"
CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG"
CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000"
CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size"

46
bench/basicbenchmark.cpp Normal file
View File

@ -0,0 +1,46 @@
#include "BenchUtil.h"
#include "basicbenchmark.h"
int main(int argc, char *argv[])
{
// disbale floating point exceptions
// this leads to more stable bench results
// (this is done by default by ICC)
#ifndef __INTEL_COMPILER
{
int aux;
asm(
"stmxcsr %[aux] \n\t"
"orl $32832, %[aux] \n\t"
"ldmxcsr %[aux] \n\t"
: : [aux] "m" (aux));
}
#endif
// this is the list of matrix type and size we want to bench:
// ((suffix) (matrix size) (number of iterations))
#define MODES ((3d)(3)(4000000)) ((4d)(4)(1000000)) ((Xd)(4)(1000000)) ((Xd)(20)(10000))
// #define MODES ((Xd)(20)(10000))
#define _GENERATE_HEADER(R,ARG,EL) << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_HEAD(EL)) << "-" \
<< BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << "x" \
<< BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << " / "
std::cout BOOST_PP_SEQ_FOR_EACH(_GENERATE_HEADER, ~, MODES ) << endl;
const int tries = 10;
#define _RUN_BENCH(R,ARG,EL) \
std::cout << ARG( \
BOOST_PP_CAT(Matrix, BOOST_PP_SEQ_HEAD(EL)) (\
BOOST_PP_SEQ_ELEM(1,EL),BOOST_PP_SEQ_ELEM(1,EL)), BOOST_PP_SEQ_ELEM(2,EL), tries) \
<< " ";
BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<LazyEval>, MODES );
std::cout << endl;
BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<EarlyEval>, MODES );
std::cout << endl;
return 0;
}

59
bench/basicbenchmark.h Normal file
View File

@ -0,0 +1,59 @@
enum {LazyEval, EarlyEval, OmpEval};
template<int Mode, typename MatrixType>
void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) __attribute__((noinline));
template<int Mode, typename MatrixType>
void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations)
{
for(int a = 0; a < iterations; a++)
{
if (Mode==LazyEval)
{
asm("#begin_bench_loop LazyEval");
if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
}
else if (Mode==OmpEval)
{
asm("#begin_bench_loop OmpEval");
if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
}
else
{
asm("#begin_bench_loop EarlyEval");
if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
m = I + 0.00005 * (m + m * m);
}
asm("#end_bench_loop");
}
}
template<int Mode, typename MatrixType>
double benchBasic(const MatrixType& mat, int size, int tries) __attribute__((noinline));
template<int Mode, typename MatrixType>
double benchBasic(const MatrixType& mat, int iterations, int tries)
{
const int rows = mat.rows();
const int cols = mat.cols();
MatrixType I(rows,cols);
MatrixType m(rows,cols);
initMatrix_identity(I);
Eigen::BenchTimer timer;
for(uint t=0; t<tries; ++t)
{
initMatrix_random(m);
timer.start();
benchBasic_loop<Mode>(I, m, iterations);
timer.stop();
cerr << m;
}
return timer.value();
};

28
bench/bench_multi_compilers.sh Executable file
View File

@ -0,0 +1,28 @@
#!/bin/bash
if (($# < 2)); then
echo "Usage: $0 compilerlist.txt benchfile.cpp"
else
compilerlist=$1
benchfile=$2
g=0
source $compilerlist
# for each compiler, compile benchfile and run the benchmark
for (( i=0 ; i<g ; ++i )) ; do
# check the compiler exists
compiler=`echo ${CLIST[$i]} | cut -d " " -f 1`
if [ -e `which $compiler` ]; then
echo "${CLIST[$i]}"
# echo "${CLIST[$i]} $benchfile -I.. -o bench~"
if [ -e ./.bench ] ; then rm .bench; fi
${CLIST[$i]} $benchfile -I.. -o .bench && ./.bench 2> /dev/null
echo ""
else
echo "compiler not found: $compiler"
fi
done
fi

View File

@ -13,7 +13,7 @@ int main(int argc, char *argv[])
{
m(i,j) = 0.1 * (i+20*j);
}
for(int a = 0; a < 1000000; a++)
for(int a = 0; a < 100000; a++)
{
m = I + 0.00005 * (m + m*m);
}

7
bench/ompbench.cxxlist Normal file
View File

@ -0,0 +1,7 @@
#!/bin/bash
CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
# CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp"

81
bench/ompbenchmark.cpp Normal file
View File

@ -0,0 +1,81 @@
// g++ -O3 -DNDEBUG -I.. -fopenmp benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
// icpc -fast -fno-exceptions -DNDEBUG -I.. -openmp benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
#include <omp.h>
#include "BenchUtil.h"
#include "basicbenchmark.h"
// #include <Eigen/Core>
// #include "BenchTimer.h"
//
// using namespace std;
// USING_PART_OF_NAMESPACE_EIGEN
//
// enum {LazyEval, EarlyEval, OmpEval};
//
// template<int Mode, typename MatrixType>
// double benchSingleProc(const MatrixType& mat, int iterations, int tries) __attribute__((noinline));
//
// template<int Mode, typename MatrixType>
// double benchBasic(const MatrixType& mat, int iterations, int tries)
// {
// const int rows = mat.rows();
// const int cols = mat.cols();
//
// Eigen::BenchTimer timer;
// for(uint t=0; t<tries; ++t)
// {
// MatrixType I = MatrixType::identity(rows, cols);
// MatrixType m = MatrixType::random(rows, cols);
//
// timer.start();
// for(int a = 0; a < iterations; a++)
// {
// if(Mode==LazyEval)
// m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
// else if(Mode==OmpEval)
// m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
// else
// m = I + 0.00005 * (m + m * m);
// }
// timer.stop();
// cerr << m;
// }
// return timer.value();
// };
int main(int argc, char *argv[])
{
// disbale floating point exceptions
// this leads to more stable bench results
{
int aux;
asm(
"stmxcsr %[aux] \n\t"
"orl $32832, %[aux] \n\t"
"ldmxcsr %[aux] \n\t"
: : [aux] "m" (aux));
}
// commented since the default setting is use as many threads as processors
//omp_set_num_threads(omp_get_num_procs());
std::cout << "double, fixed-size 4x4: "
<< benchBasic<LazyEval>(Matrix4d(), 10000, 10) << "s "
<< benchBasic<OmpEval>(Matrix4d(), 10000, 10) << "s \n";
#define BENCH_MATRIX(TYPE, SIZE, ITERATIONS, TRIES) {\
double single = benchBasic<LazyEval>(Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
double omp = benchBasic<OmpEval> (Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
std::cout << #TYPE << ", " << #SIZE << "x" << #SIZE << ": " << single << "s " << omp << "s " \
<< " => x" << single/omp << " (" << omp_get_num_procs() << ")" << std::endl; \
}
BENCH_MATRIX(double, 32, 1000, 10);
BENCH_MATRIX(double, 128, 10, 10);
BENCH_MATRIX(double, 512, 1, 6);
BENCH_MATRIX(double, 1024, 1, 4);
return 0;
}

View File

@ -75,7 +75,7 @@ Eigen is well tested with recent versions of GCC and ICC. Both GCC 4.2 and ICC g
For best performance, we recommend the following compilation flags:
<ul>
<li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000 \c -falign-functions=64 </li>
<li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000 </li>
<li>\b ICC: \c -O3 \c -DNDEBUG \c -xT \c -ipo \c -no-prec-div \c -no-inline-max-size </li>
</ul>