* basic support for multicore CPU via a .evalOMP() which

internaly uses OpenMP if enabled at compile time. * added a bench/ folder with a couple benchmarks and benchmark tools.
2025-04-12 19:20:36 +08:00 · 2008-03-09 16:13:47 +00:00 · 2008-03-09 16:13:47 +00:00 · 9d9d81ad71
commit 9d9d81ad71
parent f64311e07d
18 changed files with 514 additions and 2 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -40,6 +40,7 @@ namespace Eigen {
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
+#include "src/Core/EvalOMP.h"

 } // namespace Eigen

--- a/Eigen/src/Core/EvalOMP.h
+++ b/Eigen/src/Core/EvalOMP.h
@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_EVAL_OMP_H
+#define EIGEN_EVAL_OMP_H
+
+/** \class EvalOMP
+  *
+  * \brief Parallel evaluation of an expression using OpenMP
+  *
+  * The template parameter Expression is the type of the expression that we are evaluating.
+  *
+  * This class is the return type of MatrixBase::evalOMP() and most of the time this is the
+  * only way it is used.
+  *
+  * Note that if OpenMP is not enabled, then this class is equivalent to Eval.
+  *
+  * \sa MatrixBase::evalOMP(), class Eval, MatrixBase::eval()
+  */
+template<typename ExpressionType> class EvalOMP : NoOperatorEquals,
+  public Matrix< typename ExpressionType::Scalar,
+                 ExpressionType::Traits::RowsAtCompileTime,
+                 ExpressionType::Traits::ColsAtCompileTime,
+                 EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
+                 ExpressionType::Traits::MaxRowsAtCompileTime,
+                 ExpressionType::Traits::MaxColsAtCompileTime>
+{
+  public:
+    typedef typename ExpressionType::Scalar Scalar;
+
+    /** The actual matrix type to evaluate to. This type can be used independently
+      * of the rest of this class to get the actual matrix type to evaluate and store
+      * the value of an expression.
+      */
+    typedef Matrix<Scalar,
+                   ExpressionType::Traits::RowsAtCompileTime,
+                   ExpressionType::Traits::ColsAtCompileTime,
+                   EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
+                   ExpressionType::Traits::MaxRowsAtCompileTime,
+                   ExpressionType::Traits::MaxColsAtCompileTime> MatrixType;
+
+    #ifdef _OPENMP
+    explicit EvalOMP(const ExpressionType& other)
+      : MatrixType(other.rows(), other.cols())
+    {
+      #ifdef __INTEL_COMPILER
+      #pragma omp parallel default(none) shared(other)
+      #else
+      #pragma omp parallel default(none)
+      #endif
+      {
+        if (this->cols()>this->rows())
+        {
+          #pragma omp for
+          for(int j = 0; j < this->cols(); j++)
+            for(int i = 0; i < this->rows(); i++)
+              this->coeffRef(i, j) = other.coeff(i, j);
+        }
+        else
+        {
+          #pragma omp for
+          for(int i = 0; i < this->rows(); i++)
+            for(int j = 0; j < this->cols(); j++)
+              this->coeffRef(i, j) = other.coeff(i, j);
+        }
+      }
+    }
+    #else
+    explicit EvalOMP(const ExpressionType& other) : MatrixType(other) {}
+    #endif
+};
+
+/** Evaluates *this in a parallel fashion using OpenMP and returns the obtained matrix.
+  *
+  * Of course, it only makes sense to call this function for complex expressions, and/or
+  * large matrices (>32x32), \b and if there is no outer loop which can be parallelized.
+  *
+  * It is the responsibility of the user manage the OpenMP parameters, for instance:
+  * \code
+  * #include <omp.h>
+  * // ...
+  * omp_set_num_threads(omp_get_num_procs());
+  * \endcode
+  * You also need to enable OpenMP on your compiler (e.g., -fopenmp) during both compilation and linking.
+  *
+  * Note that if OpenMP is not enabled, then evalOMP() is equivalent to eval().
+  *
+  * \sa class EvalOMP, eval()
+  */
+template<typename Scalar, typename Derived>
+const EvalOMP<Derived> MatrixBase<Scalar, Derived>::evalOMP() const
+{
+  return EvalOMP<Derived>(*static_cast<const Derived*>(this));
+}
+
+#endif // EIGEN_EVAL_OMP_H
--- a/Eigen/src/Core/ForwardDeclarations.h
+++ b/Eigen/src/Core/ForwardDeclarations.h
@ -44,6 +44,7 @@ template<typename MatrixType> class DiagonalCoeffs;
 template<typename MatrixType> class Identity;
 template<typename MatrixType> class Map;
 template<typename Derived> class Eval;
+template<typename Derived> class EvalOMP;

 struct ScalarProductOp;
 struct ScalarQuotientOp;
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -359,6 +359,7 @@ template<typename Scalar, typename Derived> class MatrixBase
    /// \name special functions
    //@{
    const Eval<Derived> eval() const EIGEN_ALWAYS_INLINE;
+    const EvalOMP<Derived> evalOMP() const EIGEN_ALWAYS_INLINE;

    template<typename CustomUnaryOp>
    const CwiseUnaryOp<CustomUnaryOp, Derived> cwise(const CustomUnaryOp& func = CustomUnaryOp()) const;
--- a/bench/BenchTimer.h
+++ b/bench/BenchTimer.h
@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_BENCH_TIMER_H
+#define EIGEN_BENCH_TIMER_H
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <cstdlib>
+
+namespace Eigen
+{
+
+/** Elapsed time timer keeping the best try.
+  */
+class BenchTimer
+{
+public:
+
+  BenchTimer() : m_best(1e99) {}
+
+  ~BenchTimer() {}
+
+  inline void start(void) {m_start = getTime();}
+  inline void stop(void)
+  {
+    m_best = std::min(m_best, getTime() - m_start);
+  }
+
+  /** Return the best elapsed time.
+    */
+  inline double value(void)
+  {
+      return m_best;
+  }
+
+  static inline double getTime(void)
+  {
+      struct timeval tv;
+      struct timezone tz;
+      gettimeofday(&tv, &tz);
+      return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
+  }
+
+protected:
+
+  double m_best, m_start;
+
+};
+
+}
+
+#endif // EIGEN_BENCH_TIMER_H
--- a/bench/BenchUtil.h
+++ b/bench/BenchUtil.h
@ -0,0 +1,28 @@
+
+#include <Eigen/Core>
+#include "BenchTimer.h"
+
+using namespace std;
+USING_PART_OF_NAMESPACE_EIGEN
+
+#include <boost/preprocessor/repetition/enum_params.hpp>
+#include <boost/preprocessor/repetition.hpp>
+#include <boost/preprocessor/seq.hpp>
+#include <boost/preprocessor/array.hpp>
+#include <boost/preprocessor/arithmetic.hpp>
+#include <boost/preprocessor/comparison.hpp>
+#include <boost/preprocessor/punctuation.hpp>
+#include <boost/preprocessor/punctuation/comma.hpp>
+#include <boost/preprocessor/stringize.hpp>
+
+template<typename MatrixType> void initMatrix_random(MatrixType& mat) __attribute__((noinline));
+template<typename MatrixType> void initMatrix_random(MatrixType& mat)
+{
+  mat.setRandom();// = MatrixType::random(mat.rows(), mat.cols());
+}
+
+template<typename MatrixType> void initMatrix_identity(MatrixType& mat) __attribute__((noinline));
+template<typename MatrixType> void initMatrix_identity(MatrixType& mat)
+{
+  mat.setIdentity();
+}
--- a/bench/README.txt
+++ b/bench/README.txt
@ -0,0 +1,55 @@
+
+This folder contains a couple of benchmark utities and Eigen benchmarks.
+
+****************************
+* bench_multi_compilers.sh *
+****************************
+
+This script allows to run a benchmark on a set of different compilers/compiler options.
+It takes two arguments:
+ - a file defining the list of the compilers with their options
+ - the .cpp file of the benchmark
+
+Examples:
+
+$ ./bench_multi_compilers.sh basicbench.cxxlist basicbenchmark.cpp
+
+    g++-4.1 -O3 -DNDEBUG -finline-limit=10000
+    3d-3x3   /   4d-4x4   /   Xd-4x4   /   Xd-20x20   /
+    0.271102   0.131416   0.422322   0.198633
+    0.201658   0.102436   0.397566   0.207282
+
+    g++-4.2 -O3 -DNDEBUG -finline-limit=10000
+    3d-3x3   /   4d-4x4   /   Xd-4x4   /   Xd-20x20   /
+    0.107805   0.0890579   0.30265   0.161843
+    0.127157   0.0712581   0.278341   0.191029
+
+    g++-4.3 -O3 -DNDEBUG -finline-limit=10000
+    3d-3x3   /   4d-4x4   /   Xd-4x4   /   Xd-20x20   /
+    0.134318   0.105291   0.3704   0.180966
+    0.137703   0.0732472   0.31225   0.202204
+
+    icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size
+    3d-3x3   /   4d-4x4   /   Xd-4x4   /   Xd-20x20   /
+    0.226145   0.0941319   0.371873   0.159433
+    0.109302   0.0837538   0.328102   0.173891
+
+
+$ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp
+
+    g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp
+    double, fixed-size 4x4: 0.00165105s  0.0778739s
+    double, 32x32: 0.0654769s 0.075289s  => x0.869674 (2)
+    double, 128x128: 0.054148s 0.0419669s  => x1.29025 (2)
+    double, 512x512: 0.913799s 0.428533s  => x2.13239 (2)
+    double, 1024x1024: 14.5972s 9.3542s  => x1.5605 (2)
+
+    icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp
+    double, fixed-size 4x4: 0.000589848s  0.019949s
+    double, 32x32: 0.0682781s 0.0449722s  => x1.51823 (2)
+    double, 128x128: 0.0547509s 0.0435519s  => x1.25714 (2)
+    double, 512x512: 0.829436s 0.424438s  => x1.9542 (2)
+    double, 1024x1024: 14.5243s 10.7735s  => x1.34815 (2)
+
+
+
--- a/bench/basicbench.cxxlist
+++ b/bench/basicbench.cxxlist
@ -0,0 +1,12 @@
+#!/bin/bash
+
+CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size"
--- a/bench/basicbenchmark.cpp
+++ b/bench/basicbenchmark.cpp
@ -0,0 +1,46 @@
+
+#include "BenchUtil.h"
+#include "basicbenchmark.h"
+
+int main(int argc, char *argv[])
+{
+  // disbale floating point exceptions
+  // this leads to more stable bench results
+  // (this is done by default by ICC)
+  #ifndef __INTEL_COMPILER
+  {
+    int aux;
+    asm(
+    "stmxcsr   %[aux]           \n\t"
+    "orl       $32832, %[aux]   \n\t"
+    "ldmxcsr   %[aux]           \n\t"
+    : : [aux] "m" (aux));
+  }
+  #endif
+
+  // this is the list of matrix type and size we want to bench:
+  // ((suffix) (matrix size) (number of iterations))
+  #define MODES ((3d)(3)(4000000)) ((4d)(4)(1000000)) ((Xd)(4)(1000000)) ((Xd)(20)(10000))
+//   #define MODES ((Xd)(20)(10000))
+
+  #define _GENERATE_HEADER(R,ARG,EL) << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_HEAD(EL)) << "-" \
+    << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << "x" \
+    << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << "   /   "
+
+  std::cout BOOST_PP_SEQ_FOR_EACH(_GENERATE_HEADER, ~, MODES ) << endl;
+
+  const int tries = 10;
+
+  #define _RUN_BENCH(R,ARG,EL) \
+    std::cout << ARG( \
+      BOOST_PP_CAT(Matrix, BOOST_PP_SEQ_HEAD(EL)) (\
+         BOOST_PP_SEQ_ELEM(1,EL),BOOST_PP_SEQ_ELEM(1,EL)), BOOST_PP_SEQ_ELEM(2,EL), tries) \
+    << "   ";
+
+  BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<LazyEval>, MODES );
+  std::cout << endl;
+  BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<EarlyEval>, MODES );
+  std::cout << endl;
+
+  return 0;
+}
--- a/bench/basicbenchmark.h
+++ b/bench/basicbenchmark.h
@ -0,0 +1,59 @@
+
+enum {LazyEval, EarlyEval, OmpEval};
+
+template<int Mode, typename MatrixType>
+void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) __attribute__((noinline));
+
+template<int Mode, typename MatrixType>
+void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations)
+{
+  for(int a = 0; a < iterations; a++)
+  {
+    if (Mode==LazyEval)
+    {
+      asm("#begin_bench_loop LazyEval");
+      if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
+    }
+    else if (Mode==OmpEval)
+    {
+      asm("#begin_bench_loop OmpEval");
+      if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
+    }
+    else
+    {
+      asm("#begin_bench_loop EarlyEval");
+      if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+      m = I + 0.00005 * (m + m * m);
+    }
+    asm("#end_bench_loop");
+  }
+}
+
+template<int Mode, typename MatrixType>
+double benchBasic(const MatrixType& mat, int size, int tries) __attribute__((noinline));
+
+template<int Mode, typename MatrixType>
+double benchBasic(const MatrixType& mat, int iterations, int tries)
+{
+  const int rows = mat.rows();
+  const int cols = mat.cols();
+
+  MatrixType I(rows,cols);
+  MatrixType m(rows,cols);
+
+  initMatrix_identity(I);
+
+  Eigen::BenchTimer timer;
+  for(uint t=0; t<tries; ++t)
+  {
+    initMatrix_random(m);
+    timer.start();
+    benchBasic_loop<Mode>(I, m, iterations);
+    timer.stop();
+    cerr << m;
+  }
+  return timer.value();
+};
+
--- a/bench/bench_multi_compilers.sh
+++ b/bench/bench_multi_compilers.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+
+if (($# < 2)); then
+    echo "Usage: $0 compilerlist.txt benchfile.cpp"
+else
+
+compilerlist=$1
+benchfile=$2
+
+g=0
+source $compilerlist
+
+# for each compiler, compile benchfile and run the benchmark
+for (( i=0 ; i<g ; ++i )) ; do
+  # check the compiler exists
+  compiler=`echo ${CLIST[$i]} | cut -d " " -f 1`
+  if [ -e `which $compiler` ]; then
+    echo "${CLIST[$i]}"
+#     echo "${CLIST[$i]} $benchfile -I.. -o bench~"
+    if [ -e ./.bench ] ; then rm .bench; fi
+    ${CLIST[$i]} $benchfile -I.. -o .bench && ./.bench 2> /dev/null
+    echo ""
+  else
+    echo "compiler not found: $compiler"
+  fi
+done
+
+fi
--- a/bench/bench_unrolling
+++ b/bench/bench_unrolling
--- a/bench/benchmark.cpp
+++ b/bench/benchmark.cpp
--- a/bench/benchmarkX.cpp
+++ b/bench/benchmarkX.cpp
@ -13,7 +13,7 @@ int main(int argc, char *argv[])
 	{
 		m(i,j) = 0.1 * (i+20*j);
 	}
-	for(int a = 0; a < 1000000; a++)
+	for(int a = 0; a < 100000; a++)
 	{
 		m = I + 0.00005 * (m + m*m);
 	}
--- a/bench/benchmark_suite
+++ b/bench/benchmark_suite
--- a/bench/ompbench.cxxlist
+++ b/bench/ompbench.cxxlist
@ -0,0 +1,7 @@
+#!/bin/bash
+
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
+
+# CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
+
+CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp"
--- a/bench/ompbenchmark.cpp
+++ b/bench/ompbenchmark.cpp
@ -0,0 +1,81 @@
+// g++ -O3 -DNDEBUG -I.. -fopenmp benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
+// icpc -fast -fno-exceptions -DNDEBUG -I.. -openmp  benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
+
+#include <omp.h>
+#include "BenchUtil.h"
+#include "basicbenchmark.h"
+
+// #include <Eigen/Core>
+// #include "BenchTimer.h"
+//
+// using namespace std;
+// USING_PART_OF_NAMESPACE_EIGEN
+//
+// enum {LazyEval, EarlyEval, OmpEval};
+//
+// template<int Mode, typename MatrixType>
+// double benchSingleProc(const MatrixType& mat, int iterations, int tries)  __attribute__((noinline));
+//
+// template<int Mode, typename MatrixType>
+// double benchBasic(const MatrixType& mat, int iterations, int tries)
+// {
+//   const int rows = mat.rows();
+//   const int cols = mat.cols();
+//
+//   Eigen::BenchTimer timer;
+//   for(uint t=0; t<tries; ++t)
+//   {
+//     MatrixType I = MatrixType::identity(rows, cols);
+//     MatrixType m = MatrixType::random(rows, cols);
+//
+//     timer.start();
+//     for(int a = 0; a < iterations; a++)
+//     {
+//       if(Mode==LazyEval)
+//         m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
+//       else if(Mode==OmpEval)
+//         m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
+//       else
+//         m = I + 0.00005 * (m + m * m);
+//     }
+//     timer.stop();
+//     cerr << m;
+//   }
+//   return timer.value();
+// };
+
+int main(int argc, char *argv[])
+{
+  // disbale floating point exceptions
+  // this leads to more stable bench results
+  {
+    int aux;
+    asm(
+    "stmxcsr   %[aux]           \n\t"
+    "orl       $32832, %[aux]   \n\t"
+    "ldmxcsr   %[aux]           \n\t"
+    : : [aux] "m" (aux));
+  }
+
+  // commented since the default setting is use as many threads as processors
+  //omp_set_num_threads(omp_get_num_procs());
+
+  std::cout << "double, fixed-size 4x4: "
+    << benchBasic<LazyEval>(Matrix4d(), 10000, 10) << "s  "
+    << benchBasic<OmpEval>(Matrix4d(), 10000, 10) << "s  \n";
+
+  #define BENCH_MATRIX(TYPE, SIZE, ITERATIONS, TRIES) {\
+      double single = benchBasic<LazyEval>(Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
+      double omp    = benchBasic<OmpEval> (Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
+      std::cout << #TYPE << ", " << #SIZE << "x" << #SIZE << ": " << single << "s " << omp << "s " \
+        << " => x" << single/omp << " (" << omp_get_num_procs() << ")" << std::endl; \
+    }
+
+  BENCH_MATRIX(double,   32, 1000, 10);
+  BENCH_MATRIX(double,  128,   10, 10);
+  BENCH_MATRIX(double,  512,    1,  6);
+  BENCH_MATRIX(double, 1024,    1,  4);
+
+  return 0;
+}
+
--- a/doc/Mainpage.dox
+++ b/doc/Mainpage.dox
@ -75,7 +75,7 @@ Eigen is well tested with recent versions of GCC and ICC. Both GCC 4.2 and ICC g

 For best performance, we recommend the following compilation flags:
 <ul>
-  <li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000 \c -falign-functions=64 </li>
+  <li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000  </li>
  <li>\b ICC: \c -O3 \c -DNDEBUG \c -xT \c -ipo \c -no-prec-div \c -no-inline-max-size </li>
 </ul>