diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a7ea46c6..ccb234a64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,14 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON) IF(CMAKE_COMPILER_IS_GNUCXX) IF(CMAKE_SYSTEM_NAME MATCHES Linux) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wnon-virtual-dtor -Wno-long-long -ansi -Wundef -Wcast-align -Wchar-subscripts -Wall -W -Wpointer-arith -Wwrite-strings -Wformat-security -fno-exceptions -fno-check-new -fno-common -fstrict-aliasing") + IF(TEST_OPENMP) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + MESSAGE("Enabling OpenMP in tests/examples") + ENDIF(TEST_OPENMP) + IF(TEST_SSE2) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") + MESSAGE("Enabling SSE2 in tests/examples") + ENDIF(TEST_SSE2) ENDIF(CMAKE_SYSTEM_NAME MATCHES Linux) ENDIF(CMAKE_COMPILER_IS_GNUCXX) diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 53d6250f3..37ad2c0cf 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -28,7 +28,7 @@ #define EIGEN_ASSIGN_H template -struct ei_matrix_operator_equals_unroller +struct ei_matrix_assignment_unroller { enum { col = (UnrollCount-1) / Derived1::RowsAtCompileTime, @@ -37,13 +37,13 @@ struct ei_matrix_operator_equals_unroller static void run(Derived1 &dst, const Derived2 &src) { - ei_matrix_operator_equals_unroller::run(dst, src); + ei_matrix_assignment_unroller::run(dst, src); dst.coeffRef(row, col) = src.coeff(row, col); } }; template -struct ei_matrix_operator_equals_unroller +struct ei_matrix_assignment_unroller { static void run(Derived1 &dst, const Derived2 &src) { @@ -53,13 +53,13 @@ struct ei_matrix_operator_equals_unroller // prevent buggy user code from causing an infinite recursion template -struct ei_matrix_operator_equals_unroller +struct ei_matrix_assignment_unroller { static void run(Derived1 &, const Derived2 &) {} }; template -struct ei_matrix_operator_equals_unroller +struct ei_matrix_assignment_unroller { static void run(Derived1 &, const Derived2 &) {} }; @@ -67,7 +67,7 @@ struct ei_matrix_operator_equals_unroller //---- template -struct ei_matrix_operator_equals_packet_unroller +struct ei_matrix_assignment_packet_unroller { enum { row = Derived1::Flags&RowMajorBit ? Index / Derived1::ColsAtCompileTime : Index % Derived1::RowsAtCompileTime, @@ -76,14 +76,14 @@ struct ei_matrix_operator_equals_packet_unroller static void run(Derived1 &dst, const Derived2 &src) { - ei_matrix_operator_equals_packet_unroller::size>::run(dst, src); dst.writePacketCoeff(row, col, src.packetCoeff(row, col)); } }; template -struct ei_matrix_operator_equals_packet_unroller +struct ei_matrix_assignment_packet_unroller { static void run(Derived1 &dst, const Derived2 &src) { @@ -92,58 +92,22 @@ struct ei_matrix_operator_equals_packet_unroller }; template -struct ei_matrix_operator_equals_packet_unroller +struct ei_matrix_assignment_packet_unroller { - static void run(Derived1 &, const Derived2 &) { ei_internal_assert(false && "ei_matrix_operator_equals_packet_unroller"); } -}; - -//---- - -template -struct ei_vector_operator_equals_unroller -{ - enum { index = UnrollCount - 1 }; - - static void run(Derived1 &dst, const Derived2 &src) - { - ei_vector_operator_equals_unroller::run(dst, src); - dst.coeffRef(index) = src.coeff(index); - } -}; - -// prevent buggy user code from causing an infinite recursion -template -struct ei_vector_operator_equals_unroller -{ - static void run(Derived1 &, const Derived2 &) {} -}; - -template -struct ei_vector_operator_equals_unroller -{ - static void run(Derived1 &dst, const Derived2 &src) - { - dst.coeffRef(0) = src.coeff(0); - } -}; - -template -struct ei_vector_operator_equals_unroller -{ - static void run(Derived1 &, const Derived2 &) {} + static void run(Derived1 &, const Derived2 &) { ei_internal_assert(false && "ei_matrix_assignment_packet_unroller"); } }; template -struct ei_operator_equals_impl; +struct ei_assignment_impl; template template Derived& MatrixBase ::lazyAssign(const MatrixBase& other) { - ei_operator_equals_impl::execute(derived(),other.derived()); + ei_assignment_impl::execute(derived(),other.derived()); return derived(); } @@ -152,125 +116,27 @@ template Derived& MatrixBase ::operator=(const MatrixBase& other) { + const bool need_to_transpose = Derived::IsVectorAtCompileTime + && OtherDerived::IsVectorAtCompileTime + && (int)Derived::RowsAtCompileTime != (int)OtherDerived::RowsAtCompileTime; if(OtherDerived::Flags & EvalBeforeAssigningBit) { - return lazyAssign(other.derived().eval()); + if(need_to_transpose) + return lazyAssign(other.transpose().eval()); + else + return lazyAssign(other.eval()); } else - return lazyAssign(other.derived()); + { + if(need_to_transpose) + return lazyAssign(other.transpose()); + else + return lazyAssign(other.derived()); + } } template -struct ei_operator_equals_impl -{ - static void execute(Derived & dst, const OtherDerived & src) - { - const bool unroll = Derived::SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT; - if(Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime) - // copying a vector expression into a vector - { - ei_assert(dst.size() == src.size()); - if(unroll) - ei_vector_operator_equals_unroller - ::run(dst.derived(), src.derived()); - else - { - #ifdef EIGEN_USE_OPENMPf - if(Derived::Flags & OtherDerived::Flags & LargeBit) - { - #ifdef __INTEL_COMPILER - #pragma omp parallel default(none) shared(other) - #else - #pragma omp parallel default(none) - #endif - { - #pragma omp for - for(int i = 0; i < dst.size(); i++) - dst.coeffRef(i) = src.coeff(i); - } - } - else - #endif // EIGEN_USE_OPENMP - { - for(int i = 0; i < dst.size(); i++) - dst.coeffRef(i) = src.coeff(i); - } - } - } - else // copying a matrix expression into a matrix - { - ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); - if(unroll) - { - ei_matrix_operator_equals_unroller - ::run(dst.derived(), src.derived()); - } - else - { - if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic) - { - #ifdef EIGEN_USE_OPENMP - if(Derived::Flags & OtherDerived::Flags & LargeBit) - { - #ifdef __INTEL_COMPILER - #pragma omp parallel default(none) shared(other) - #else - #pragma omp parallel default(none) - #endif - { - #pragma omp for - for(int j = 0; j < dst.cols(); j++) - for(int i = 0; i < dst.rows(); i++) - dst.coeffRef(i, j) = src.coeff(i, j); - } - } - else - #endif // EIGEN_USE_OPENMP - { - // traverse in column-major order - for(int j = 0; j < dst.cols(); j++) - for(int i = 0; i < dst.rows(); i++) - dst.coeffRef(i, j) = src.coeff(i, j); - } - } - else - { - #ifdef EIGEN_USE_OPENMP - if(Derived::Flags & OtherDerived::Flags & LargeBit) - { - #ifdef __INTEL_COMPILER - #pragma omp parallel default(none) shared(other) - #else - #pragma omp parallel default(none) - #endif - { - #pragma omp for - for(int i = 0; i < dst.rows(); i++) - for(int j = 0; j < dst.cols(); j++) - dst.coeffRef(i, j) = src.coeff(i, j); - } - } - else - #endif // EIGEN_USE_OPENMP - { - // traverse in row-major order - // in order to allow the compiler to unroll the inner loop - for(int i = 0; i < dst.rows(); i++) - for(int j = 0; j < dst.cols(); j++) - dst.coeffRef(i, j) = src.coeff(i, j); - } - } - } - } - } -}; - -template -struct ei_operator_equals_impl +struct ei_assignment_impl { static void execute(Derived & dst, const OtherDerived & src) { @@ -278,7 +144,47 @@ struct ei_operator_equals_impl ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); if(unroll) { - ei_matrix_operator_equals_packet_unroller + ei_matrix_assignment_unroller + ::run(dst.derived(), src.derived()); + } + else + { + if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic) + { + #define EIGEN_THE_PARALLELIZABLE_LOOP \ + for(int j = 0; j < dst.cols(); j++) \ + for(int i = 0; i < dst.rows(); i++) \ + dst.coeffRef(i, j) = src.coeff(i, j); + EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit) + #undef EIGEN_THE_PARALLELIZABLE_LOOP + } + else + { + // traverse in row-major order + // in order to allow the compiler to unroll the inner loop + #define EIGEN_THE_PARALLELIZABLE_LOOP \ + for(int i = 0; i < dst.rows(); i++) \ + for(int j = 0; j < dst.cols(); j++) \ + dst.coeffRef(i, j) = src.coeff(i, j); + EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit) + #undef EIGEN_THE_PARALLELIZABLE_LOOP + } + } + } +}; + +template +struct ei_assignment_impl +{ + static void execute(Derived & dst, const OtherDerived & src) + { + const bool unroll = Derived::SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT; + ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + if(unroll) + { + ei_matrix_assignment_packet_unroller =ei_packet_traits::size ? Derived::SizeAtCompileTime-ei_packet_traits::size @@ -288,15 +194,21 @@ struct ei_operator_equals_impl { if(OtherDerived::Flags&RowMajorBit) { - for(int i = 0; i < dst.rows(); i++) - for(int j = 0; j < dst.cols(); j+=ei_packet_traits::size) + #define EIGEN_THE_PARALLELIZABLE_LOOP \ + for(int i = 0; i < dst.rows(); i++) \ + for(int j = 0; j < dst.cols(); j+=ei_packet_traits::size) \ dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit) + #undef EIGEN_THE_PARALLELIZABLE_LOOP } else { - for(int j = 0; j < dst.cols(); j++) - for(int i = 0; i < dst.rows(); i+=ei_packet_traits::size) + #define EIGEN_THE_PARALLELIZABLE_LOOP \ + for(int j = 0; j < dst.cols(); j++) \ + for(int i = 0; i < dst.rows(); i+=ei_packet_traits::size) \ dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit) + #undef EIGEN_THE_PARALLELIZABLE_LOOP } } } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index bb7c254c2..6771b5d30 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -283,64 +283,70 @@ void Product::_cacheOptimalEval(DestDerived& res) const #ifdef EIGEN_VECTORIZE if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) ) { - for(int k=0; k::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); - const typename ei_packet_traits::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); - const typename ei_packet_traits::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); - const typename ei_packet_traits::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); - for (int i=0; i::size) - { - res.writePacketCoeff(i,k, - ei_padd( - res.packetCoeff(i,k), - ei_padd( - ei_padd( - ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), - ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), - ei_padd( - ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), - ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) - ) - ) - ) - ); - } + #define EIGEN_THE_PARALLELIZABLE_LOOP \ + for(int k=0; kcols(); k++) \ + { \ + int j=0; \ + for(; j::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); \ + const typename ei_packet_traits::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); \ + const typename ei_packet_traits::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); \ + const typename ei_packet_traits::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); \ + for (int i=0; irows(); i+=ei_packet_traits::size) \ + { \ + res.writePacketCoeff(i,k,\ + ei_padd( \ + res.packetCoeff(i,k), \ + ei_padd( \ + ei_padd( \ + ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), \ + ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), \ + ei_padd( \ + ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), \ + ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) \ + ) \ + ) \ + ) \ + ); \ + } \ + } \ + for(; j::type tmp = ei_pset1(m_rhs.coeff(j,k)); \ + for (int i=0; irows(); ++i) \ + res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); \ + } \ } - for(; j::type tmp = ei_pset1(m_rhs.coeff(j,k)); - for (int i=0; icols(); ++k) \ + { \ + int j=0; \ + for(; jrows(); ++i) \ + res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) \ + + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); \ + } \ + for(; jrows(); ++i) \ + res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); \ + } \ } - for(; j Eigen::MatrixBase::eval() const' diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp index 99fd097ba..39ed317aa 100644 --- a/bench/benchmark.cpp +++ b/bench/benchmark.cpp @@ -18,12 +18,11 @@ USING_PART_OF_NAMESPACE_EIGEN int main(int argc, char *argv[]) { - Matrix I; + Matrix I = Matrix::ones(); Matrix m; for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < MATSIZE; j++) { - I(i,j) = (i==j); m(i,j) = (i+MATSIZE*j); } asm("#begin"); diff --git a/bench/benchmarkX.cpp b/bench/benchmarkX.cpp index 590f2636b..a269f695a 100644 --- a/bench/benchmarkX.cpp +++ b/bench/benchmarkX.cpp @@ -1,33 +1,32 @@ -// g++ -O3 -DNDEBUG benchmarkX.cpp -o benchmarkX && time ./benchmarkX - +// g++ -fopenmp -I .. -O3 -DNDEBUG -finline-limit=1000 benchmarkX.cpp -o b && time ./b #include using namespace std; USING_PART_OF_NAMESPACE_EIGEN #ifndef MATTYPE -#define MATTYPE MatrixXd +#define MATTYPE MatrixXLd #endif #ifndef MATSIZE -#define MATSIZE 20 +#define MATSIZE 400 #endif #ifndef REPEAT -#define REPEAT 100000 +#define REPEAT 100 #endif int main(int argc, char *argv[]) { - MATTYPE I = MATTYPE::identity(MATSIZE,MATSIZE); + MATTYPE I = MATTYPE::ones(MATSIZE,MATSIZE); MATTYPE m(MATSIZE,MATSIZE); for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < MATSIZE; j++) { - m(i,j) = 0.1 * (i+MATSIZE*j)/MATSIZE; + m(i,j) = (i+j+1)/(MATSIZE*MATSIZE); } for(int a = 0; a < REPEAT; a++) { - m = I + 0.00005 * (m + m*m); + m = I + 0.0001 * (m + m*m); } cout << m(0,0) << endl; return 0; diff --git a/bench/benchmarkXL.cpp b/bench/benchmarkXcwise.cpp similarity index 85% rename from bench/benchmarkXL.cpp rename to bench/benchmarkXcwise.cpp index ff128da06..dd29743cd 100644 --- a/bench/benchmarkXL.cpp +++ b/bench/benchmarkXcwise.cpp @@ -19,11 +19,11 @@ USING_PART_OF_NAMESPACE_EIGEN int main(int argc, char *argv[]) { - MATTYPE I = MATTYPE::identity(MATSIZE,MATSIZE); + MATTYPE I = MATTYPE::ones(MATSIZE,MATSIZE); MATTYPE m(MATSIZE,MATSIZE); for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < MATSIZE; j++) { - m(i,j) = 0.1 * (i+MATSIZE*j)/MATSIZE; + m(i,j) = 0.1 * (i+j+1)/(MATSIZE*MATSIZE); } for(int a = 0; a < REPEAT; a++) { diff --git a/test/product.cpp b/test/product.cpp index ba0c55da6..a6b6a537d 100644 --- a/test/product.cpp +++ b/test/product.cpp @@ -100,7 +100,7 @@ void EigenTest::testProduct() } // test a large matrix only once - product(Matrix()); + product(MatrixXf(100,100)); } } // namespace Eigen