* Start of the LU module, with matrix inversion already there and

fully optimized. * Even if LargeBit is set, only parallelize for large enough objects (controlled by EIGEN_PARALLELIZATION_TRESHOLD).
2025-04-12 19:20:36 +08:00 · 2008-04-14 08:20:24 +00:00 · 2008-04-14 08:20:24 +00:00 · ea3ccb1e8c
commit ea3ccb1e8c
parent ab4046970b
14 changed files with 360 additions and 23 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -10,7 +10,7 @@
 #endif
 #endif

-#ifndef EIGEN_DONT_USE_OPENMP
+#ifndef EIGEN_DONT_PARALLELIZE
 #ifdef _OPENMP
 #define EIGEN_USE_OPENMP
 #include <omp.h>
--- a/Eigen/LU
+++ b/Eigen/LU
@ -0,0 +1,12 @@
+#ifndef EIGEN_LU_H
+#define EIGEN_LU_H
+
+#include "Core"
+
+namespace Eigen {
+
+#include "Eigen/src/LU/Inverse.h"
+
+} // namespace Eigen
+
+#endif // EIGEN_LU_H
--- a/Eigen/src/CMakeLists.txt
+++ b/Eigen/src/CMakeLists.txt
@ -1 +1,2 @@
 ADD_SUBDIRECTORY(Core)
+ADD_SUBDIRECTORY(LU)
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -135,6 +135,11 @@ Derived& MatrixBase<Derived>
  }
 }

+template<typename T1, typename T2> bool ei_should_parallelize_assignment(const T1& t, const T2&)
+{
+  return (T1::Flags & T2::Flags & LargeBit) && t.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
+}
+
 template <typename Derived, typename OtherDerived>
 struct ei_assignment_impl<Derived, OtherDerived, false>
 {
@ -157,7 +162,7 @@ struct ei_assignment_impl<Derived, OtherDerived, false>
            for(int j = 0; j < dst.cols(); j++) \
              for(int i = 0; i < dst.rows(); i++) \
                dst.coeffRef(i, j) = src.coeff(i, j);
-        EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit)
+        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
      else
@ -168,7 +173,7 @@ struct ei_assignment_impl<Derived, OtherDerived, false>
          for(int i = 0; i < dst.rows(); i++) \
            for(int j = 0; j < dst.cols(); j++) \
              dst.coeffRef(i, j) = src.coeff(i, j);
-        EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit)
+        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
    }
@ -198,7 +203,7 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
        for(int i = 0; i < dst.rows(); i++) \
          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) \
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
-        EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit)
+        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
      else
@ -207,7 +212,7 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
        for(int j = 0; j < dst.cols(); j++) \
          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) \
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
-        EIGEN_RUN_PARALLELIZABLE_LOOP(Derived::Flags & OtherDerived::Flags & LargeBit)
+        EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src))
        #undef EIGEN_THE_PARALLELIZABLE_LOOP
      }
    }
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@ -67,11 +67,11 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
      : (BlockRows==Dynamic ? MatrixType::MaxRowsAtCompileTime : BlockRows),
    MaxColsAtCompileTime = ColsAtCompileTime == 1 ? 1
      : (BlockCols==Dynamic ? MatrixType::MaxColsAtCompileTime : BlockCols),
-    FlagsLargeBit = ((RowsAtCompileTime != Dynamic && MatrixType::RowsAtCompileTime == Dynamic)
-                  || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
-                  ? ~LargeBit
-                  : ~(unsigned int)0,
-    Flags = MatrixType::Flags & FlagsLargeBit & ~VectorizableBit,
+    FlagsMaskLargeBit = ((RowsAtCompileTime != Dynamic && MatrixType::RowsAtCompileTime == Dynamic)
+                      || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
+                      ? ~LargeBit
+                      : ~(unsigned int)0,
+    Flags = MatrixType::Flags & FlagsMaskLargeBit & ~VectorizableBit,
    CoeffReadCost = MatrixType::CoeffReadCost
  };
 };
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@ -95,11 +95,8 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _Flags, _MaxRows, _MaxCols> >
  };
 };

-template<typename _Scalar, int _Rows, int _Cols,
-         unsigned int _Flags = EIGEN_DEFAULT_MATRIX_FLAGS,
-         int _MaxRows = _Rows, int _MaxCols = _Cols>
-class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols,
-                                        _Flags, _MaxRows, _MaxCols> >
+template<typename _Scalar, int _Rows, int _Cols, unsigned int _Flags, int _MaxRows, int _MaxCols>
+class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Flags, _MaxRows, _MaxCols> >
 {
  public:

--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -506,6 +506,15 @@ template<typename Derived> class MatrixBase
    { return *static_cast<Derived*>(const_cast<MatrixBase*>(this)); }
    //@}

+    /** \name LU module
+      *
+      * \code #include <Eigen/LU> \endcode
+      */
+    //@{
+    const Inverse<Derived, true> inverse() const;
+    const Inverse<Derived, false> quickInverse() const;
+    //@}
+
  private:

    PacketScalar _packetCoeff(int , int) const { ei_internal_assert(false && "_packetCoeff not defined"); }
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -280,6 +280,8 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
 {
  res.setZero();
  const int cols4 = m_lhs.cols() & 0xfffffffC;
+  const bool should_parallelize = (Flags & DestDerived::Flags & LargeBit)
+                                && res.size() >= EIGEN_PARALLELIZATION_TRESHOLD;
  #ifdef EIGEN_VECTORIZE
  if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
  {
@ -318,7 +320,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
            res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); \
        } \
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(Flags & DestDerived::Flags & LargeBit)
+    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
    #undef EIGEN_THE_PARALLELIZABLE_LOOP
  }
  else
@ -345,7 +347,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
            res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); \
        } \
      }
-    EIGEN_RUN_PARALLELIZABLE_LOOP(Flags & DestDerived::Flags & LargeBit)
+    EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize)
    #undef EIGEN_THE_PARALLELIZABLE_LOOP
  }
 }
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@ -29,7 +29,11 @@ template<typename T> struct ei_traits;
 template<typename Lhs, typename Rhs> struct ei_product_eval_mode;
 template<typename T> struct NumTraits;

-template<typename _Scalar, int _Rows, int _Cols, unsigned int _Flags, int _MaxRows, int _MaxCols> class Matrix;
+template<typename _Scalar, int _Rows, int _Cols,
+         unsigned int _Flags = EIGEN_DEFAULT_MATRIX_FLAGS,
+         int _MaxRows = _Rows, int _MaxCols = _Cols>
+class Matrix;
+
 template<typename ExpressionType> class Lazy;
 template<typename ExpressionType> class Temporary;
 template<typename MatrixType> class Minor;
@ -47,7 +51,6 @@ template<typename MatrixType> class DiagonalCoeffs;
 template<typename MatrixType> class Identity;
 template<typename MatrixType> class Map;
 template<typename Derived> class Eval;
-template<typename Derived> class EvalOMP;
 template<int Direction, typename UnaryOp, typename MatrixType> class PartialRedux;

 template<typename Scalar> struct ei_scalar_sum_op;
@ -70,4 +73,6 @@ template<typename Scalar> struct ei_scalar_quotient1_op;
 template<typename Scalar> struct ei_scalar_min_op;
 template<typename Scalar> struct ei_scalar_max_op;

+template<typename ExpressionType, bool CheckExistence = true> class Inverse;
+
 #endif // EIGEN_FORWARDDECLARATIONS_H
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@ -37,6 +37,10 @@
 #define EIGEN_UNROLLING_LIMIT 400
 #endif

+#ifndef EIGEN_PARALLELIZATION_TRESHOLD
+#define EIGEN_PARALLELIZATION_TRESHOLD 2000
+#endif
+
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER RowMajorBit
 #else
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@ -182,5 +182,4 @@ template<typename T> struct ei_packet_traits
  enum {size=1};
 };

-
 #endif // EIGEN_META_H
--- a/Eigen/src/LU/CMakeLists.txt
+++ b/Eigen/src/LU/CMakeLists.txt
@ -0,0 +1,6 @@
+FILE(GLOB Eigen_LU_SRCS "*.h")
+
+INSTALL(FILES 
+  ${Eigen_LU_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU
+  )
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+/** \class Inverse
+  *
+  * \brief Inverse of a matrix
+  *
+  * \param ExpressionType the type of the matrix/expression of which we are taking the inverse
+  * \param CheckExistence whether or not to check the existence of the inverse while computing it
+  *
+  * This class represents the inverse of a matrix. It is the return
+  * type of MatrixBase::inverse() and most of the time this is the only way it
+  * is used.
+  *
+  * \sa MatrixBase::inverse(), MatrixBase::quickInverse()
+  */
+template<typename ExpressionType, bool CheckExistence>
+struct ei_traits<Inverse<ExpressionType, CheckExistence> >
+{
+  typedef typename ExpressionType::Scalar Scalar;
+  typedef typename ExpressionType::Eval MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    Flags = MatrixType::Flags,
+    CoeffReadCost = MatrixType::CoeffReadCost
+  };
+};
+
+template<typename ExpressionType, bool CheckExistence> class Inverse : ei_no_assignment_operator,
+  public MatrixBase<Inverse<ExpressionType, CheckExistence> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Inverse)
+    typedef typename ei_traits<Inverse>::MatrixType MatrixType;
+
+    Inverse(const ExpressionType& xpr)
+      : m_exists(true),
+        m_inverse(MatrixType::identity(xpr.rows(), xpr.cols()))
+    {
+      ei_assert(xpr.rows() == xpr.cols());
+      _compute(xpr);
+    }
+
+    /** \returns whether or not the inverse exists.
+      *
+      * \note This method is only available if CheckExistence is set to true, which is the default value.
+      *       For instance, when using quickInverse(), this method is not available.
+      */
+    bool exists() const { assert(CheckExistence); return m_exists; }
+
+  private:
+
+    int _rows() const { return m_inverse.rows(); }
+    int _cols() const { return m_inverse.cols(); }
+
+    const Scalar _coeff(int row, int col) const
+    {
+      return m_inverse.coeff(row, col);
+    }
+
+    void _compute(const ExpressionType& xpr);
+    void _compute_not_unrolled(MatrixType& matrix, const RealScalar& max);
+    template<int Size, int Step, bool Finished = Size==Dynamic> struct _unroll_first_loop;
+    template<int Size, int Step, bool Finished = Size==Dynamic> struct _unroll_second_loop;
+    template<int Size, int Step, bool Finished = Size==Dynamic> struct _unroll_third_loop;
+
+  protected:
+    bool m_exists;
+    MatrixType m_inverse;
+};
+
+template<typename ExpressionType, bool CheckExistence>
+void Inverse<ExpressionType, CheckExistence>
+::_compute_not_unrolled(MatrixType& matrix, const RealScalar& max)
+{
+  const int size = matrix.rows();
+  for(int k = 0; k < size-1; k++)
+  {
+    int rowOfBiggest;
+    const RealScalar max_in_this_col
+      = matrix.col(k).end(size-k).cwiseAbs().maxCoeff(&rowOfBiggest);
+    if(CheckExistence && ei_isMuchSmallerThan(max_in_this_col, max))
+    { m_exists = false; return; }
+
+    m_inverse.row(k).swap(m_inverse.row(k+rowOfBiggest));
+    matrix.row(k).swap(matrix.row(k+rowOfBiggest));
+    
+    const Scalar d = matrix(k,k);
+    m_inverse.block(k+1, 0, size-k-1, size)
+      -= matrix.col(k).end(size-k-1) * (m_inverse.row(k) / d);
+    matrix.corner(BottomRight, size-k-1, size-k)
+      -= matrix.col(k).end(size-k-1) * (matrix.row(k).end(size-k) / d);
+  }
+
+  for(int k = 0; k < size-1; k++)
+  {
+    const Scalar d = static_cast<Scalar>(1)/matrix(k,k);
+    matrix.row(k).end(size-k) *= d;
+    m_inverse.row(k) *= d;
+  }
+  if(CheckExistence && ei_isMuchSmallerThan(matrix(size-1,size-1), max))
+  { m_exists = false; return; }
+  m_inverse.row(size-1) /= matrix(size-1,size-1);
+
+  for(int k = size-1; k >= 1; k--)
+  {
+    m_inverse.block(0,0,k,size) -= matrix.col(k).start(k) * m_inverse.row(k);
+  }
+}
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step, bool Finished>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_first_loop
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+
+  static void run(Inv& object, MatrixType& matrix, const RealScalar& max)
+  {
+    MatrixType& inverse = object.m_inverse;
+    int rowOfBiggest;
+    const RealScalar max_in_this_col
+      = matrix.col(Step).template end<Size-Step>().cwiseAbs().maxCoeff(&rowOfBiggest);
+    if(CheckExistence && ei_isMuchSmallerThan(max_in_this_col, max)) 
+    { object.m_exists = false; return; }
+
+    inverse.row(Step).swap(inverse.row(Step+rowOfBiggest));
+    matrix.row(Step).swap(matrix.row(Step+rowOfBiggest));
+  
+    const Scalar d = matrix(Step,Step);
+    inverse.template block<Size-Step-1, Size>(Step+1, 0)
+      -= matrix.col(Step).template end<Size-Step-1>() * (inverse.row(Step) / d);
+    matrix.template corner<Size-Step-1, Size-Step>(BottomRight)
+      -= matrix.col(Step).template end<Size-Step-1>()
+        * (matrix.row(Step).template end<Size-Step>() / d);
+
+    _unroll_first_loop<Size, Step+1, Step >= Size-2>::run(object, matrix, max);
+  }
+};
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_first_loop<Step, Size, true>
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+  static void run(Inv&, MatrixType&, const RealScalar&) {}
+};
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step, bool Finished>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_second_loop
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+
+  static void run(Inv& object, MatrixType& matrix, const RealScalar& max)
+  {
+    MatrixType& inverse = object.m_inverse;
+   
+    if(Step == Size-1)
+    {
+      if(CheckExistence && ei_isMuchSmallerThan(matrix(Size-1,Size-1), max))
+      { object.m_exists = false; return; }
+      inverse.row(Size-1) /= matrix(Size-1,Size-1);
+    }
+    else
+    {
+      const Scalar d = static_cast<Scalar>(1)/matrix(Step,Step);
+      matrix.row(Step).template end<Size-Step>() *= d;
+      inverse.row(Step) *= d;
+    }
+   
+    _unroll_second_loop<Size, Step+1, Step >= Size-1>::run(object, matrix, max);
+  }
+};
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_second_loop <Step, Size, true>
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+  static void run(Inv&, MatrixType&, const RealScalar&) {}
+};
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step, bool Finished>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_third_loop
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+
+  static void run(Inv& object, MatrixType& matrix, const RealScalar& max)
+  {
+    MatrixType& inverse = object.m_inverse;
+    inverse.template block<Size-Step,Size>(0,0)
+        -= matrix.col(Size-Step).template start<Size-Step>() * inverse.row(Size-Step);
+    _unroll_third_loop<Size, Step+1, Step >= Size-1>::run(object, matrix, max);
+  }
+};
+
+template<typename ExpressionType, bool CheckExistence>
+template<int Size, int Step>
+struct Inverse<ExpressionType, CheckExistence>::_unroll_third_loop<Step, Size, true>
+{
+  typedef Inverse<ExpressionType, CheckExistence> Inv;
+  typedef typename Inv::MatrixType MatrixType;
+  typedef typename Inv::RealScalar RealScalar;
+  static void run(Inv&, MatrixType&, const RealScalar&) {}
+};
+
+template<typename ExpressionType, bool CheckExistence>
+void Inverse<ExpressionType, CheckExistence>::_compute(const ExpressionType& xpr)
+{
+  MatrixType matrix(xpr);
+  const RealScalar max = CheckExistence ? matrix.cwiseAbs().maxCoeff()
+                                        : static_cast<RealScalar>(0);
+
+  if(MatrixType::RowsAtCompileTime <= 4)
+  {
+    const int size = MatrixType::RowsAtCompileTime;
+    _unroll_first_loop<size, 0>::run(*this, matrix, max);
+    if(CheckExistence && !m_exists) return;
+    _unroll_second_loop<size, 0>::run(*this, matrix, max);
+    if(CheckExistence && !m_exists) return;
+    _unroll_third_loop<size, 1>::run(*this, matrix, max);
+  }
+  else
+  {
+    _compute_not_unrolled(matrix, max);
+  }
+}
+
+/** \return the matrix inverse of \c *this, if it exists.
+  *
+  * Example: \include MatrixBase_inverse.cpp
+  * Output: \verbinclude MatrixBase_inverse.out
+  *
+  * \sa class Inverse
+  */
+template<typename Derived>
+const Inverse<Derived, true>
+MatrixBase<Derived>::inverse() const
+{
+  return Inverse<Derived, true>(derived());
+}
+
+/** \return the matrix inverse of \c *this, which is assumed to exist.
+  *
+  * Example: \include MatrixBase_quickInverse.cpp
+  * Output: \verbinclude MatrixBase_quickInverse.out
+  *
+  * \sa class Inverse
+  */
+template<typename Derived>
+const Inverse<Derived, false>
+MatrixBase<Derived>::quickInverse() const
+{
+  return Inverse<Derived, false>(derived());
+}
+
+#endif // EIGEN_INVERSE_H
--- a/doc/echelon.cpp
+++ b/doc/echelon.cpp
@ -49,7 +49,7 @@ struct unroll_echelon<Derived, Dynamic>
 {
  static void run(MatrixBase<Derived>& m)
  {
-    for(int k = 0; k < m.diagonal().size(); k++)
+    for(int k = 0; k < m.diagonal().size() - 1; k++)
    {
      int rowOfBiggest, colOfBiggest;
      int cornerRows = m.rows()-k, cornerCols = m.cols()-k;
@ -63,13 +63,14 @@ struct unroll_echelon<Derived, Dynamic>
    }
  }
 };
+
 using namespace std;
 template<typename Derived>
 void echelon(MatrixBase<Derived>& m)
 {
  const int size = DiagonalCoeffs<Derived>::SizeAtCompileTime;
  const bool unroll = size <= 4;
-  unroll_echelon<Derived, unroll ? size : Dynamic>::run(m);
+  unroll_echelon<Derived, unroll ? size-1 : Dynamic>::run(m);
 }

 template<typename Derived>
@ -100,7 +101,7 @@ using namespace std;
 int main(int, char **)
 {
  srand((unsigned int)time(0));
-  const int Rows = 6, Cols = 6;
+  const int Rows = 6, Cols = 4;
  typedef Matrix<double, Rows, Cols> Mat;
  const int N = Rows < Cols ? Rows : Cols;