From fc9480cbb38db2c7585e3ac5d14719237a85d2d7 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sun, 16 Aug 2009 10:55:10 +0200
Subject: [PATCH] bugfix in compute_matrix_flags, optimization in LU, improve
 doc, and workaround aliasing detection in MatrixBase_eval snippet (not very
 nice but I don't know how to do it in a better way)

---
 Eigen/src/Core/NoAlias.h         |  52 +++++++----
 Eigen/src/Core/ProductBase.h     |   6 +-
 Eigen/src/Core/util/XprHelper.h  |   2 +-
 Eigen/src/LU/LU.h                |   2 +-
 Eigen/src/LU/PartialLU.h         |   2 +-
 doc/I01_TopicLazyEvaluation.dox  |   2 +-
 doc/I02_HiPerformance.dox        | 154 +++++++------------------------
 doc/snippets/MatrixBase_eval.cpp |   2 +-
 8 files changed, 77 insertions(+), 145 deletions(-)
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index 5c2b6e3ca..a45493ddc 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -31,8 +31,9 @@
   *
   * \param ExpressionType the type of the object on which to do the lazy assignment
   *
-  * This class represents an expression with a special assignment operator (operator=)
+  * This class represents an expression with special assignment operators
   * assuming no aliasing between the target expression and the source expression.
+  * More precisely it alloas to bypass the EvalBeforeAssignBit flag of the source expression.
   * It is the return type of MatrixBase::noalias()
   * and most of the time this is the only way it is used.
   *
@@ -44,44 +45,61 @@ class NoAlias
   public:
     NoAlias(ExpressionType& expression) : m_expression(expression) {}
 
-    /** Behaves like MatrixBase::lazyAssign() */
+    /** Behaves like MatrixBase::lazyAssign(other)
+      * \sa MatrixBase::lazyAssign() */
     template<typename OtherDerived>
     ExpressionType& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      return m_expression.lazyAssign(other.derived());
-    }
+    { return m_expression.lazyAssign(other.derived()); }
 
-    // TODO could be removed if we decide that += is noalias by default
+    /** \sa MatrixBase::operator+= */
     template<typename OtherDerived>
     ExpressionType& operator+=(const MatrixBase<OtherDerived>& other)
-    {
-      return m_expression.lazyAssign(m_expression + other.derived());
-    }
+    { return m_expression.lazyAssign(m_expression + other.derived()); }
 
-    // TODO could be removed if we decide that += is noalias by default
+    /** \sa MatrixBase::operator-= */
     template<typename OtherDerived>
     ExpressionType& operator-=(const MatrixBase<OtherDerived>& other)
-    {
-      return m_expression.lazyAssign(m_expression - other.derived());
-    }
+    { return m_expression.lazyAssign(m_expression - other.derived()); }
 
-    // TODO could be removed if we decide that += is noalias by default
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename ProductDerived, typename Lhs, typename Rhs>
     ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
     { other.derived().addTo(m_expression); return m_expression; }
 
-    // TODO could be removed if we decide that += is noalias by default
     template<typename ProductDerived, typename Lhs, typename Rhs>
     ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
     { other.derived().subTo(m_expression); return m_expression; }
+#endif
 
   protected:
     ExpressionType& m_expression;
 };
 
-
 /** \returns a pseudo expression of \c *this with an operator= assuming
-  * no aliasing between \c *this and the source expression
+  * no aliasing between \c *this and the source expression.
+  * 
+  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
+  * Currently, even though several expressions may alias, only product
+  * expressions have this flag. Therefore, noalias() is only usefull when
+  * the source expression contains a matrix product.
+  * 
+  * Here are some examples where noalias is usefull:
+  * \code
+  * D.noalias()  = A * B;
+  * D.noalias() += A.transpose() * B;
+  * D.noalias() -= 2 * A * B.adjoint();
+  * \endcode
+  *
+  * On the other hand the following example will lead to a \b wrong result:
+  * \code
+  * A.noalias() = A * B;
+  * \endcode
+  * because the result matrix A is also an operand of the matrix product. Therefore,
+  * there is no alternative than evaluating A * B in a temporary, that is the default
+  * behavior when you write:
+  * \code
+  * A = A * B;
+  * \endcode
   *
   * \sa class NoAlias
   */
diff --git a/Eigen/src/Core/ProductBase.h b/Eigen/src/Core/ProductBase.h
index 5e259b7f7..b2c4cd989 100644
--- a/Eigen/src/Core/ProductBase.h
+++ b/Eigen/src/Core/ProductBase.h
@@ -119,10 +119,8 @@ class ProductBase : public MatrixBase<Derived>
       return res;
     }
 
-    const Flagged<ProductBase, 0, EvalBeforeAssigningBit> lazy() const
-    {
-      return *this;
-    }
+    EIGEN_DEPRECATED const Flagged<ProductBase, 0, EvalBeforeAssigningBit> lazy() const
+    { return *this; }
 
     const _LhsNested& lhs() const { return m_lhs; }
     const _RhsNested& rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index d392e27ba..871259b08 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -90,7 +90,7 @@ class ei_compute_matrix_flags
                      : MaxRows==1 ? MaxCols
                      : row_major_bit ? MaxCols : MaxRows,
       is_big = inner_max_size == Dynamic,
-      is_packet_size_multiple = Rows==Dynamic || Cols==Dynamic || ((Cols*Rows) % ei_packet_traits<Scalar>::size) == 0,
+      is_packet_size_multiple = MaxRows==Dynamic || MaxCols==Dynamic || ((MaxCols*MaxRows) % ei_packet_traits<Scalar>::size) == 0,
       aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0,
       packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0
     };
diff --git a/Eigen/src/LU/LU.h b/Eigen/src/LU/LU.h
index f072ff906..d32fe08a1 100644
--- a/Eigen/src/LU/LU.h
+++ b/Eigen/src/LU/LU.h
@@ -437,7 +437,7 @@ LU<MatrixType>& LU<MatrixType>::compute(const MatrixType& matrix)
     if(k<rows-1)
       m_lu.col(k).end(rows-k-1) /= m_lu.coeff(k,k);
     if(k<size-1)
-      m_lu.block(k+1,k+1,rows-k-1,cols-k-1) -= m_lu.col(k).end(rows-k-1) * m_lu.row(k).end(cols-k-1);
+      m_lu.block(k+1,k+1,rows-k-1,cols-k-1).noalias() -= m_lu.col(k).end(rows-k-1) * m_lu.row(k).end(cols-k-1);
   }
 
   for(int k = 0; k < matrix.rows(); ++k) m_p.coeffRef(k) = k;
diff --git a/Eigen/src/LU/PartialLU.h b/Eigen/src/LU/PartialLU.h
index eb5c3d34b..0ef59bac7 100644
--- a/Eigen/src/LU/PartialLU.h
+++ b/Eigen/src/LU/PartialLU.h
@@ -248,7 +248,7 @@ struct ei_partial_lu_impl
         int rrows = rows-k-1;
         int rsize = size-k-1;
         lu.col(k).end(rrows) /= lu.coeff(k,k);
-        lu.corner(BottomRight,rrows,rsize) -= (lu.col(k).end(rrows) * lu.row(k).end(rsize)).lazy();
+        lu.corner(BottomRight,rrows,rsize).noalias() -= lu.col(k).end(rrows) * lu.row(k).end(rsize);
       }
     }
   }
diff --git a/doc/I01_TopicLazyEvaluation.dox b/doc/I01_TopicLazyEvaluation.dox
index 16fe79e70..3e2b9db5f 100644
--- a/doc/I01_TopicLazyEvaluation.dox
+++ b/doc/I01_TopicLazyEvaluation.dox
@@ -4,7 +4,7 @@ namespace Eigen {
 
 Executive summary: Eigen has intelligent compile-time mechanisms to enable lazy evaluation and removing temporaries where appropriate.
 It will handle aliasing automatically in most cases, for example with matrix products. The automatic behavior can be overridden
-manually by using the MatrixBase::eval() and MatrixBase::lazy() methods.
+manually by using the MatrixBase::eval() and MatrixBase::noalias() methods.
 
 When you write a line of code involving a complex expression such as
 
diff --git a/doc/I02_HiPerformance.dox b/doc/I02_HiPerformance.dox
index 2c77f87b3..c2148ab4d 100644
--- a/doc/I02_HiPerformance.dox
+++ b/doc/I02_HiPerformance.dox
@@ -9,13 +9,13 @@ for small fixed size matrices. For large matrices, however, it might useful to
 take some care when writing your expressions in order to minimize useless evaluations
 and optimize the performance.
 In this page we will give a brief overview of the Eigen's internal mechanism to simplify
-and evaluate complex expressions, and discuss the current limitations.
+and evaluate complex product expressions, and discuss the current limitations.
 In particular we will focus on expressions matching level 2 and 3 BLAS routines, i.e,
 all kind of matrix products and triangular solvers.
 
 Indeed, in Eigen we have implemented a set of highly optimized routines which are very similar
 to BLAS's ones. Unlike BLAS, those routines are made available to user via a high level and
-natural API. Each of these routines can perform in a single evaluation a wide variety of expressions.
+natural API. Each of these routines can compute in a single evaluation a wide variety of expressions.
 Given an expression, the challenge is then to map it to a minimal set of primitives.
 As explained latter, this mechanism has some limitations, and knowing them will allow
 you to write faster code by making your expressions more Eigen friendly.
@@ -25,18 +25,18 @@ you to write faster code by making your expressions more Eigen friendly.
 Let's start with the most common primitive: the matrix product of general dense matrices.
 In the BLAS world this corresponds to the GEMM routine. Our equivalent primitive can
 perform the following operation:
-\f$ C += \alpha op1(A) * op2(B) \f$
+\f$ C.noalias() += \alpha op1(A) op2(B) \f$
 where A, B, and C are column and/or row major matrices (or sub-matrices),
 alpha is a scalar value, and op1, op2 can be transpose, adjoint, conjugate, or the identity.
 When Eigen detects a matrix product, it analyzes both sides of the product to extract a
 unique scalar factor alpha, and for each side its effective storage (order and shape) and conjugate state.
 More precisely each side is simplified by iteratively removing trivial expressions such as scalar multiple,
-negate and conjugate. Transpose and Block expressions are not evaluated and only modify the storage order
+negate and conjugate. Transpose and Block expressions are not evaluated and they only modify the storage order
 and shape. All other expressions are immediately evaluated.
 For instance, the following expression:
-\code m1 -= (s1 * m2.adjoint() * (-(s3*m3).conjugate()*s2)).lazy()  \endcode
+\code m1.noalias() -= s1 * m2.adjoint() * (-(s3*m3).conjugate()*s2)  \endcode
 is automatically simplified to:
-\code m1 += (s1*s2*conj(s3)) * m2.adjoint() * m3.conjugate() \endcode
+\code m1.noalias() += (s1*s2*conj(s3)) * m2.adjoint() * m3.conjugate() \endcode
 which exactly matches our GEMM routine.
 
 \subsection GEMM_Limitations Limitations
@@ -52,23 +52,26 @@ handled by a single GEMM-like call are correctly detected.
 <tr>
 <td>\code m1 += m2 * m3; \endcode</td>
 <td>\code temp = m2 * m3; m1 += temp; \endcode</td>
-<td>\code m1 += (m2 * m3).lazy(); \endcode</td>
-<td>Use .lazy() to tell Eigen the result and right-hand-sides do not alias.</td>
+<td>\code m1.noalias() += m2 * m3; \endcode</td>
+<td>Use .noalias() to tell Eigen the result and right-hand-sides do not alias. 
+    Otherwise the product m2 * m3 is evaluated into a temporary.</td>
 </tr>
 <tr>
-<td>\code m1 += (s1 * (m2 * m3)).lazy(); \endcode</td>
-<td>\code temp = (m2 * m3).lazy(); m1 += s1 * temp; \endcode</td>
-<td>\code m1 += (s1 * m2 * m3).lazy(); \endcode</td>
-<td>This is because m2 * m3 is immediately evaluated by the scalar product. <br>
-    Make sure the matrix product is the top most expression.</td>
+<td></td>
+<td></td>
+<td>\code m1.noalias() += s1 * (m2 * m3); \endcode</td>
+<td>This is a special feature of Eigen. Here the product between a scalar
+    and a matrix product does not evaluate the matrix product but instead it
+    returns a matrix product expression tracking the scalar scaling factor. <br>
+    Without this optimization, the matrix product would be evaluated into a
+    temporary as in the next example.</td>
 </tr>
 <tr>
-<td>\code m1 += s1 * (m2 * m3).lazy(); \endcode</td>
-<td>\code m1 += s1 * m2 * m3; // using a naive product \endcode</td>
-<td>\code m1 += (s1 * m2 * m3).lazy(); \endcode</td>
-<td>Even though this expression is evaluated without temporary, it is actually even
-    worse than the previous case because here the .lazy() enforces Eigen to use a
-    naive (and slow) evaluation of the product.</td>
+<td>\code m1.noalias() += (m2 * m3).transpose(); \endcode</td>
+<td>\code temp = m2 * m3; m1 += temp.transpose(); \endcode</td>
+<td>\code m1.noalias() += m3.adjoint() * m3.adjoint(); \endcode</td>
+<td>This is because the product expression has the EvalBeforeNesting bit which
+    enforces the evaluation of the product by the Tranpose expression.</td>
 </tr>
 <tr>
 <td>\code m1 = m1 + m2 * m3; \endcode</td>
@@ -78,112 +81,25 @@ handled by a single GEMM-like call are correctly detected.
     and so the matrix product will be immediately evaluated.</td>
 </tr>
 <tr>
-<td>\code m1 += ((s1*m2).block(....) * m3).lazy(); \endcode</td>
-<td>\code temp = (s1*m2).block(....); m1 += (temp * m3).lazy(); \endcode</td>
-<td>\code m1 += (s1 * m2.block(....) * m3).lazy(); \endcode</td>
+<td>\code m1.noalias() = m4 + m2 * m3; \endcode</td>
+<td>\code temp = m2 * m3; m1 = m4 + temp; \endcode</td>
+<td>\code m1 = m4; m1.noalias() += m2 * m3; \endcode</td>
+<td>First of all, here the .noalias() in the first expression is useless because
+    m2*m3 will be evaluated anyway. However, note how this expression can be rewritten
+    so that no temporary is evaluated. (tip: for very small fixed size matrix
+    it is slighlty better to rewrite it like this: m1.noalias() = m2 * m3; m1 += m4;</td>
+</tr>
+<tr>
+<td>\code m1.noalias() += ((s1*m2).block(....) * m3); \endcode</td>
+<td>\code temp = (s1*m2).block(....); m1 += temp * m3; \endcode</td>
+<td>\code m1.noalias() += s1 * m2.block(....) * m3; \endcode</td>
 <td>This is because our expression analyzer is currently not able to extract trivial
     expressions nested in a Block expression. Therefore the nested scalar
     multiple cannot be properly extracted.</td>
 </tr>
 </table>
 
-Of course all these remarks hold for all other kind of products that we will describe in the following paragraphs.
-
-
-
-
-<table class="tutorial_code">
-<tr>
-<td>BLAS equivalent routine</td>
-<td>Efficient version <br> (compile to a single optimized evaluation)</td>
-<td>Less efficient equivalent version <br> (requires multiple evaluations)</td>
-<td>comments</td>
-</tr>
-<tr>
-<td>GEMM</td>
-<td>m1 = s1 * m2 * m3</td>
-<td>m1 = s1 * (m2 * m3)</td>
-<td>This is because m2 * m3 is evaluated by the scalar product.</td>
-</tr>
-<tr>
-<td>GEMM</td>
-<td>m1 += s1 * m2.adjoint() * m3</td>
-<td>m1 += (s1 * m2).adjoint() * m3</td>
-<td>This is because our expression analyzer stops at the first transpose expression and cannot extract the nested scalar multiple.</td>
-</tr>
-<tr>
-<td>GEMM</td>
-<td>m1 += m2.adjoint() * m3</td>
-<td>m1 += m2.conjugate().transpose() * m3</td>
-<td>For the same reason. Use .adjoint() or .transpose().conjugate()</td>
-</tr>
-<tr>
-<td>GEMM</td>
-<td>m1 -= (-(s0*m2).conjugate()*s1) * (s2 * m3.adjoint() * s3)</td>
-<td></td>
-<td>Note that s0 is automatically conjugated during the simplification of the expression.</td>
-</tr>
-<tr>
-<td>SYR</td>
-<td>m.sefadjointView<LowerTriangular>().rankUpdate(v,s)</td>
-<td></td>
-<td>Computes m += s * v * v.adjoint()</td>
-</tr>
-<tr>
-<td>SYR2</td>
-<td>m.sefadjointView<LowerTriangular>().rankUpdate(u,v,s)</td>
-<td></td>
-<td>Computes m += s * u * v.adjoint() + s * v * u.adjoint()</td>
-</tr>
-<tr>
-<td>SYRK</td>
-<td>m1.sefadjointView<UpperTriangular>().rankUpdate(m2.adjoint(),s)</td>
-<td></td>
-<td>Computes m1 += s * m2.adjoint() * m2</td>
-</tr>
-<tr>
-<td>SYMM/HEMM</td>
-<td>m3 -= s1 * m1.sefadjointView<UpperTriangular>() * m2.adjoint()</td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td>SYMM/HEMM</td>
-<td>m3 += s1 * m2.transpose() * m1.conjugate().sefadjointView<UpperTriangular>()</td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td>TRMM</td>
-<td>m3 -= s1 * m1.triangularView<UnitUpperTriangular>() * m2.adjoint()</td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td>TRSV / TRSM</td>
-<td>m1.adjoint().triangularView<UnitLowerTriangular>().solveInPlace(m2)</td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-</table>
+Of course all these remarks hold for all other kind of products involving triangular or selfadjoint matrices.
 
 */
 
diff --git a/doc/snippets/MatrixBase_eval.cpp b/doc/snippets/MatrixBase_eval.cpp
index 732293043..d70424562 100644
--- a/doc/snippets/MatrixBase_eval.cpp
+++ b/doc/snippets/MatrixBase_eval.cpp
@@ -4,7 +4,7 @@ m = M;
 cout << "Here is the matrix m:" << endl << m << endl;
 cout << "Now we want to replace m by its own transpose." << endl;
 cout << "If we do m = m.transpose(), then m becomes:" << endl;
-m = m.transpose();
+m = m.transpose() * 1;
 cout << m << endl << "which is wrong!" << endl;
 cout << "Now let us instead do m = m.transpose().eval(). Then m becomes" << endl;
 m = M;

BLAS equivalent routine	Efficient version (compile to a single optimized evaluation)	Less efficient equivalent version (requires multiple evaluations)	comments
GEMM	m1 = s1 * m2 * m3	m1 = s1 * (m2 * m3)	This is because m2 * m3 is evaluated by the scalar product.
GEMM	m1 += s1 * m2.adjoint() * m3	m1 += (s1 * m2).adjoint() * m3	This is because our expression analyzer stops at the first transpose expression and cannot extract the nested scalar multiple.
GEMM	m1 += m2.adjoint() * m3	m1 += m2.conjugate().transpose() * m3	For the same reason. Use .adjoint() or .transpose().conjugate()
GEMM	m1 -= (-(s0m2).conjugate()s1) * (s2 * m3.adjoint() * s3)		Note that s0 is automatically conjugated during the simplification of the expression.
SYR	m.sefadjointView().rankUpdate(v,s)		Computes m += s * v * v.adjoint()
SYR2	m.sefadjointView().rankUpdate(u,v,s)		Computes m += s * u * v.adjoint() + s * v * u.adjoint()
SYRK	m1.sefadjointView().rankUpdate(m2.adjoint(),s)		Computes m1 += s * m2.adjoint() * m2
SYMM/HEMM	m3 -= s1 * m1.sefadjointView() * m2.adjoint()
SYMM/HEMM	m3 += s1 * m2.transpose() * m1.conjugate().sefadjointView()
TRMM	m3 -= s1 * m1.triangularView() * m2.adjoint()
TRSV / TRSM	m1.adjoint().triangularView().solveInPlace(m2)