* refactoring of Product:

* use ProductReturnType<>::Type to get the correct Product xpr type * Product is no longer instantiated for xpr types which are evaluated * vectorization of "a.transpose() * b" for the normal product (small and fixed-size matrix) * some cleaning * removed ArrayBase
2025-02-17 18:09:55 +08:00 · 2008-06-19 17:33:57 +00:00 · 2008-06-19 17:33:57 +00:00 · 82c3cea1d5
commit 82c3cea1d5
parent 5dbfed1902
12 changed files with 359 additions and 247 deletions
--- a/Eigen/Array
+++ b/Eigen/Array
@ -5,7 +5,6 @@

 namespace Eigen {

-#include "src/Array/ArrayBase.h"
 #include "src/Array/CwiseOperators.h"
 #include "src/Array/Functors.h"
 #include "src/Array/AllAndAny.h"
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@ -26,48 +26,57 @@
 #ifndef EIGEN_DIAGONALPRODUCT_H
 #define EIGEN_DIAGONALPRODUCT_H

-template<typename Lhs, typename Rhs>
-struct ei_traits<Product<Lhs, Rhs, DiagonalProduct> >
+// Compile-time traits of a Product expression in DiagonalProduct mode.
+// The template arguments are the types used to store the two operands; the
+// reference and const qualifiers are removed from them before any of their
+// members is queried.
+template<typename LhsNested, typename RhsNested>
+struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
 {
-  typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested<Lhs>::type LhsNested;
-  typedef typename ei_nested<Rhs>::type RhsNested;
-  typedef typename ei_unref<LhsNested>::type _LhsNested;
-  typedef typename ei_unref<RhsNested>::type _RhsNested;
+  // clean the nested types:
+  typedef typename ei_unconst<typename ei_unref<LhsNested>::type>::type _LhsNested;
+  typedef typename ei_unconst<typename ei_unref<RhsNested>::type>::type _RhsNested;
+  typedef typename _LhsNested::Scalar Scalar;
+
  enum {
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,
-    RowsAtCompileTime = Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = Rhs::ColsAtCompileTime,
-    MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
-    _RhsPacketAccess =  (RhsFlags & RowMajorBit) && (RhsFlags & PacketAccessBit)
+    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+    LhsIsDiagonal = (_LhsNested::Flags&Diagonal)==Diagonal,
+    RhsIsDiagonal = (_RhsNested::Flags&Diagonal)==Diagonal,
+
+    // packets are read from the non-diagonal operand only: from a row-major rhs...
+    CanVectorizeRhs =  (!RhsIsDiagonal) && (RhsFlags & RowMajorBit) && (RhsFlags & PacketAccessBit)
                     && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _LhsPacketAccess =  (!(LhsFlags & RowMajorBit)) && (LhsFlags & PacketAccessBit)
+
+    // ...or from a column-major lhs
+    CanVectorizeLhs =  (!LhsIsDiagonal) && (!(LhsFlags & RowMajorBit)) && (LhsFlags & PacketAccessBit)
                     && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _LostBits = ~(((RhsFlags & RowMajorBit) && (!_LhsPacketAccess) ? 0 : RowMajorBit)
-                | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & _LostBits)
-          | (_LhsPacketAccess || _RhsPacketAccess ? PacketAccessBit : 0),
+
+    // mask of the bits which survive in Flags: LinearAccessBit has to be inside
+    // the complemented set, otherwise it is kept instead of being removed
+    RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
+                | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
+                | LinearAccessBit),
+
+    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
+
    CoeffReadCost = NumTraits<Scalar>::MulCost + _LhsNested::CoeffReadCost + _RhsNested::CoeffReadCost
  };
 };

-template<typename Lhs, typename Rhs> class Product<Lhs, Rhs, DiagonalProduct> : ei_no_assignment_operator,
-  public MatrixBase<Product<Lhs, Rhs, DiagonalProduct> >
+template<typename LhsNested, typename RhsNested> class Product<LhsNested, RhsNested, DiagonalProduct> : ei_no_assignment_operator,
+  public MatrixBase<Product<LhsNested, RhsNested, DiagonalProduct> >
 {
-  public:
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
-    typedef typename ei_traits<Product>::LhsNested LhsNested;
-    typedef typename ei_traits<Product>::RhsNested RhsNested;
    typedef typename ei_traits<Product>::_LhsNested _LhsNested;
    typedef typename ei_traits<Product>::_RhsNested _RhsNested;

    enum {
-      PacketSize = ei_packet_traits<Scalar>::size
+      RhsIsDiagonal = (_RhsNested::Flags&Diagonal)==Diagonal
    };

+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+    template<typename Lhs, typename Rhs>
    inline Product(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
@ -81,14 +90,14 @@ template<typename Lhs, typename Rhs> class Product<Lhs, Rhs, DiagonalProduct> :

    const Scalar _coeff(int row, int col) const
    {
-      int unique = ((Rhs::Flags&Diagonal)==Diagonal) ? col : row;
+      const int unique = RhsIsDiagonal ? col : row;
      return m_lhs.coeff(row, unique) * m_rhs.coeff(unique, col);
    }

    template<int LoadMode>
    const PacketScalar _packet(int row, int col) const
    {
-      if ((Rhs::Flags&Diagonal)==Diagonal)
+      if (RhsIsDiagonal)
      {
        ei_assert((_LhsNested::Flags&RowMajorBit)==0);
        return ei_pmul(m_lhs.template packet<LoadMode>(row, col), ei_pset1(m_rhs.coeff(col, col)));
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -49,7 +49,7 @@
  *
  * \nosubgrouping
  */
-template<typename Derived> class MatrixBase : public ArrayBase<Derived>
+template<typename Derived> class MatrixBase
 {
    struct CommaInitializer;

@ -168,16 +168,6 @@ template<typename Derived> class MatrixBase : public ArrayBase<Derived>
    };
    /** Represents a product scalar-matrix */
    typedef CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, Derived> ScalarMultipleReturnType;
-    /** */
-    template<typename OtherDerived>
-    struct ProductReturnType
-    {
-      typedef typename ei_meta_if<
-            (Derived::Flags & OtherDerived::Flags & ArrayBit),
-            CwiseBinaryOp<ei_scalar_product_op<typename ei_traits<Derived>::Scalar>, Derived, OtherDerived>,
-            Product<Derived,OtherDerived>
-          >::ret Type;
-    };
    /** the return type of MatrixBase::conjugate() */
    typedef typename ei_meta_if<NumTraits<Scalar>::IsComplex,
                        CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, Derived>,
@ -274,7 +264,7 @@ template<typename Derived> class MatrixBase : public ArrayBase<Derived>


    template<typename OtherDerived>
-    const typename ProductReturnType<OtherDerived>::Type
+    const typename ProductReturnType<Derived,OtherDerived>::Type
    operator*(const MatrixBase<OtherDerived> &other) const;

    template<typename OtherDerived>
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -26,123 +26,69 @@
 #ifndef EIGEN_PRODUCT_H
 #define EIGEN_PRODUCT_H

-template<int Index, int Size, typename Lhs, typename Rhs>
-struct ei_product_impl
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs,
-                               typename Lhs::Scalar &res)
-  {
-    ei_product_impl<Index-1, Size, Lhs, Rhs>::run(row, col, lhs, rhs, res);
-    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
-  }
+/***************************
+*** Forward declarations ***
+***************************/
+
+// Storage-order selector passed as the first template argument of
+// ei_product_packet_impl. Product picks RowMajorProduct when its own Flags
+// have RowMajorBit set, and ColMajorProduct otherwise.
+enum {
+  ColMajorProduct,
+  RowMajorProduct
 };

-template<int Size, typename Lhs, typename Rhs>
-struct ei_product_impl<0, Size, Lhs, Rhs>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs,
-                  typename Lhs::Scalar &res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-  }
-};
+template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
+struct ei_product_coeff_impl;

-template<typename Lhs, typename Rhs>
-struct ei_product_impl<Dynamic, Dynamic, Lhs, Rhs>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar& res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-      for(int i = 1; i < lhs.cols(); i++)
-        res += lhs.coeff(row, i) * rhs.coeff(i, col);
-  }
-};
+template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl;

-// prevent buggy user code from causing an infinite recursion
-template<int Index, typename Lhs, typename Rhs>
-struct ei_product_impl<Index, 0, Lhs, Rhs>
-{
-  inline static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
-};
+template<typename T> class ei_product_eval_to_column_major;

-//----------
-
-template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl;
-
-template<int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<true, Index, Size, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
-  {
-    ei_packet_product_impl<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res =  ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<Aligned>(Index, col), res);
-  }
-};
-
-template<int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<false, Index, Size, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
-  {
-    ei_packet_product_impl<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res =  ei_pmadd(lhs.template packet<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
-  }
-};
-
-template<int Size, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<true, 0, Size, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
-  {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
-  }
-};
-
-template<int Size, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<false, 0, Size, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
-  {
-    res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
-  }
-};
-
-template<bool RowMajor, int Index, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
-  {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
-      for(int i = 1; i < lhs.cols(); i++)
-        res =  ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
-  }
-};
-
-template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_impl<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
-{
-  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
-  {
-    res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
-      for(int i = 1; i < lhs.cols(); i++)
-        res =  ei_pmadd(lhs.template packet<Aligned>(row, i), ei_pset1(rhs.coeff(i, col)), res);
-  }
-};
-
-/** \class Product
+/** \class ProductReturnType
  *
-  * \brief Expression of the product of two matrices
+  * \brief Helper class to get the correct and optimized returned type of operator*
  *
  * \param Lhs the type of the left-hand side
  * \param Rhs the type of the right-hand side
-  * \param EvalMode internal use only
+  * \param ProductMode the type of the product (determined automatically by ei_product_mode)
  *
-  * This class represents an expression of the product of two matrices.
-  * It is the return type of the operator* between matrices, and most of the time
-  * this is the only way it is used.
+  * This class defines the typename Type representing the optimized product expression
+  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
+  * is the recommended way to define the result type of a function returning an expression
+  * which involves a matrix product. The classes Product and DiagonalProduct should never be
+  * used directly.
+  *
+  * \sa class Product, class DiagonalProduct, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
  */
-template<typename Lhs, typename Rhs> struct ei_product_eval_mode
+template<typename Lhs, typename Rhs, int ProductMode>
+struct ProductReturnType
+{
+  // How each operand is held by the expression. The second argument of
+  // ei_nested is the compile-time outer dimension of the other operand; it
+  // feeds ei_nested's evaluate-or-reference cost test.
+  typedef typename ei_nested<Lhs, Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename ei_nested<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+
+  // The product expression itself, parameterized by the nested types after
+  // they went through ei_unconst.
+  typedef Product<
+    typename ei_unconst<LhsNested>::type,
+    typename ei_unconst<RhsNested>::type,
+    ProductMode
+  > Type;
+};
+
+// Specialization for CacheFriendlyProduct: same as the generic case, except
+// that when ei_nested decides to evaluate the right-hand side, it evaluates it
+// into ei_product_eval_to_column_major<Rhs>::type instead of the default
+// evaluation type.
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,CacheFriendlyProduct>
+{
+  typedef typename ei_nested<Lhs, Rhs::ColsAtCompileTime>::type LhsNested;
+
+  typedef typename ei_nested<
+    Rhs,
+    Lhs::RowsAtCompileTime,
+    typename ei_product_eval_to_column_major<Rhs>::type
+  >::type RhsNested;
+
+  typedef Product<
+    typename ei_unconst<LhsNested>::type,
+    typename ei_unconst<RhsNested>::type,
+    CacheFriendlyProduct
+  > Type;
+};
+
+/*  Helper class to determine the type of the product, can be either:
+ *    - NormalProduct
+ *    - CacheFriendlyProduct
+ *    - DiagonalProduct
+ */
+template<typename Lhs, typename Rhs> struct ei_product_mode
 {
  enum{ value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal)
              ? DiagonalProduct
@ -152,100 +98,103 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
                ? CacheFriendlyProduct : NormalProduct };
 };

-template<typename T> class ei_product_eval_to_column_major
+/** \class Product
+  *
+  * \brief Expression of the product of two matrices
+  *
+  * \param LhsNested the type used to store the left-hand side
+  * \param RhsNested the type used to store the right-hand side
+  * \param ProductMode the type of the product
+  *
+  * This class represents an expression of the product of two matrices.
+  * It is the return type of the operator* between matrices. Its template
+  * arguments are determined automatically by ProductReturnType. Therefore,
+  * Product should not be used directly. To determine the result type of a function
+  * which involves a matrix product, use ProductReturnType::Type.
+  *
+  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
+  */
+// Compile-time traits of a Product expression for any ProductMode which has no
+// more specialized ei_traits. The template arguments are the types used to
+// store the two operands; the reference and const qualifiers are removed from
+// them before any of their members is queried.
+template<typename LhsNested, typename RhsNested, int ProductMode>
+struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
 {
-    typedef typename ei_traits<T>::Scalar _Scalar;
-    enum {
-          _Rows = ei_traits<T>::RowsAtCompileTime,
-          _Cols = ei_traits<T>::ColsAtCompileTime,
-          _MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
-          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
-          _Flags = ei_traits<T>::Flags
-    };
-
-  public:
-    typedef Matrix<_Scalar,
-                  _Rows, _Cols, _MaxRows, _MaxCols,
-                  ei_corrected_matrix_flags<
-                      _Scalar,
-                      _Rows, _Cols, _MaxRows, _MaxCols,
-                      _Flags
-                  >::ret & ~RowMajorBit
-            > type;
-};
-
-// as ei_nested, but evaluate to a column-major matrix if an evaluation is required
-template<typename T, int n=1> struct ei_product_nested_rhs
-{
-  typedef typename ei_meta_if<
-    ei_must_nest_by_value<T>::ret,
-    T,
-    typename ei_meta_if<
-        ((ei_traits<T>::Flags & EvalBeforeNestingBit)
-      || (n+1) * (NumTraits<typename ei_traits<T>::Scalar>::ReadCost) < (n-1) * T::CoeffReadCost),
-      typename ei_product_eval_to_column_major<T>::type,
-      const T&
-    >::ret
-  >::ret type;
-};
-
-template<typename Lhs, typename Rhs, int EvalMode>
-struct ei_traits<Product<Lhs, Rhs, EvalMode> >
-{
-  typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
-      typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
-      typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
+  // clean the nested types:
  typedef typename ei_unconst<typename ei_unref<LhsNested>::type>::type _LhsNested;
  typedef typename ei_unconst<typename ei_unref<RhsNested>::type>::type _RhsNested;
+  typedef typename _LhsNested::Scalar Scalar;
+
  enum {
    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,
-    RowsAtCompileTime = Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = Rhs::ColsAtCompileTime,
-    MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
-    // the vectorization flags are only used by the normal product,
-    // the other one is always vectorized !
-    _RhsPacketAccess = (RhsFlags & RowMajorBit) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _LhsPacketAccess = (!(LhsFlags & RowMajorBit)) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _PacketAccess = (_LhsPacketAccess || _RhsPacketAccess) ? 1 : 0,
-    _RowMajor = (RhsFlags & RowMajorBit)
-              && (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsPacketAccess)),
-    _LostBits = ~((_RowMajor ? 0 : RowMajorBit)
+
+    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+    // compile-time length of the summed dimension: the smaller of the lhs
+    // column count and the rhs row count
+    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    // a packet of results along a row: needs packets from a row-major rhs
+    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+                    && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
+
+    // a packet of results along a column: needs packets from a column-major lhs
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+                    && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
+
+    // a single coefficient computed with packets along the inner dimension:
+    // row-major lhs times column-major rhs, with a fixed inner size which is
+    // a multiple of the packet size
+    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & PacketAccessBit) && (RhsFlags & PacketAccessBit)
+                      && (InnerSize!=Dynamic) && (InnerSize % ei_packet_traits<Scalar>::size == 0),
+
+    EvalToRowMajor = (RhsFlags & RowMajorBit)
+                   && (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
+
+    // mask of the bits which survive in Flags (complement of the removed ones)
+    RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
                | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
                | LinearAccessBit),
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & _LostBits)
+
+    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
          | EvalBeforeAssigningBit
          | EvalBeforeNestingBit
-          | (_PacketAccess ? PacketAccessBit : 0),
-    CoeffReadCost
-      = Lhs::ColsAtCompileTime == Dynamic
-      ? Dynamic
-      : Lhs::ColsAtCompileTime
-        * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-        + (Lhs::ColsAtCompileTime - 1) * NumTraits<Scalar>::AddCost
+          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
+
+    // InnerSize multiplications (each reading one lhs and one rhs coefficient)
+    // plus InnerSize-1 additions; Dynamic when the inner size is Dynamic
+    CoeffReadCost = InnerSize == Dynamic ? Dynamic
+                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost
  };
 };

-template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignment_operator,
-  public MatrixBase<Product<Lhs, Rhs, EvalMode> >
+template<typename LhsNested, typename RhsNested, int ProductMode> class Product : ei_no_assignment_operator,
+  public MatrixBase<Product<LhsNested, RhsNested, ProductMode> >
 {
  public:

    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
-    typedef typename ei_traits<Product>::LhsNested LhsNested;
-    typedef typename ei_traits<Product>::RhsNested RhsNested;
+
+  private:
+
    typedef typename ei_traits<Product>::_LhsNested _LhsNested;
    typedef typename ei_traits<Product>::_RhsNested _RhsNested;

    enum {
-      PacketSize = ei_packet_traits<Scalar>::size
+      PacketSize = ei_packet_traits<Scalar>::size,
+      InnerSize  = ei_traits<Product>::InnerSize,
+      Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+      CanVectorizeInner = ei_traits<Product>::CanVectorizeInner && Unroll
    };

+    typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
+                                  Unroll ? InnerSize-1 : Dynamic,
+                                  _LhsNested, _RhsNested> ScalarCoeffImpl;
+
+    typedef ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
+                                   Unroll ? InnerSize-1 : Dynamic,
+                                   _LhsNested, _RhsNested, PacketScalar> PacketCoeffImpl;
+
+  public:
+
+    template<typename Lhs, typename Rhs>
    inline Product(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
@ -268,23 +217,15 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
    const Scalar _coeff(int row, int col) const
    {
      Scalar res;
-      const bool unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
-      ei_product_impl<unroll ? Lhs::ColsAtCompileTime-1 : Dynamic,
-                          unroll ? Lhs::ColsAtCompileTime : Dynamic,
-                          _LhsNested, _RhsNested>
-        ::run(row, col, m_lhs, m_rhs, res);
+      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    const PacketScalar _packet(int row, int col) const
    {
-      const bool unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
      PacketScalar res;
-      ei_packet_product_impl<Flags&RowMajorBit ? true : false, Lhs::ColsAtCompileTime-1,
-                          unroll ? Lhs::ColsAtCompileTime : Dynamic,
-                          _LhsNested, _RhsNested, PacketScalar>
-        ::run(row, col, m_lhs, m_rhs, res);
+      PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

@ -302,11 +243,11 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
  */
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename MatrixBase<Derived>::template ProductReturnType<OtherDerived>::Type
+inline const typename ProductReturnType<Derived,OtherDerived>::Type
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
+  // both operands must agree on ArrayBit
  assert( (Derived::Flags&ArrayBit) == (OtherDerived::Flags&ArrayBit) );
-  return typename ProductReturnType<OtherDerived>::Type(derived(), other.derived());
+  // the expression type is selected by the ProductReturnType helper and is
+  // constructed from the two derived objects
+  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
 }

 /** replaces \c *this by \c *this * \a other.
@ -321,6 +262,157 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)
  return *this = *this * other;
 }

+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path  - no vectorization ***
+**************************************/
+
+/** \internal
+  * Unrolled step of the scalar product coefficient: lets the Index-1 instance
+  * write the partial result into \a res, then adds the term of inner index
+  * \c Index.
+  */
+template<int Index, typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs>
+{
+  typedef typename Lhs::Scalar Scalar;
+  typedef ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs> Previous;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, Scalar &res)
+  {
+    Previous::run(row, col, lhs, rhs, res);
+    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
+  }
+};
+
+/** \internal
+  * End of the unrolled recursion: \a res is assigned (not accumulated) the
+  * term of inner index 0.
+  */
+template<typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs>
+{
+  typedef typename Lhs::Scalar Scalar;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, Scalar &res)
+  {
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+  }
+};
+
+/** \internal
+  * Non-unrolled scalar product coefficient: a run-time loop over the inner
+  * dimension, whose length is read from lhs.cols(). The term of index 0
+  * initializes \a res, the following ones are accumulated.
+  */
+template<typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs>
+{
+  typedef typename Lhs::Scalar Scalar;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, Scalar& res)
+  {
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+    for(int k = 1; k < lhs.cols(); ++k)
+    {
+      res += lhs.coeff(row, k) * rhs.coeff(k, col);
+    }
+  }
+};
+
+// Guard against an infinite template recursion caused by buggy user code:
+// Index is -1 when Product unrolls with an inner size of 0. The result
+// argument is left untouched.
+template<typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs>
+{
+  inline static void run(int /*row*/, int /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/,
+                         typename Lhs::Scalar& /*res*/)
+  {
+  }
+};
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+/** \internal
+  * Recursive step of the inner-vectorized coefficient: lets the instance of
+  * index Index-PacketSize fill \a pres, then adds to it the packet product of
+  * the lhs packet read at (row, Index) and the rhs packet read at (Index, col).
+  */
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_coeff_vectorized_impl
+{
+  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
+  typedef ei_product_coeff_vectorized_impl<Index-PacketSize, Lhs, Rhs, PacketScalar> Previous;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+  {
+    Previous::run(row, col, lhs, rhs, pres);
+    pres = ei_padd(pres, ei_pmul(lhs.template packet<Aligned>(row, Index),
+                                 rhs.template packet<Aligned>(Index, col)));
+  }
+};
+
+/** \internal
+  * End of the recursion: \a pres is assigned the packet product of the lhs
+  * packet read at (row, 0) and the rhs packet read at (0, col).
+  */
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_coeff_vectorized_impl<0, Lhs, Rhs, PacketScalar>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+  {
+    pres = ei_pmul(lhs.template packet<Aligned>(row, 0),
+                   rhs.template packet<Aligned>(0, col));
+  }
+};
+
+/** \internal
+  * Scalar product coefficient computed with packets along the inner dimension.
+  *
+  * \c Index is the last inner index (Product passes InnerSize-1). The packet
+  * products are accumulated by ei_product_coeff_vectorized_impl, whose
+  * recursion starts at the offset of the last packet (Index+1-PacketSize) and
+  * goes down to 0; the accumulated packet is then reduced to the scalar result
+  * with ei_predux.
+  */
+template<int Index, typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
+{
+  typedef typename Lhs::PacketScalar PacketScalar;
+  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    PacketScalar pres;
+    ei_product_coeff_vectorized_impl<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
+    // the reduction is the only writer of res: an additional scalar pass would
+    // compute the whole coefficient a second time only to be overwritten here
+    res = ei_predux(pres);
+  }
+};
+
+/*******************
+*** Packet path  ***
+*******************/
+
+/** \internal
+  * Unrolled step of the row-major packet path: lets the Index-1 instance fill
+  * \a res, then combines, through ei_pmadd, the lhs coefficient (row, Index)
+  * replicated by ei_pset1 with the rhs packet read at (Index, col).
+  */
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar>
+{
+  typedef ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar> Previous;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    Previous::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)),
+                   rhs.template packet<Aligned>(Index, col),
+                   res);
+  }
+};
+
+/** \internal
+  * Unrolled step of the column-major packet path: lets the Index-1 instance
+  * fill \a res, then combines, through ei_pmadd, the lhs packet read at
+  * (row, Index) with the rhs coefficient (Index, col) replicated by ei_pset1.
+  */
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar>
+{
+  typedef ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar> Previous;
+
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    Previous::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(lhs.template packet<Aligned>(row, Index),
+                   ei_pset1(rhs.coeff(Index, col)),
+                   res);
+  }
+};
+
+/** \internal
+  * End of the row-major unrolled recursion: \a res is assigned the ei_pmul of
+  * the replicated lhs coefficient (row, 0) and the rhs packet read at (0, col).
+  */
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),
+                  rhs.template packet<Aligned>(0, col));
+  }
+};
+
+/** \internal
+  * End of the column-major unrolled recursion: \a res is assigned the ei_pmul
+  * of the lhs packet read at (row, 0) and the replicated rhs coefficient (0, col).
+  */
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    res = ei_pmul(lhs.template packet<Aligned>(row, 0),
+                  ei_pset1(rhs.coeff(0, col)));
+  }
+};
+
+/** \internal
+  * Non-unrolled row-major packet path: a run-time loop over the inner
+  * dimension, whose length is read from lhs.cols(). Each step combines the
+  * replicated lhs coefficient (row, i) with the rhs packet read at (i, col).
+  *
+  * The storage order is spelled out in the specialization (rather than left as
+  * a template parameter) so that this specialization is more specialized than
+  * the unrolled <RowMajorProduct, Index> one; otherwise instantiating
+  * <RowMajorProduct, Dynamic> is ambiguous between the two.
+  */
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<RowMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
+  {
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
+      for(int i = 1; i < lhs.cols(); i++)
+        res =  ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
+  }
+};
+
+/** \internal
+  * Non-unrolled column-major packet path: a run-time loop over the inner
+  * dimension, whose length is read from lhs.cols(). Each step combines the lhs
+  * packet read at (row, k) with the replicated rhs coefficient (k, col).
+  */
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
+  {
+    res = ei_pmul(lhs.template packet<Aligned>(row, 0),
+                  ei_pset1(rhs.coeff(0, col)));
+    for(int k = 1; k < lhs.cols(); ++k)
+    {
+      res = ei_pmadd(lhs.template packet<Aligned>(row, k),
+                     ei_pset1(rhs.coeff(k, col)),
+                     res);
+    }
+  }
+};
+
+/***************************************************************************
+* Cache friendly product callers and specific nested evaluation strategies
+***************************************************************************/
+
 /** \internal */
 template<typename Derived>
 template<typename Lhs,typename Rhs>
@ -339,6 +431,28 @@ inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFrien
  return derived();
 }

+/** \internal
+  * Defines \c type as the Matrix type which has the scalar type, the sizes and
+  * the maximal sizes of \a T, and whose flags are the ones computed by
+  * ei_corrected_matrix_flags from the flags of \a T, with RowMajorBit cleared.
+  */
+template<typename T> class ei_product_eval_to_column_major
+{
+    typedef ei_traits<T> Traits;
+    typedef typename Traits::Scalar Scalar_;
+    enum {
+      Rows_    = Traits::RowsAtCompileTime,
+      Cols_    = Traits::ColsAtCompileTime,
+      MaxRows_ = Traits::MaxRowsAtCompileTime,
+      MaxCols_ = Traits::MaxColsAtCompileTime,
+      Flags_   = Traits::Flags
+    };
+
+  public:
+    typedef Matrix<
+      Scalar_, Rows_, Cols_, MaxRows_, MaxCols_,
+      ei_corrected_matrix_flags<Scalar_, Rows_, Cols_, MaxRows_, MaxCols_, Flags_>::ret & ~RowMajorBit
+    > type;
+};
+
 template<typename T> struct ei_product_copy_rhs
 {
  typedef typename ei_meta_if<
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@ -140,7 +140,7 @@ enum { Aligned=0, UnAligned=1 };
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct, LazyProduct};
+enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct };


 #endif // EIGEN_CONSTANTS_H
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@ -26,7 +26,6 @@
 #define EIGEN_FORWARDDECLARATIONS_H

 template<typename T> struct ei_traits;
-template<typename Lhs, typename Rhs> struct ei_product_eval_mode;
 template<typename T> struct NumTraits;
 template<typename Scalar, int Rows, int Cols, int MaxRows, int MaxCols, unsigned int SuggestedFlags> class ei_corrected_matrix_flags;

@ -49,7 +48,7 @@ template<typename MatrixType> class Conjugate;
 template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
-template<typename Lhs, typename Rhs, int EvalMode=ei_product_eval_mode<Lhs,Rhs>::value> class Product;
+template<typename Lhs, typename Rhs, int ProductMode> class Product;
 template<typename CoeffsVectorType> class DiagonalMatrix;
 template<typename MatrixType> class DiagonalCoeffs;
 template<typename MatrixType> class Map;
@ -63,6 +62,8 @@ template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
 template<typename Scalar,int Dim> class Transform;

+template<typename Lhs, typename Rhs> struct ei_product_mode;
+template<typename Lhs, typename Rhs, int ProductMode = ei_product_mode<Lhs,Rhs>::value> struct ProductReturnType;

 template<typename Scalar> struct ei_scalar_sum_op;
 template<typename Scalar> struct ei_scalar_difference_op;
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@ -160,10 +160,7 @@ class ei_corrected_matrix_flags
           packet_access_bit
            = ei_packet_traits<Scalar>::size > 1
              && (is_big || inner_size%ei_packet_traits<Scalar>::size==0)
-              ? PacketAccessBit : 0,
-
-          _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit | PacketAccessBit | RowMajorBit))
-                                    | LinearAccessBit | DirectAccessBit
+              ? PacketAccessBit : 0
    };

  public:
@ -208,7 +205,7 @@ template<typename T> struct ei_must_nest_by_value { enum { ret = false }; };
 template<typename T> struct ei_must_nest_by_value<NestByValue<T> > { enum { ret = true }; };


-template<typename T, int n=1> struct ei_nested
+template<typename T, int n=1, typename EvalType = typename ei_eval<T>::type> struct ei_nested
 {
  typedef typename ei_meta_if<
    ei_must_nest_by_value<T>::ret,
@ -216,7 +213,7 @@ template<typename T, int n=1> struct ei_nested
    typename ei_meta_if<
      (int(ei_traits<T>::Flags) & EvalBeforeNestingBit)
      || ((n+1) * int(NumTraits<typename ei_traits<T>::Scalar>::ReadCost) <= (n-1) * int(T::CoeffReadCost)),
-      typename ei_eval<T>::type,
+      EvalType,
      const T&
    >::ret
  >::ret type;
--- a/Eigen/src/Geometry/Rotation.h
+++ b/Eigen/src/Geometry/Rotation.h
@ -107,10 +107,10 @@ struct ToRotationMatrix<Scalar, Dim, MatrixBase<OtherDerived> >
  *
  * \param _Scalar the scalar type, i.e., the type of the coefficients
  *
-  * This class is equivalent to a single scalar representating the rotation angle
+  * This class is equivalent to a single scalar representing the rotation angle
  * in radian with some additional features such as the conversion from/to
  * rotation matrix. Moreover this class aims to provide a similar interface
-  * to Quaternion in order to facilitate the writting of generic algorithm
+  * to Quaternion in order to facilitate the writing of generic algorithm
  * dealing with rotations.
  *
  * \sa class Quaternion, class Transform
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@ -103,17 +103,17 @@ public:
  inline VectorRef translation() { return m_matrix.template block<Dim,1>(0,Dim); }

  template<typename OtherDerived>
-  struct ProductReturnType
+  struct TransformProductReturnType
  {
    typedef typename ei_transform_product_impl<OtherDerived>::ResultType Type;
  };

  template<typename OtherDerived>
-  const typename ProductReturnType<OtherDerived>::Type
+  const typename TransformProductReturnType<OtherDerived>::Type
  operator * (const MatrixBase<OtherDerived> &other) const;

  /** Concatenates two transformations */
-  const Product<MatrixType,MatrixType>
+  const typename ProductReturnType<MatrixType,MatrixType>::Type
  operator * (const Transform& other) const
  { return m_matrix * other.matrix(); }

@ -192,7 +192,7 @@ QMatrix Transform<Scalar,Dim>::toQMatrix(void) const

 template<typename Scalar, int Dim>
 template<typename OtherDerived>
-const typename Transform<Scalar,Dim>::template ProductReturnType<OtherDerived>::Type
+const typename Transform<Scalar,Dim>::template TransformProductReturnType<OtherDerived>::Type
 Transform<Scalar,Dim>::operator*(const MatrixBase<OtherDerived> &other) const
 {
  return ei_transform_product_impl<OtherDerived>::run(*this,other.derived());
@ -380,7 +380,7 @@ template<typename Other>
 struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim+1,Dim+1>
 {
  typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
-  typedef Product<MatrixType,Other> ResultType;
+  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
  static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
  { return tr.matrix() * other; }
 };
@ -390,7 +390,7 @@ template<typename Other>
 struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim+1,1>
 {
  typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
-  typedef Product<MatrixType,Other> ResultType;
+  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
  static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
  { return tr.matrix() * other; }
 };
@ -404,7 +404,7 @@ struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim,1>
      ei_scalar_multiple_op<Scalar>,
      NestByValue<CwiseBinaryOp<
        ei_scalar_sum_op<Scalar>,
-        NestByValue<Product<NestByValue<MatrixType>,Other> >,
+        NestByValue<typename ProductReturnType<NestByValue<MatrixType>,Other>::Type >,
        NestByValue<typename Transform<Scalar,Dim>::VectorRef> > >
      > ResultType;
  // FIXME shall we offer an optimized version when the last row is known to be 0,0...,0,1 ?
--- a/bench/basicbenchmark.cpp
+++ b/bench/basicbenchmark.cpp
@ -4,7 +4,7 @@

 int main(int argc, char *argv[])
 {
-  // disbale floating point exceptions
+  // disable floating point exceptions
  // this leads to more stable bench results
  // (this is done by default by ICC)
  #ifndef __INTEL_COMPILER
--- a/Eigen/src/Array/ArrayBase.h
+++ b/Eigen/src/Array/ArrayBase.h
--- a/test/product.cpp
+++ b/test/product.cpp
@ -61,7 +61,7 @@ template<typename MatrixType> void product(const MatrixType& m)
  // (we use Transpose.h but this doesn't count as a test for it)
  VERIFY_IS_APPROX((m1*m1.transpose())*m2,  m1*(m1.transpose()*m2));
  m3 = m1;
-  m3 *= (m1.transpose() * m2);
+  m3 *= m1.transpose() * m2;
  VERIFY_IS_APPROX(m3,                      m1 * (m1.transpose()*m2));
  VERIFY_IS_APPROX(m3,                      m1.lazy() * (m1.transpose()*m2));

@ -91,6 +91,8 @@ void test_product()
    CALL_SUBTEST( product(Matrix3i()) );
    CALL_SUBTEST( product(Matrix<float, 3, 2>()) );
    CALL_SUBTEST( product(Matrix4d()) );
+    CALL_SUBTEST( product(Matrix4f()) );
+    CALL_SUBTEST( product(MatrixXf(3,5)) );
  }
  for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST( product(MatrixXf(ei_random<int>(1,320), ei_random<int>(1,320))) );