Make the explicit vectorization much more flexible:

- support dynamic sizes - support arbitrary matrix sizes when the matrix can be seen as a 1D array (except for fixed-size matrices, whose size in bytes must be a multiple of 16; this is to allow compact storage of a vector of matrices). Note that the explicit vectorization is still experimental and far from being completely tested.
2025-04-06 19:10:36 +08:00 · 2008-04-25 15:46:18 +00:00 · 2008-04-25 15:46:18 +00:00 · a451835bce
commit a451835bce
parent 30d47b5250
10 changed files with 264 additions and 100 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -2,7 +2,7 @@
 #define EIGEN_CORE_H

 #ifndef EIGEN_DONT_VECTORIZE
-#ifdef __SSE2__
+#if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=2)))) // non-GCC, or GCC >= 4.2 (major first, then minor, so 5.0 etc. qualify)
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #include <emmintrin.h>
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -99,7 +99,11 @@ struct ei_matrix_assignment_packet_unroller<Derived1, Derived2, Dynamic>

 template <typename Derived, typename OtherDerived,
 bool Vectorize = (Derived::Flags & OtherDerived::Flags & VectorizableBit)
-              && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))>
+              && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))
+              && (  (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+                  ||((Derived::Flags&RowMajorBit)
+                    ? Derived::ColsAtCompileTime!=Dynamic && (Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0)
+                    : Derived::RowsAtCompileTime!=Dynamic && (Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0)) )>
 struct ei_assignment_impl;

 template<typename Derived>
@ -107,6 +111,7 @@ template<typename OtherDerived>
 Derived& MatrixBase<Derived>
  ::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
+//   std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n";
  ei_assignment_impl<Derived,OtherDerived>::execute(derived(),other.derived());
  return derived();
 }
@ -178,6 +183,7 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
    ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    if(unroll)
    {
+//       std::cout << "vectorized unrolled\n";
      ei_matrix_assignment_packet_unroller
        <Derived, OtherDerived,
          unroll && int(Derived::SizeAtCompileTime)>=ei_packet_traits<typename Derived::Scalar>::size
@ -188,15 +194,61 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
    {
      if(OtherDerived::Flags&RowMajorBit)
      {
-        for(int i = 0; i < dst.rows(); i++)
-          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
+        if ( (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+          &&  (Derived::ColsAtCompileTime==Dynamic
+            || Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0))
+        {
+//           std::cout << "vectorized linear row major\n";
+          const int size = dst.rows() * dst.cols();
+          const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size;
+          int index = 0;
+          for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
+          {
+            // Row-major: linear index = i*cols + j, so split with cols(), not rows(). FIXME the div/mod per packet is not really efficient
+            int i = index/dst.cols();
+            int j = index%dst.cols();
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+          }
+          // Scalar tail: copy the coefficients past the last full packet, walking the same row-major linear order.
+          for(int k = alignedSize; k < size; k++)
+            dst.coeffRef(k/dst.cols(), k%dst.cols()) = src.coeff(k/dst.cols(), k%dst.cols());
+        }
+        else
+        {
+//           std::cout << "vectorized normal row major\n";
+          for(int i = 0; i < dst.rows(); i++)
+            for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
+              dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+        }
      }
      else
      {
-        for(int j = 0; j < dst.cols(); j++)
-          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
+        if ((Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+          && ( Derived::RowsAtCompileTime==Dynamic
+            || Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0))
+        {
+//           std::cout << "vectorized linear col major\n";
+          const int size = dst.rows() * dst.cols();
+          const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size;
+          int index = 0;
+          for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
+          {
+            // FIXME the following is not really efficient
+            int i = index%dst.rows();
+            int j = index/dst.rows();
            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+          }
+          for(int j = alignedSize/dst.rows(); j < dst.cols(); j++)
+            for(int i = alignedSize%dst.rows(); i < dst.rows(); i++)
+              dst.coeffRef(i, j) = src.coeff(i, j);
+        }
+        else
+        {
+//           std::cout << "vectorized normal col major\n";
+          for(int j = 0; j < dst.cols(); j++)
+            for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
+              dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+        }
      }
    }
  }
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@ -31,8 +31,8 @@
  *
  * \param NullaryOp template functor implementing the operator
  *
-  * This class represents an expression of a generic zeroary operator.
-  * It is the return type of the ones(), zero(), constant() and random() functions,
+  * This class represents an expression of a generic nullary operator.
+  * It is the return type of the ones(), zero(), constant(), identity() and random() functions,
  * and most of the time this is the only way it is used.
  *
  * However, if you want to write a function returning such an expression, you
@ -94,12 +94,18 @@ class CwiseNullaryOp : ei_no_assignment_operator,
 };


-/* \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
+/** \returns an expression of a matrix defined by a custom functor \a func
  *
-  * The template parameter \a CustomNullaryOp is the type of the functor
-  * of the custom operator (see class CwiseNullaryOp for an example)
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
  *
-  * \sa class CwiseNullaryOp, MatrixBase::operator+, MatrixBase::operator-, MatrixBase::cwiseProduct, MatrixBase::cwiseQuotient
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so cwiseCreate(const CustomNullaryOp&)
+  * should be used instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
@ -109,6 +115,21 @@ MatrixBase<Derived>::cwiseCreate(int rows, int cols, const CustomNullaryOp& func
  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
 }

+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so cwiseCreate(const CustomNullaryOp&)
+  * should be used instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 template<typename CustomNullaryOp>
 const CwiseNullaryOp<CustomNullaryOp, Derived>
@ -119,6 +140,15 @@ MatrixBase<Derived>::cwiseCreate(int size, const CustomNullaryOp& func)
  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
 }

+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 template<typename CustomNullaryOp>
 const CwiseNullaryOp<CustomNullaryOp, Derived>
@ -127,7 +157,16 @@ MatrixBase<Derived>::cwiseCreate(const CustomNullaryOp& func)
  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows(), cols(), func);
 }

-/* \returns an expression of the coefficient-wise \< operator of *this and \a other
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so constant(const Scalar&)
+  * should be used instead.
+  *
+  * The returned expression wraps an ei_scalar_constant_op functor holding \a value.
  *
  * \sa class CwiseNullaryOp
  */
@ -138,6 +177,21 @@ MatrixBase<Derived>::constant(int rows, int cols, const Scalar& value)
  return cwiseCreate(rows, cols, ei_scalar_constant_op<Scalar>(value));
 }

+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so constant(const Scalar&)
+  * should be used instead.
+  *
+  * The returned expression wraps an ei_scalar_constant_op functor holding \a value.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::constant(int size, const Scalar& value)
@ -145,6 +199,15 @@ MatrixBase<Derived>::constant(int size, const Scalar& value)
  return cwiseCreate(size, ei_scalar_constant_op<Scalar>(value));
 }

+/** \returns an expression of a constant matrix of value \a value
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The returned expression is a CwiseNullaryOp over an ei_scalar_constant_op functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::constant(const Scalar& value)
@ -163,6 +226,10 @@ bool MatrixBase<Derived>::isEqualToConstant
  return true;
 }

+/** Sets all coefficients in this expression to \a value.
+  *
+  * \sa class CwiseNullaryOp, zero(), ones()
+  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setConstant(const Scalar& value)
 {
@ -238,7 +305,7 @@ MatrixBase<Derived>::zero()
  * Example: \include MatrixBase_isZero.cpp
  * Output: \verbinclude MatrixBase_isZero.out
  *
-  * \sa class Zero, zero()
+  * \sa class CwiseNullaryOp, zero()
  */
 template<typename Derived>
 bool MatrixBase<Derived>::isZero
@ -256,7 +323,7 @@ bool MatrixBase<Derived>::isZero
  * Example: \include MatrixBase_setZero.cpp
  * Output: \verbinclude MatrixBase_setZero.out
  *
-  * \sa class Zero, zero()
+  * \sa class CwiseNullaryOp, zero()
  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setZero()
@ -333,7 +400,7 @@ MatrixBase<Derived>::ones()
  * Example: \include MatrixBase_isOnes.cpp
  * Output: \verbinclude MatrixBase_isOnes.out
  *
-  * \sa class Ones, ones()
+  * \sa class CwiseNullaryOp, ones()
  */
 template<typename Derived>
 bool MatrixBase<Derived>::isOnes
@ -347,7 +414,7 @@ bool MatrixBase<Derived>::isOnes
  * Example: \include MatrixBase_setOnes.cpp
  * Output: \verbinclude MatrixBase_setOnes.out
  *
-  * \sa class Ones, ones()
+  * \sa class CwiseNullaryOp, ones()
  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setOnes()
@ -424,7 +491,7 @@ MatrixBase<Derived>::random()
  * Example: \include MatrixBase_setRandom.cpp
  * Output: \verbinclude MatrixBase_setRandom.out
  *
-  * \sa class Random, ei_random()
+  * \sa class CwiseNullaryOp, ei_random()
  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setRandom()
@ -479,7 +546,7 @@ MatrixBase<Derived>::identity()
  * Example: \include MatrixBase_isIdentity.cpp
  * Output: \verbinclude MatrixBase_isIdentity.out
  *
-  * \sa class Identity, identity(), identity(int,int), setIdentity()
+  * \sa class CwiseNullaryOp, identity(), identity(int,int), setIdentity()
  */
 template<typename Derived>
 bool MatrixBase<Derived>::isIdentity
@ -509,7 +576,7 @@ bool MatrixBase<Derived>::isIdentity
  * Example: \include MatrixBase_setIdentity.cpp
  * Output: \verbinclude MatrixBase_setIdentity.out
  *
-  * \sa class Identity, identity(), identity(int,int), isIdentity()
+  * \sa class CwiseNullaryOp, identity(), identity(int,int), isIdentity()
  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setIdentity()
--- a/Eigen/src/Core/Lazy.h
+++ b/Eigen/src/Core/Lazy.h
@ -72,6 +72,11 @@ template<typename ExpressionType> class Lazy
      return m_expression.coeff(row, col);
    }

+    PacketScalar _packetCoeff(int row, int col) const // packet read: forwarded unchanged to the wrapped expression
+    {
+      return m_expression.packetCoeff(row, col);
+    }
+
  protected:
    const typename ExpressionType::Nested m_expression;
 };
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@ -79,7 +79,7 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _SuggestedFlags, _MaxRows, _MaxCo
    ColsAtCompileTime = _Cols,
    MaxRowsAtCompileTime = _MaxRows,
    MaxColsAtCompileTime = _MaxCols,
-    Flags = ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _SuggestedFlags>::ret,
+    Flags = ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _SuggestedFlags>::ret,
    CoeffReadCost = NumTraits<Scalar>::ReadCost
  };
 };
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -75,11 +75,8 @@ template<typename Derived> class MatrixBase
          * it is set to the \a Dynamic constant.
          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */

-      SizeAtCompileTime
-        = ei_traits<Derived>::RowsAtCompileTime == Dynamic
-        || ei_traits<Derived>::ColsAtCompileTime == Dynamic
-        ? Dynamic
-        : ei_traits<Derived>::RowsAtCompileTime * ei_traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::RowsAtCompileTime,
+                                                  ei_traits<Derived>::ColsAtCompileTime>::ret,
        /**< This is equal to the number of coefficients, i.e. the number of
          * rows times the number of columns, or to \a Dynamic if this is not
          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
@ -106,11 +103,8 @@ template<typename Derived> class MatrixBase
          * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
          */

-      MaxSizeAtCompileTime
-        = ei_traits<Derived>::MaxRowsAtCompileTime == Dynamic
-        || ei_traits<Derived>::MaxColsAtCompileTime == Dynamic
-        ? Dynamic
-        : ei_traits<Derived>::MaxRowsAtCompileTime * ei_traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::MaxRowsAtCompileTime,
+                                                     ei_traits<Derived>::MaxColsAtCompileTime>::ret,
        /**< This value is equal to the maximum possible number of coefficients that this expression
          * might have. If this expression might have an arbitrarily high number of coefficients,
          * this value is set to \a Dynamic.
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@ -49,6 +49,28 @@ template <typename T, int Size> struct ei_aligned_array<T,Size,false>
  T array[Size];
 };

+template<typename T>
+T* ei_aligned_malloc(size_t size) // returns storage for `size` elements of T (size is an element count, not bytes)
+{
+  #ifdef EIGEN_VECTORIZE
+  if (ei_packet_traits<T>::size>1) // packet size > 1: ask _mm_malloc for a 16-byte aligned buffer
+    return static_cast<T*>(_mm_malloc(sizeof(T)*size, 16));
+  else
+  #endif
+    return new T[size]; // otherwise (and always when EIGEN_VECTORIZE is not defined) use array new
+}
+
+template<typename T>
+void ei_aligned_free(T* ptr) // releases ptr using the branch that mirrors ei_aligned_malloc<T>
+{
+  #ifdef EIGEN_VECTORIZE
+  if (ei_packet_traits<T>::size>1) // same condition under which ei_aligned_malloc calls _mm_malloc
+    _mm_free(ptr);
+  else
+  #endif
+    delete[] ptr; // counterpart of the `new T[size]` branch
+}
+
 // purely fixed-size matrix
 template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage
 {
@ -127,7 +149,7 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
    int m_cols;
  public:
    ei_matrix_storage(int size, int rows, int cols)
-      : m_data(new T[size]), m_rows(rows), m_cols(cols) {}
+      : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {}
    ~ei_matrix_storage() { ei_aligned_free(m_data); } // m_data comes from ei_aligned_malloc (possibly _mm_malloc), so delete[] would be a mismatched release
    int rows(void) const {return m_rows;}
    int cols(void) const {return m_cols;}
@ -135,8 +157,8 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
    {
      if(size != m_rows*m_cols)
      {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
      }
      m_rows = rows;
      m_cols = cols;
@ -151,7 +173,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
    T *m_data;
    int m_cols;
  public:
-    ei_matrix_storage(int size, int, int cols) : m_data(new T[size]), m_cols(cols) {}
+    ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {}
    ~ei_matrix_storage() { ei_aligned_free(m_data); } // m_data comes from ei_aligned_malloc (possibly _mm_malloc), so delete[] would be a mismatched release
    static int rows(void) {return _Rows;}
    int cols(void) const {return m_cols;}
@ -159,8 +181,8 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
    {
      if(size != _Rows*m_cols)
      {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
      }
      m_cols = cols;
    }
@ -174,7 +196,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
    T *m_data;
    int m_rows;
  public:
-    ei_matrix_storage(int size, int rows, int) : m_data(new T[size]), m_rows(rows) {}
+    ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {}
    ~ei_matrix_storage() { ei_aligned_free(m_data); } // m_data comes from ei_aligned_malloc (possibly _mm_malloc), so delete[] would be a mismatched release
    int rows(void) const {return m_rows;}
    static int cols(void) {return _Cols;}
@ -182,8 +204,8 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
    {
      if(size != m_rows*_Cols)
      {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
      }
      m_rows = rows;
    }
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -135,7 +135,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
          | EvalBeforeAssigningBit
          | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0))
          & (
-              ~(RowMajorBit | VectorizableBit)
+              ~(RowMajorBit | VectorizableBit | Like1DArrayBit)
              | (
                  (
                    !(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit)
@ -178,7 +178,11 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm

    /** \internal */
    template<typename DestDerived>
-    void _cacheOptimalEval(DestDerived& res) const;
+    void _cacheOptimalEval(DestDerived& res, ei_meta_false) const;
+    #ifdef EIGEN_VECTORIZE
+    template<typename DestDerived>
+    void _cacheOptimalEval(DestDerived& res, ei_meta_true) const;
+    #endif

  private:

@ -267,59 +271,29 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)
 }

 template<typename Derived>
-template<typename Derived1, typename Derived2>
-Derived& MatrixBase<Derived>::lazyAssign(const Product<Derived1,Derived2,CacheOptimalProduct>& product)
+template<typename Lhs, typename Rhs>
+Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product)
 {
-  product._cacheOptimalEval(*this);
+  product._cacheOptimalEval(*this, // second argument is a tag object that selects the packet or the scalar overload
+    #ifdef EIGEN_VECTORIZE
+    typename ei_meta_if<(Flags & VectorizableBit)
+      && (!(Lhs::Flags & RowMajorBit)
+      && (Lhs::RowsAtCompileTime!=Dynamic)
+      && (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ),
+      ei_meta_true,ei_meta_false>::ret()
+    #else
+    ei_meta_false() // must be a value, not a type name: a bare `ei_meta_false` does not compile as an argument
+    #endif
+    );
  return derived();
 }

 template<typename Lhs, typename Rhs, int EvalMode>
 template<typename DestDerived>
-void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_false) const
 {
  res.setZero();
  const int cols4 = m_lhs.cols() & 0xfffffffC;
-  #ifdef EIGEN_VECTORIZE
-  if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
-  {
-    for(int k=0; k<this->cols(); k++)
-    {
-      int j=0;
-      for(; j<cols4; j+=4)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
-        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
-        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
-        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
-        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-        {
-          res.writePacketCoeff(i,k,\
-            ei_padd(
-              res.packetCoeff(i,k),
-              ei_padd(
-                ei_padd(
-                  ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
-                  ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
-                ei_padd(
-                  ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
-                  ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
-                )
-              )
-            )
-          );
-        }
-      }
-      for(; j<m_lhs.cols(); ++j)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
-        for (int i=0; i<this->rows(); ++i)
-          res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
-      }
-    }
-  }
-  else
-  #endif // EIGEN_VECTORIZE
  {
    for(int k=0; k<this->cols(); ++k)
    {
@ -344,4 +318,48 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
  }
 }

+#ifdef EIGEN_VECTORIZE
+template<typename Lhs, typename Rhs, int EvalMode>
+template<typename DestDerived>
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const // packet overload: accumulates lhs*rhs into res one packet of rows at a time
+{
+  res.setZero();
+  const int cols4 = m_lhs.cols() & 0xfffffffC; // lhs column count rounded down to a multiple of 4
+  for(int k=0; k<this->cols(); k++)
+  {
+    int j=0;
+    for(; j<cols4; j+=4) // main loop: four lhs columns per pass
+    {
+      const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
+      const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
+      const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
+      const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
+      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+      {
+        res.writePacketCoeff(i,k,\
+          ei_padd(
+            res.packetCoeff(i,k),
+            ei_padd(
+              ei_padd(
+                ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
+                ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+              ei_padd(
+                ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
+                ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
+              )
+            )
+          )
+        );
+      }
+    }
+    for(; j<m_lhs.cols(); ++j) // leftover lhs columns (fewer than four)
+    {
+      const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
+      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) // step by whole packets, as in the main loop
+        res.writePacketCoeff(i,k,ei_padd(res.packetCoeff(i,k), ei_pmul(tmp, m_lhs.packetCoeff(i,j)))); // accumulate into res instead of overwriting it
+    }
+  }
+}
+#endif // EIGEN_VECTORIZE
+
 #endif // EIGEN_PRODUCT_H
--- a/Eigen/src/Core/Temporary.h
+++ b/Eigen/src/Core/Temporary.h
@ -71,6 +71,11 @@ template<typename ExpressionType> class Temporary
      return m_expression.coeff(row, col);
    }

+    PacketScalar _packetCoeff(int row, int col) const // packet read: forwarded unchanged to the wrapped expression
+    {
+      return m_expression.packetCoeff(row, col);
+    }
+
  protected:
    const ExpressionType m_expression;
 };
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@ -70,6 +70,9 @@ struct ei_meta_if <false, Then, Else> { typedef Else ret; };
 template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
 template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };

+struct ei_meta_true {};  // empty tag type; passed by value to pick an overload at compile time
+struct ei_meta_false {}; // empty tag type; counterpart of ei_meta_true
+

 /** \internal
  * Convenient struct to get the result type of a unary or binary functor.
@ -145,19 +148,12 @@ template<typename T> struct ei_packet_traits
  enum {size=1};
 };

-template<typename Scalar, int Rows, int Cols, unsigned int SuggestedFlags>
+template<typename Scalar, int Size, unsigned int SuggestedFlags>
 class ei_corrected_matrix_flags
 {
    enum { is_vectorizable
            = ei_packet_traits<Scalar>::size > 1
-              && Rows!=Dynamic
-              && Cols!=Dynamic
-              &&
-              (
-                SuggestedFlags&RowMajorBit
-                  ? Cols%ei_packet_traits<Scalar>::size==0
-                  : Rows%ei_packet_traits<Scalar>::size==0
-              ),
+              && (Size%ei_packet_traits<Scalar>::size==0),
          _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit)) | Like1DArrayBit
    };

@ -168,19 +164,24 @@ class ei_corrected_matrix_flags
    };
 };

+template<int _Rows, int _Cols> struct ei_size_at_compile_time // compile-time coefficient count for a _Rows x _Cols shape
+{
+  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols }; // Dynamic as soon as either dimension is Dynamic
+};
+
 template<typename T> class ei_eval
 {
    typedef typename ei_traits<T>::Scalar _Scalar;
-    enum { _Rows = ei_traits<T>::RowsAtCompileTime,
-          _Cols = ei_traits<T>::ColsAtCompileTime,
+    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
          _Flags = ei_traits<T>::Flags
    };

  public:
    typedef Matrix<_Scalar,
-                  _Rows,
-                  _Cols,
-                  ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _Flags>::ret,
+                  ei_traits<T>::RowsAtCompileTime,
+                  ei_traits<T>::ColsAtCompileTime,
+                  ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret,
                  ei_traits<T>::MaxRowsAtCompileTime,
                  ei_traits<T>::MaxColsAtCompileTime> type;
 };