@@ -73,7 +73,7 @@ struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packetCoeff<Aligned>(Index, col), res);
   }
 };
@@ -83,7 +83,7 @@ struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    res = ei_pmadd(lhs.template packetCoeff<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
@@ -92,7 +92,7 @@ struct ei_packet_product_unroller<true, 0, Size, Lhs, Rhs, PacketScalar>
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packetCoeff<Aligned>(0, col));
   }
 };
@@ -101,7 +101,7 @@ struct ei_packet_product_unroller<false, 0, Size, Lhs, Rhs, PacketScalar>
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packetCoeff<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };
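
These four specializations implement compile-time loop unrolling: Index counts down, each step issues one broadcast (ei_pset1) plus one fused multiply-add (ei_pmadd), and the Index==0 specializations terminate the recursion with a plain ei_pmul. A minimal standalone sketch of the same idiom, with scalar arithmetic standing in for the packet ops (illustrative only, not Eigen code):

    #include <iostream>

    template<int Index>
    struct dot_unroller
    {
      static void run(const float* a, const float* b, float& res)
      {
        dot_unroller<Index-1>::run(a, b, res);
        res += a[Index] * b[Index];   // plays the role of the ei_pmadd step
      }
    };

    template<>
    struct dot_unroller<0>
    {
      static void run(const float* a, const float* b, float& res)
      {
        res = a[0] * b[0];            // plays the role of the ei_pmul base case
      }
    };

    int main()
    {
      float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
      float res;
      dot_unroller<3>::run(a, b, res); // fully unrolled at compile time
      std::cout << res << "\n";        // prints 70
      return 0;
    }
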
@@ -111,6 +111,12 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
   static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };
 
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
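
ProductPacketCoeffImpl selects the packet-access path through a partial specialization on its bool parameter; the Product methods it forwards to (_packetCoeffRowMajor and _packetCoeffColumnMajor) appear in a later hunk. A reduced model of that dispatch mechanism, with hypothetical names:

    #include <iostream>

    struct FakeProduct   // hypothetical stand-in for Product
    {
      int packetRowMajor(int r, int c) const    { return r * 10 + c; }
      int packetColumnMajor(int r, int c) const { return c * 10 + r; }
    };

    template<typename Product, bool RowMajor = true> struct PacketCoeffImpl
    {
      static int execute(const Product& p, int r, int c)
      { return p.packetRowMajor(r, c); }
    };

    template<typename Product> struct PacketCoeffImpl<Product, false>
    {
      static int execute(const Product& p, int r, int c)
      { return p.packetColumnMajor(r, c); }
    };

    int main()
    {
      FakeProduct p;
      std::cout << PacketCoeffImpl<FakeProduct, true>::execute(p, 1, 2) << "\n";  // 12
      std::cout << PacketCoeffImpl<FakeProduct, false>::execute(p, 1, 2) << "\n"; // 21
      return 0;
    }
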
@@ -139,16 +145,53 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
   enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
              && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
               ? CacheOptimalProduct : NormalProduct };
 };
 
+template<typename T> class ei_product_eval_to_column_major
+{
+    typedef typename ei_traits<T>::Scalar _Scalar;
+    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
+          _Flags = ei_traits<T>::Flags
+    };
+
+  public:
+    typedef Matrix<_Scalar,
+                   ei_traits<T>::RowsAtCompileTime,
+                   ei_traits<T>::ColsAtCompileTime,
+                   ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
+                   ei_traits<T>::MaxRowsAtCompileTime,
+                   ei_traits<T>::MaxColsAtCompileTime> type;
+};
+
+template<typename T, int n=1> struct ei_product_nested_rhs
+{
+  typedef typename ei_meta_if<
+    ei_is_temporary<T>::ret && !(ei_traits<T>::Flags & RowMajorBit),
+    T,
+    typename ei_meta_if<
+         (ei_traits<T>::Flags & EvalBeforeNestingBit)
+      || (ei_traits<T>::Flags & RowMajorBit)
+      || (!(ei_traits<T>::Flags & ReferencableBit))
+      || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
+      typename ei_product_eval_to_column_major<T>::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
 template<typename Lhs, typename Rhs, int EvalMode>
 struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 {
   typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+  // the cache friendly product evals lhs once only
+  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
+  typedef typename ei_nested<Lhs, EvalMode==CacheOptimalProduct ? 0 : Rhs::ColsAtCompileTime>::type LhsNested;
+  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
+  typedef typename ei_meta_if<EvalMode==CacheOptimalProduct,
+      typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
+      typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
   typedef typename ei_unref<LhsNested>::type _LhsNested;
   typedef typename ei_unref<RhsNested>::type _RhsNested;
   enum {
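
ei_product_nested_rhs relies on ei_meta_if, Eigen's compile-time type selector (the ancestor of std::conditional): when the rhs is already a referencable column-major temporary it is nested by reference, otherwise it is first evaluated into the column-major matrix type built by ei_product_eval_to_column_major. A minimal stand-in for the selector itself (a sketch, not the Eigen implementation):

    template<bool Condition, typename Then, typename Else>
    struct meta_if { typedef Then ret; };

    template<typename Then, typename Else>
    struct meta_if<false, Then, Else> { typedef Else ret; };

    // e.g. meta_if<true, int, char>::ret is int; compile-time check:
    typedef meta_if<true, int, char>::ret selected;
    typedef char check_selected[sizeof(selected) == sizeof(int) ? 1 : -1];
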
@@ -160,16 +203,20 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
     // the vectorization flags are only used by the normal product,
     // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
+    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0,
+    _RowMajor = (RhsFlags & RowMajorBit)
+             && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
     _LostBits = DefaultLostFlagMask & ~(
         (_RowMajor ? 0 : RowMajorBit)
       | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
     Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
-      // | EvalBeforeAssigningBit //FIXME
+#ifndef EIGEN_WIP_PRODUCT_DIRTY
+      | EvalBeforeAssigningBit //FIXME
+#endif
       | EvalBeforeNestingBit
       | (_Vectorizable ? VectorizableBit : 0),
     CoeffReadCost
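
The Flags computation is plain bit arithmetic: take the union of both operands' flags, keep only the bits allowed by _LostBits, then OR in the product's own bits (note the `? 0 : 0` change above, which this revision uses to force vectorization of the normal product off). A tiny model of the masking, with made-up bit values for illustration:

    #include <iostream>

    enum { RowMajorBit = 0x1, VectorizableBit = 0x2, LargeBit = 0x4 }; // made-up values

    int main()
    {
      unsigned lhsFlags = RowMajorBit | VectorizableBit;
      unsigned rhsFlags = VectorizableBit | LargeBit;
      unsigned lostBits = ~0u & ~unsigned(RowMajorBit); // say row-majorness is lost
      unsigned flags    = (lhsFlags | rhsFlags) & lostBits;
      std::cout << std::hex << flags << "\n";           // 6: vectorizable | large
      return 0;
    }
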
@@ -197,11 +244,16 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       PacketSize = ei_packet_traits<Scalar>::size,
 #if (defined __i386__)
       // i386 architectures provides only 8 xmmm register,
-      // so let's reduce the max number of rows processed at once
+      // so let's reduce the max number of rows processed at once.
+      // NOTE that so far the maximal supported value is 8.
       MaxBlockRows = 4,
+      MaxBlockRows_ClampingMask = 0xFFFFFC,
 #else
       MaxBlockRows = 8,
+      MaxBlockRows_ClampingMask = 0xFFFFF8,
 #endif
       // maximal size of the blocks fitted in L2 cache
       MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
     };
 
     Product(const Lhs& lhs, const Rhs& rhs)
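
MaxBlockRows_ClampingMask lets the row loop round an index down to a multiple of MaxBlockRows with a single AND (used as l2blockRowEnd & MaxBlockRows_ClampingMask below); for a power-of-two block size this equals x - x % MaxBlockRows as long as the index stays below 2^24. A quick self-check of the two constants:

    #include <cassert>

    int main()
    {
      for (int x = 0; x < (1 << 20); ++x)
      {
        assert((x & 0xFFFFFC) == x - x % 4); // MaxBlockRows == 4 (i386)
        assert((x & 0xFFFFF8) == x - x % 8); // MaxBlockRows == 8 (otherwise)
      }
      return 0;
    }
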
@@ -214,16 +266,6 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     template<typename DestDerived>
     void _cacheFriendlyEval(DestDerived& res) const;
 
-    /** \internal */
-    template<typename DestDerived, int RhsAlignment, int ResAlignment>
-    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
-
-    /** \internal */
-    template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
-    void _cacheFriendlyEvalKernel(DestDerived& res,
-                                  int l2i, int l2j, int l2k, int l1i,
-                                  int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
-
   private:
 
     int _rows() const { return m_lhs.rows(); }
@@ -255,7 +297,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
+        ei_packet_product_unroller<Flags&RowMajorBit ? true : false, Lhs::ColsAtCompileTime-1,
           Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT
             ? Lhs::ColsAtCompileTime : Dynamic,
           _LhsNested, _RhsNested, PacketScalar>
@@ -269,21 +311,30 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     PacketScalar _packetCoeffRowMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
+      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.template packetCoeff<Aligned>(0, col));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
+        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.template packetCoeff<Aligned>(i, col), res);
       return res;
     }
 
     PacketScalar _packetCoeffColumnMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
+      res = ei_pmul(m_lhs.template packetCoeff<Aligned>(row, 0), ei_pset1(m_rhs.coeff(0, col)));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
+        res = ei_pmadd(m_lhs.template packetCoeff<Aligned>(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
     }
 
+    /** \internal */
+    template<typename DestDerived, int RhsAlignment, int ResAlignment>
+    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
+
+    /** \internal */
+    template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
+    void _cacheFriendlyEvalKernel(DestDerived& res,
+                                  int l2i, int l2j, int l2k, int l1i,
+                                  int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
+
   protected:
     const LhsNested m_lhs;
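
_packetCoeffRowMajor broadcasts one lhs scalar across a packet and accumulates it against a packet of rhs coefficients; the <Aligned> template argument introduced above lets the rhs load use the cheaper aligned instruction. Written directly with SSE intrinsics for four floats, the pattern looks like this (a sketch of what ei_pset1/ei_pmul/ei_pmadd roughly expand to on SSE, where ei_pmadd(a,b,c) computes a*b+c):

    #include <xmmintrin.h>
    #include <iostream>

    int main()
    {
      float rhs0[4] = {1, 2, 3, 4};   // stands in for rhs packet at (0, col)
      float rhs1[4] = {5, 6, 7, 8};   // stands in for rhs packet at (1, col)
      float lhs[2]  = {2, 3};         // lhs.coeff(row, 0) and lhs.coeff(row, 1)

      // res = pset1(lhs[0]) * rhs0, then res = pset1(lhs[1]) * rhs1 + res
      __m128 res = _mm_mul_ps(_mm_set1_ps(lhs[0]), _mm_loadu_ps(rhs0));
      res = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(lhs[1]), _mm_loadu_ps(rhs1)), res);

      float out[4];
      _mm_storeu_ps(out, res);
      for (int i = 0; i < 4; ++i)
        std::cout << out[i] << " ";   // 17 22 27 32
      std::cout << "\n";
      return 0;
    }
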
@@ -361,7 +412,8 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
     int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows;
     const Scalar* localB = &block[offsetblock];
 
-    int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+    // int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+    Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j));
 
     // don't worry, dst is a set of registers
     PacketScalar dst[BlockRows];
@@ -383,7 +435,8 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
 
     // unaligned loads are expensive, therefore let's preload the next element in advance
     if (RhsAlignment==UnAligned)
-      tmp1 = ei_ploadu(&m_rhs.derived().data()[l1jsize+l2k]);
+      //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]);
+      tmp1 = ei_ploadu(&rhsColumn[l2k]);
 
     for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
     {
@@ -392,13 +445,15 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
       //PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
       if (RhsAlignment==Aligned)
       {
-        tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        //tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        tmp = ei_pload(&rhsColumn[k]);
       }
       else
       {
         tmp = tmp1;
         if (k+PacketSize<l2blockSizeEnd)
-          tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]);
+          //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]);
+          tmp1 = ei_ploadu(&rhsColumn[k+PacketSize]);
       }
 
       dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows   ])), dst[0]);
@@ -421,16 +476,16 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
     // we have up to 4 packets (for doubles: 8 rows / 2)
     if (PacketRows>=1)
       res.template writePacketCoeff<Aligned>(l1i, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_predux(&(dst[0]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(&(dst[0]))));
     if (PacketRows>=2)
       res.template writePacketCoeff<Aligned>(l1i+PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize]))));
     if (PacketRows>=3)
       res.template writePacketCoeff<Aligned>(l1i+2*PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize]))));
     if (PacketRows>=4)
       res.template writePacketCoeff<Aligned>(l1i+3*PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize]))));
 
     // process the remaining rows one at a time
     if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]);
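
The switch from ei_predux to ei_preduxp in the aligned writes matters: ei_predux horizontally sums one packet to a scalar, while ei_preduxp (as assumed here) takes PacketSize packets and returns a single packet whose i-th lane is the horizontal sum of packet i, so a whole packet of results can be accumulated with one aligned store. A scalar sketch of both reductions (assumed semantics, not Eigen code):

    #include <iostream>

    const int PacketSize = 4;
    typedef float Packet[PacketSize];

    float predux(const Packet p)              // one packet -> one scalar
    {
      float s = 0;
      for (int i = 0; i < PacketSize; ++i) s += p[i];
      return s;
    }

    void preduxp(const Packet* p, Packet out) // PacketSize packets -> one packet
    {
      for (int i = 0; i < PacketSize; ++i) out[i] = predux(p[i]);
    }

    int main()
    {
      Packet dst[PacketSize] = {{1,1,1,1}, {2,2,2,2}, {3,3,3,3}, {4,4,4,4}};
      Packet out;
      preduxp(dst, out);
      for (int i = 0; i < PacketSize; ++i)
        std::cout << out[i] << " ";           // 4 8 12 16
      std::cout << "\n";
      return 0;
    }
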
@@ -450,38 +505,37 @@ template<typename Lhs, typename Rhs, int EvalMode>
 template<typename DestDerived, int RhsAlignment, int ResAlignment>
 void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 {
-  // allow direct access to data for benchmark purpose
-  const Scalar* __restrict__ a = m_lhs.derived().data();
-  const Scalar* __restrict__ b = m_rhs.derived().data();
-  Scalar* __restrict__ c = res.derived().data();
-
   // FIXME find a way to optimize: (an_xpr) + (a * b)
   // then we don't need to clear res and avoid and additional mat-mat sum
-  // res.setZero();
+#ifndef EIGEN_WIP_PRODUCT_DIRTY
+  // std::cout << "wip product\n";
+  res.setZero();
+#endif
 
   const int bs = PacketSize * MaxBlockRows; // total number of elements treated at once
   const int rows = _rows();
   const int cols = _cols();
   const int remainingSize = m_lhs.cols()%PacketSize;
   const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2blocksize = 256 > _cols() ? _cols() : 256;
+  const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
   // FIXME use calloca ?? (allocation on the stack)
-  Scalar* __restrict__ block = new Scalar[l2blocksize*size];
+  //Scalar* __restrict__ block = new Scalar[l2blocksize*size];;
+  Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
 
   // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<_rows(); l2i+=l2blocksize)
+  for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows)
   {
-    const int l2blockRowEnd = std::min(l2i+l2blocksize, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & 0xFFFFF8; // end of the rows aligned to bw
-    const int l2blockRowRemaining = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
 
     // build a cache friendly block
    int count = 0;
 
    // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k<size; l2k+=l2blocksize)
+    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
    {
-      const int l2blockSizeEnd = std::min(l2k+l2blocksize, size);
+      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
 
      for (int i = l2i; i<l2blockRowEndBW; i+=MaxBlockRows)
      {
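
Stripped of the packet kernels and the lhs repacking, the l2i/l2j/l2k structure above is the classic blocked matrix product: each block of the operands is reused while it is still resident in L2, with MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) sizing the blocks. The bare control skeleton, as a row-major scalar sketch:

    #include <algorithm>

    // res assumed zero-initialized; row-major layout for brevity
    void blocked_product(const float* lhs, const float* rhs, float* res,
                         int rows, int cols, int depth, int blockSize)
    {
      for (int l2i = 0; l2i < rows; l2i += blockSize)
        for (int l2j = 0; l2j < cols; l2j += blockSize)
          for (int l2k = 0; l2k < depth; l2k += blockSize)
          {
            const int iEnd = std::min(l2i + blockSize, rows);
            const int jEnd = std::min(l2j + blockSize, cols);
            const int kEnd = std::min(l2k + blockSize, depth);
            for (int i = l2i; i < iEnd; ++i)
              for (int j = l2j; j < jEnd; ++j)
                for (int k = l2k; k < kEnd; ++k)
                  res[i * cols + j] += lhs[i * depth + k] * rhs[k * cols + j];
          }
    }
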
@@ -494,25 +548,25 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
              block[count++] = m_lhs.coeff(i+w,k+s);
      }
    }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
      {
        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
        {
-          for (int w=0; w<l2blockRowRemaining; ++w)
+          for (int w=0; w<l2blockRemainingRows; ++w)
            for (int s=0; s<PacketSize; ++s)
              block[count++] = m_lhs.coeff(l2blockRowEndBW+w,k+s);
        }
      }
    }
 
-    for(int l2j=0; l2j<cols; l2j+=l2blocksize)
+    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
    {
-      int l2blockColEnd = std::min(l2j+l2blocksize, cols);
+      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
 
-      for(int l2k=0; l2k<size; l2k+=l2blocksize)
+      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
      {
        // acumulate a bw rows of lhs time a single column of rhs to a bw x 1 block of res
-        int l2blockSizeEnd = std::min(l2k+l2blocksize, size);
+        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
 
        // for each bw x 1 result's block
        for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
@@ -522,7 +576,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 #if 0
          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
          {
-            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*bw/*bs*/;
+            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
            const Scalar* localB = &block[offsetblock];
 
            int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
@@ -532,7 +586,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
            dst[1] = dst[0];
            dst[2] = dst[0];
            dst[3] = dst[0];
-            if (bw==8)
+            if (MaxBlockRows==8)
            {
              dst[4] = dst[0];
              dst[5] = dst[0];
@@ -543,7 +597,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
            // TODO in unaligned mode, preload the next element
            // PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]);
            asm("#eigen begincore");
-            for(int k=l2k; k<l2blockSizeEnd; k+=ps)
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
            {
              // PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
              // TODO make this branching compile time (costly for doubles)
@@ -559,10 +613,10 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
              dst[1] = ei_pmadd(tmp, b1, dst[1]);
              b1 = ei_pload(&(localB[k*bw+3*ps]));
              dst[2] = ei_pmadd(tmp, b0, dst[2]);
-              if (bw==8)
+              if (MaxBlockRows==8)
                b0 = ei_pload(&(localB[k*bw+4*ps]));
              dst[3] = ei_pmadd(tmp, b1, dst[3]);
-              if (bw==8)
+              if (MaxBlockRows==8)
              {
                b1 = ei_pload(&(localB[k*bw+5*ps]));
                dst[4] = ei_pmadd(tmp, b0, dst[4]);
@@ -576,19 +630,20 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 
            // if (resIsAligned)
            {
-              res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_predux(dst)));
-              if (ps==2)
-                res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_predux(&(dst[2]))));
-              if (bw==8)
+              res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(dst)));
+              if (PacketSize==2)
+                res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_preduxp(&(dst[2]))));
+              if (MaxBlockRows==8)
              {
-                res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_predux(&(dst[4]))));
-                if (ps==2)
-                  res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_predux(&(dst[6]))));
+                res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_preduxp(&(dst[4]))));
+                if (PacketSize==2)
+                  res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_preduxp(&(dst[6]))));
              }
            }
            // else
            // {
            //   // TODO uncommenting this code kill the perf, even though it is never called !!
            //   // this is because dst cannot be a set of registers only
            //   // TODO optimize this loop
            //   // TODO is it better to do one redux at once or packet reduxes + unaligned store ?
            //   for (int w = 0; w<bw; ++w)
@@ -600,15 +655,15 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
          }
 #endif
        }
-        if (l2blockRowRemaining>0)
+        if (l2blockRemainingRows>0)
        {
          // this is an attempt to build an array of kernels, but I did not manage to get it compiles
          // typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*);
          // Kernel kernels[8];
          // kernels[0] = (Kernel)(&Product<Lhs,Rhs,EvalMode>::template _cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>);
-          // kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
+          // kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
 
-          switch(l2blockRowRemaining)
+          switch(l2blockRemainingRows)
          {
            case 1:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>(
                res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
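
The switch statement turns the runtime value l2blockRemainingRows into the compile-time BlockRows template argument of _cacheFriendlyEvalKernel, so each possible remainder count gets its own fully specialized kernel. The same trick in miniature (a sketch):

    #include <iostream>

    template<int BlockRows>
    void kernel() { std::cout << "kernel<" << BlockRows << ">\n"; }

    void dispatch(int remainingRows)  // runtime value -> compile-time parameter
    {
      switch (remainingRows)
      {
        case 1: kernel<1>(); break;
        case 2: kernel<2>(); break;
        case 3: kernel<3>(); break;
        default: break;               // would be an internal error above
      }
    }

    int main() { dispatch(2); return 0; }
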
@@ -627,43 +682,6 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
            default:
              ei_internal_assert(false && "internal error"); break;
          }
-
-#if 0
-          // TODO optimize this part using a generic templated function that processes N rows
-          // here we process the remaining l2blockRowRemaining rows
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRowRemaining;
-            const Scalar* localB = &block[offsetblock];
-
-            int l1jsize = l1j * size;
-
-            PacketScalar dst[MaxBlockRows];
-            dst[0] = ei_pset1(Scalar(0.));
-            for (int w = 1; w<l2blockRowRemaining; ++w)
-              dst[w] = dst[0];
-            PacketScalar b0, b1, tmp;
-            asm("#eigen begincore dynamic");
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              //PacketScalar tmp = m_rhs.packetCoeff(k, l1j);
-              if (rhsIsAligned)
-                tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]);
-              else
-                tmp = ei_ploadu(&m_rhs.derived().data()[l1jsize + k]);
-
-              // TODO optimize this loop
-              for (int w = 0; w<l2blockRowRemaining; ++w)
-                dst[w] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRowRemaining+w*PacketSize])), dst[w]);
-            }
-
-            // TODO optimize this loop
-            for (int w = 0; w<l2blockRowRemaining; ++w)
-              res.coeffRef(l2blockRowEndBW+w, l1j) += ei_predux(dst[w]);
-
-            asm("#eigen endcore dynamic");
-          }
-#endif
        }
      }
    }
@@ -678,9 +696,10 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
                  NormalProduct>(
        m_lhs.block(0,size, _rows(), remainingSize),
        m_rhs.block(size,0, remainingSize, _cols())).lazy();
+    // res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols()));
  }
 
-  delete[] block;
+  // delete[] block;
 }
 
 #endif // EIGEN_PRODUCT_H
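
The epilogue just before the (now commented-out) delete[] handles the depth remainder: size was clamped to a multiple of PacketSize, so the last m_lhs.cols() % PacketSize columns of lhs times the matching rows of rhs are accumulated with the normal product. Schematically, in scalar row-major form (a sketch, not the Eigen API):

    #include <algorithm>

    void product_with_tail(const float* lhs, const float* rhs, float* res,
                           int rows, int cols, int depth, int packetSize)
    {
      const int remainingSize = depth % packetSize;
      const int size = depth - remainingSize; // clamped to packet boundaries
      // ... fast blocked/vectorized kernel over the first `size` depth entries ...
      for (int k = size; k < depth; ++k)      // scalar tail for the remainder
        for (int i = 0; i < rows; ++i)
          for (int j = 0; j < cols; ++j)
            res[i * cols + j] += lhs[i * depth + k] * rhs[k * cols + j];
    }
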