diff --git a/Eigen/Core b/Eigen/Core
index 999bb96f2..cc05c308e 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -1,13 +1,18 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
 
+#define EIGEN_WIP_PRODUCT
+
 #ifndef EIGEN_DONT_VECTORIZE
 #if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>=4 && __GNUC_MINOR__>=2)))
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #include <emmintrin.h>
 #include <xmmintrin.h>
+// SSE3: #include <pmmintrin.h>
+// SSSE3:
+//#include <tmmintrin.h>
 #endif
 #ifdef __ALTIVEC__
 // There are zero chances of both __SSE2__ AND __ALTIVEC__ been defined
 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 247a46b40..00bb375a9 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,7 +71,7 @@ struct ei_traits >
                       || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
                     ? ~LargeBit
                     : ~(unsigned int)0,
-    Flags = MatrixType::Flags & DefaultLostFlagMask & FlagsMaskLargeBit,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
@@ -146,13 +146,13 @@ template class Block
     template PacketScalar _packetCoeff(int row, int col) const
     {
-      return m_matrix.packetCoeff(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.template packetCoeff(row + m_startRow.value(), col + m_startCol.value());
     }
 
     template void _writePacketCoeff(int row, int col, const PacketScalar& x)
     {
-      m_matrix.const_cast_derived().writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x);
+      m_matrix.const_cast_derived().template writePacketCoeff(row + m_startRow.value(), col + m_startCol.value(), x);
     }
 
   protected:
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 266da0cfe..e9f65cd7a 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -47,7 +47,7 @@ struct ei_traits >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags & DefaultLostFlagMask,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit),
     CoeffReadCost = NumTraits::ReadCost
   };
 };
diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
index 59c143ede..267192482 100644
--- a/Eigen/src/Core/PacketMath.h
+++ b/Eigen/src/Core/PacketMath.h
@@ -71,10 +71,10 @@ template inline void ei_pstoreu(Scalar* to, const Scalar& from
 template inline Scalar ei_pfirst(const Scalar& a) { return a; }
 
 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-// template inline Scalar ei_predux(const Scalar* vecs) { return vecs[0]; }
+template inline Scalar ei_preduxp(const Scalar* vecs) { return vecs[0]; }
 
 /** \internal \returns the sum of the elements of \a a*/
-// template inline Scalar ei_predux(const Scalar& a) { return a; }
+template inline Scalar ei_predux(const Scalar& a) { return a; }
 
 #endif // EIGEN_PACKET_MATH_H
diff --git a/Eigen/src/Core/PacketMath_SSE.h b/Eigen/src/Core/PacketMath_SSE.h
index 1872affea..dd467f52a 100644
--- a/Eigen/src/Core/PacketMath_SSE.h
+++ b/Eigen/src/Core/PacketMath_SSE.h
@@ -102,14 +102,19 @@ inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
 
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
 }
+// SSSE3 version:
+// inline __m128i ei_preduxp(const __m128i* vecs)
+// {
+//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+// }
 
 inline float ei_predux(const __m128& a)
 {
@@ -118,6 +123,13 @@ inline float ei_predux(const __m128& a)
 }
 inline double ei_predux(const __m128d& a)
 { return ei_pfirst(_mm_hadd_pd(a, a)); }
+
+// SSSE3 version:
+// inline float ei_predux(const __m128i& a)
+// {
+//   __m128i tmp0 = _mm_hadd_epi32(a,a);
+//   return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0));
+// }
 #else
 // SSE2 versions
 inline float ei_predux(const __m128& a)
@@ -130,7 +142,7 @@ inline double ei_predux(const __m128d& a)
   return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
 }
 
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   __m128 tmp0, tmp1, tmp2;
   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
@@ -144,11 +156,31 @@ inline __m128 ei_predux(const __m128* vecs)
   return _mm_add_ps(tmp0, tmp2);
 }
 
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
 }
 #endif // SSE3
 
+inline int ei_predux(const __m128i& a)
+{
+  __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+
+inline __m128i ei_preduxp(const __m128i* vecs)
+{
+  __m128i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+
 #endif // EIGEN_PACKET_MATH_SSE_H
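Note on the renamed primitives above: ei_predux(packet) still returns the scalar sum of one packet, while the multi-packet overload is now spelled ei_preduxp and returns a packet of per-input sums. The following scalar sketch is a reading aid only and not part of the patch; Packet4f and the *_ref names are invented for the illustration, and 4-wide float packets are assumed.

#include <array>

// Made-up scalar model of the two reduction primitives.
using Packet4f = std::array<float, 4>;

// ei_predux: sum of the elements of one packet.
float predux_ref(const Packet4f& a)
{
  return a[0] + a[1] + a[2] + a[3];
}

// ei_preduxp: packet whose i-th element is the sum of the elements of vecs[i].
Packet4f preduxp_ref(const Packet4f* vecs)
{
  Packet4f res;
  for (int i = 0; i < 4; ++i)
    res[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3];
  return res;
}

The product kernel below relies on ei_preduxp to collapse several per-row accumulators into a single result packet, and falls back to ei_predux when it has to finish the leftover rows one at a time.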
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index c1b2f5457..66c6d4d5b 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -107,6 +107,12 @@ struct ei_packet_product_unroller
+struct ei_packet_product_unroller
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h
index 57bb899d6..a5fc9d298 100644
--- a/Eigen/src/Core/ProductWIP.h
+++ b/Eigen/src/Core/ProductWIP.h
@@ -73,7 +73,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packetCoeff(Index, col), res);
   }
 };
 
@@ -83,7 +83,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    res = ei_pmadd(lhs.template packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
@@ -92,7 +92,7 @@ struct ei_packet_product_unroller
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packetCoeff(0, col));
   }
 };
 
@@ -101,7 +101,7 @@ struct ei_packet_product_unroller
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };
 
@@ -111,6 +111,12 @@ struct ei_packet_product_unroller
+struct ei_packet_product_unroller
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
@@ -139,16 +145,53 @@ template struct ei_product_eval_mode
 {
   enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
               ? CacheOptimalProduct : NormalProduct };
 };
 
+template class ei_product_eval_to_column_major
+{
+    typedef typename ei_traits::Scalar _Scalar;
+    enum {_MaxRows = ei_traits::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits::MaxColsAtCompileTime,
+          _Flags = ei_traits::Flags
+    };
+
+  public:
+    typedef Matrix<_Scalar,
+              ei_traits::RowsAtCompileTime,
+              ei_traits::ColsAtCompileTime,
+              ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
+              ei_traits::MaxRowsAtCompileTime,
+              ei_traits::MaxColsAtCompileTime> type;
+};
+
+template struct ei_product_nested_rhs
+{
+  typedef typename ei_meta_if<
+    ei_is_temporary::ret && !(ei_traits::Flags & RowMajorBit),
+    T,
+    typename ei_meta_if<
+         (ei_traits::Flags & EvalBeforeNestingBit)
+      || (ei_traits::Flags & RowMajorBit)
+      || (!(ei_traits::Flags & ReferencableBit))
+      || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
+      typename ei_product_eval_to_column_major::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
 template struct ei_traits >
 {
   typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested::type LhsNested;
-  typedef typename ei_nested::type RhsNested;
+  // the cache friendly product evals lhs once only
+  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
+  typedef typename ei_nested::type LhsNested;
+  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
+  typedef typename ei_meta_if::type,
+      typename ei_nested::type>::ret RhsNested;
   typedef typename ei_unref::type _LhsNested;
   typedef typename ei_unref::type _RhsNested;
   enum {
@@ -160,16 +203,20 @@ struct ei_traits >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
+    // the vectorization flags are only used by the normal product,
+    // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits::size == 0),
-    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
+    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0,
     _RowMajor = (RhsFlags & RowMajorBit)
              && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
     _LostBits = DefaultLostFlagMask & ~(
                   (_RowMajor ? 0 : RowMajorBit)
                 | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
     Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
-//          | EvalBeforeAssigningBit //FIXME
+          #ifndef EIGEN_WIP_PRODUCT_DIRTY
+          | EvalBeforeAssigningBit //FIXME
+          #endif
           | EvalBeforeNestingBit
           | (_Vectorizable ? VectorizableBit : 0),
     CoeffReadCost
@@ -197,11 +244,16 @@ template class Product : ei_no_assignm
     PacketSize = ei_packet_traits::size,
     #if (defined __i386__)
     // i386 architectures provides only 8 xmmm register,
-    // so let's reduce the max number of rows processed at once
+    // so let's reduce the max number of rows processed at once.
+    // NOTE that so far the maximal supported value is 8.
     MaxBlockRows = 4,
+    MaxBlockRows_ClampingMask = 0xFFFFFC,
     #else
     MaxBlockRows = 8,
+    MaxBlockRows_ClampingMask = 0xFFFFF8,
     #endif
+    // maximal size of the blocks fitted in L2 cache
+    MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
   };
 
   Product(const Lhs& lhs, const Rhs& rhs)
@@ -214,16 +266,6 @@ template class Product : ei_no_assignm
     template
     void _cacheFriendlyEval(DestDerived& res) const;
 
-    /** \internal */
-    template
-    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
-
-    /** \internal */
-    template
-    void _cacheFriendlyEvalKernel(DestDerived& res,
-      int l2i, int l2j, int l2k, int l1i,
-      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
-
   private:
 
     int _rows() const { return m_lhs.rows(); }
@@ -255,7 +297,7 @@ template class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller
@@ -269,21 +311,30 @@ template class Product : ei_no_assignm
     PacketScalar _packetCoeffRowMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
+      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.template packetCoeff(0, col));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
+        res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.template packetCoeff(i, col), res);
       return res;
     }
 
     PacketScalar _packetCoeffColumnMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
+      res = ei_pmul(m_lhs.template packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
+        res = ei_pmadd(m_lhs.template packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
     }
 
+    /** \internal */
+    template
+    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
+
+    /** \internal */
+    template
+    void _cacheFriendlyEvalKernel(DestDerived& res,
+      int l2i, int l2j, int l2k, int l1i,
+      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
 
   protected:
     const LhsNested m_lhs;
@@ -361,7 +412,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res,
   int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows;
   const Scalar* localB = &block[offsetblock];
 
-  int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+//   int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+  Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j));
 
   // don't worry, dst is a set of registers
   PacketScalar dst[BlockRows];
@@ -383,7 +435,8 @@ void Product::_cacheFriendlyEvalKernel(DestDerived& res,
 
     // unaligned loads are expensive, therefore let's preload the next element in advance
     if (RhsAlignment==UnAligned)
-      tmp1 = ei_ploadu(&m_rhs.derived().data()[l1jsize+l2k]);
+      //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]);
+      tmp1 = ei_ploadu(&rhsColumn[l2k]);
 
     for(int k=l2k; k
@@ ... @@ ::_cacheFriendlyEvalKernel(DestDerived& res,
       //PacketScalar tmp = m_rhs.template packetCoeff(k, l1j);
       if (RhsAlignment==Aligned)
       {
-        tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        //tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        tmp = ei_pload(&rhsColumn[k]);
       }
       else
       {
         tmp = tmp1;
         if (k+PacketSize
@@ ... @@ ::_cacheFriendlyEvalKernel(DestDerived& res,
     // we have up to 4 packets (for doubles: 8 rows / 2)
     if (PacketRows>=1) res.template writePacketCoeff(l1i, l1j,
-      ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(&(dst[0]))));
+      ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(&(dst[0]))));
     if (PacketRows>=2) res.template writePacketCoeff(l1i+PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize]))));
     if (PacketRows>=3) res.template writePacketCoeff(l1i+2*PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize]))));
     if (PacketRows>=4) res.template writePacketCoeff(l1i+3*PacketSize, l1j,
-      ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize]))));
+      ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize]))));
     // process the remaining rows one at a time
     if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]);
@@ -450,38 +505,37 @@ template
 template
 void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
 {
-  // allow direct access to data for benchmark purpose
-  const Scalar* __restrict__ a = m_lhs.derived().data();
-  const Scalar* __restrict__ b = m_rhs.derived().data();
-  Scalar* __restrict__ c = res.derived().data();
-
   // FIXME find a way to optimize: (an_xpr) + (a * b)
   // then we don't need to clear res and avoid and additional mat-mat sum
-//   res.setZero();
+  #ifndef EIGEN_WIP_PRODUCT_DIRTY
+//   std::cout << "wip product\n";
+  res.setZero();
+  #endif
 
-  const int bs = PacketSize * MaxBlockRows; // total number of elements treated at once
   const int rows = _rows();
   const int cols = _cols();
   const int remainingSize = m_lhs.cols()%PacketSize;
   const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2blocksize = 256 > _cols() ? _cols() : 256;
-  // FIXME use calloca ?? (allocation on the stack)
-  Scalar* __restrict__ block = new Scalar[l2blocksize*size];
+  const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+  //Scalar* __restrict__ block = new Scalar[l2blocksize*size];;
+  Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
 
   // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<_rows(); l2i+=l2blocksize)
+  for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows)
   {
-    const int l2blockRowEnd = std::min(l2i+l2blocksize, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & 0xFFFFF8;                  // end of the rows aligned to bw
-    const int l2blockRowRemaining = l2blockRowEnd - l2blockRowEndBW;       // number of remaining rows
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;      // number of remaining rows
 
     // build a cache friendly block
     int count = 0;
 
     // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
             block[count++] = m_lhs.coeff(i+w,k+s);
         }
       }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
      {
         for (int k=l2k; k
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
     #if 0
     for(int l1j=l2j; l1j
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
             dst[1] = dst[0];
             dst[2] = dst[0];
             dst[3] = dst[0];
-            if (bw==8)
+            if (MaxBlockRows==8)
             {
               dst[4] = dst[0];
               dst[5] = dst[0];
@@ -543,7 +597,7 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
           // TODO in unaligned mode, preload the next element
          // PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]);
           asm("#eigen begincore");
-          for(int k=l2k; k
 (k, l1j);
           // TODO make this branching compile time (costly for doubles)
@@ -559,10 +613,10 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
             dst[1] = ei_pmadd(tmp, b1, dst[1]);
             b1 = ei_pload(&(localB[k*bw+3*ps]));
             dst[2] = ei_pmadd(tmp, b0, dst[2]);
-            if (bw==8)
+            if (MaxBlockRows==8)
              b0 = ei_pload(&(localB[k*bw+4*ps]));
             dst[3] = ei_pmadd(tmp, b1, dst[3]);
-            if (bw==8)
+            if (MaxBlockRows==8)
             {
               b1 = ei_pload(&(localB[k*bw+5*ps]));
               dst[4] = ei_pmadd(tmp, b0, dst[4]);
@@ -576,19 +630,20 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
 
           // if (resIsAligned)
           {
-            res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_predux(dst)));
-            if (ps==2)
-              res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_predux(&(dst[2]))));
-            if (bw==8)
+            res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(dst)));
+            if (PacketSize==2)
+              res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_preduxp(&(dst[2]))));
+            if (MaxBlockRows==8)
             {
-              res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_predux(&(dst[4]))));
-              if (ps==2)
-                res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_predux(&(dst[6]))));
+              res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_preduxp(&(dst[4]))));
+              if (PacketSize==2)
+                res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_preduxp(&(dst[6]))));
             }
           }
 //           else
 //           {
 //             // TODO uncommenting this code kill the perf, even though it is never called !!
+//             // this is because dst cannot be a set of registers only
 //             // TODO optimize this loop
 //             // TODO is it better to do one redux at once or packet reduxes + unaligned store ?
 //             for (int w = 0; w
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
         }
     #endif
       }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
       {
         // this is an attempt to build an array of kernels, but I did not manage to get it compiles
 //         typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*);
 //         Kernel kernels[8];
 //         kernels[0] = (Kernel)(&Product::template _cacheFriendlyEvalKernel);
-//         kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
+//         kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
 
-        switch(l2blockRowRemaining)
+        switch(l2blockRemainingRows)
         {
           case 1:_cacheFriendlyEvalKernel(
                    res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
@@ -627,43 +682,6 @@ void Product::_cacheFriendlyEvalImpl(DestDerived& res) const
           default: ei_internal_assert(false && "internal error"); break;
         }
-
-#if 0
-        // TODO optimize this part using a generic templated function that processes N rows
-        // here we process the remaining l2blockRowRemaining rows
-        for(int l1j=l2j; l1j
@@ ... @@ ::_cacheFriendlyEvalImpl(DestDerived& res) const
              NormalProduct>(
            m_lhs.block(0,size, _rows(), remainingSize),
            m_rhs.block(size,0, remainingSize, _cols())).lazy();
+//     res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols()));
   }
 
-  delete[] block;
+//   delete[] block;
 }
 
 #endif // EIGEN_PRODUCT_H
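Aside (not part of the patch): the blocking arithmetic used by _cacheFriendlyEvalImpl above can be summarized in a few lines. The helper computeBlocking and the constants kTuneForL2CacheSize / kMaxBlockRows below are hypothetical stand-ins for EIGEN_TUNE_FOR_L2_CACHE_SIZE and the MaxBlockRows / MaxBlockRows_ClampingMask enums introduced by the patch.

#include <algorithm>
#include <cstddef>

enum { kTuneForL2CacheSize = 1024, kMaxBlockRows = 8 };

template <typename Scalar>
void computeBlocking(int rows, int cols, int depth)
{
  // maximal size of the blocks fitted in L2 cache, in elements
  const int maxL2BlockSize = int(kTuneForL2CacheSize / sizeof(Scalar));

  // clamp each block dimension to the actual problem size
  const int l2BlockRows = std::min(maxL2BlockSize, rows);
  const int l2BlockCols = std::min(maxL2BlockSize, cols);
  const int l2BlockSize = std::min(maxL2BlockSize, depth); // third dimension of the product

  // round the row-block end down to a multiple of kMaxBlockRows;
  // for the row counts involved, & ~(kMaxBlockRows-1) is equivalent to
  // the 0xFFFFF8 mask (or 0xFFFFFC with 4 rows on i386) used in the patch
  const int blockRowEndBW = l2BlockRows & ~(kMaxBlockRows - 1);
  const int remainingRows = l2BlockRows - blockRowEndBW; // handled by the per-count kernels

  (void)l2BlockCols; (void)l2BlockSize; (void)blockRowEndBW; (void)remainingRows;
}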
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index c74d8f87b..48498bfae 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -43,16 +43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients
 const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1
 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0
 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0
+const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int)
 
 enum { Upper=NullLowerBit, Lower=NullUpperBit };
 enum { Aligned=0, UnAligned=1 };
 
 // list of flags that are lost by default
-const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
+const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit
+                                         | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
 
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct };
+enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct};
 
 #endif // EIGEN_CONSTANTS_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 613ab9617..f703d159a 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -45,6 +45,13 @@
 #define EIGEN_DEFAULT_MATRIX_FLAGS EIGEN_DEFAULT_MATRIX_STORAGE_ORDER
 
+/** Defines a size hint used when dealing with large matrices and L2 cache friendliness.
+  * More precisely, its square value represents the number of bytes which can be assumed to stay in L2 cache.
+  */
+#ifndef EIGEN_TUNE_FOR_L2_CACHE_SIZE
+#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 1024
+#endif
+
 #define USING_PART_OF_NAMESPACE_EIGEN \
 EIGEN_USING_MATRIX_TYPEDEFS \
 using Eigen::Matrix; \
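Since the new macro is wrapped in #ifndef, it can be overridden per project before Eigen is included. A hypothetical usage sketch (not part of the patch), assuming the Eigen/Core header and the USING_PART_OF_NAMESPACE_EIGEN macro from this tree; 2048 squared bytes corresponds to assuming 4 MiB of L2 cache:

#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 2048
#include <Eigen/Core>

USING_PART_OF_NAMESPACE_EIGEN

int main()
{
  MatrixXf a(512, 512), b(512, 512);
  a.setZero();
  b.setZero();
  MatrixXf c = a * b; // assumed large enough to take the cache-friendly product path
  return c.rows() > 0 ? 0 : 1;
}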