more product refactoring

parent 6b2ab13ac5
commit 56d00779db

Eigen/Core: 14 lines changed
@@ -138,7 +138,7 @@ namespace Eigen {
#endif

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#include "src/Core/Functors.h"
@@ -180,18 +180,20 @@ namespace Eigen {
#include "src/Core/util/BlasUtil.h"
#include "src/Core/ProductBase.h"
#include "src/Core/Product.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/SolveTriangular.h"
#include "src/Core/products/GeneralUnrolled.h"
#include "src/Core/products/GeneralBlockPanelKernel.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/products/SelfadjointMatrixVector.h"
#include "src/Core/products/SelfadjointMatrixMatrix.h"
#include "src/Core/products/SelfadjointProduct.h"
#include "src/Core/products/SelfadjointRank2Update.h"
#include "src/Core/products/TriangularMatrixVector.h"
#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/products/TriangularMatrixMatrix.h"
#include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/BandMatrix.h"

} // namespace Eigen
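The only functional change in the first hunk is the default EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD dropping from 16 to 8. Since the define is wrapped in #ifndef, a user translation unit can still pick its own value before including the header; a minimal sketch (the value 32 is arbitrary, for illustration only):

// Hypothetical user code: override the threshold before including Eigen.
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 32
#include <Eigen/Core>

int main() { return 0; }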
@@ -63,23 +63,22 @@ template<typename Lhs, typename Rhs> struct ei_product_type
    Cols  = Rhs::ColsAtCompileTime,
    Depth = EIGEN_ENUM_MIN(Lhs::ColsAtCompileTime,Rhs::RowsAtCompileTime),

    value = ei_product_type_selector<(Rows>8 ? Large : (Rows==1 ? 1 : Small)),
                                     (Cols>8 ? Large : (Cols==1 ? 1 : Small)),
                                     (Depth>8 ? Large : (Depth==1 ? 1 : Small))>::ret
    value = ei_product_type_selector<(Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small)),
                                     (Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small)),
                                     (Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 1 : Small))>::ret
  };
};

/* The following allows to select the kind of product at compile time
 * based on the three dimensions of the product.
 * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one.
template<int Rows, int Cols> struct ei_product_type_selector<Rows,Cols,1>       { enum { ret = OuterProduct }; };
template<int Depth>          struct ei_product_type_selector<1,1,Depth>         { enum { ret = InnerProduct }; };
template<>                   struct ei_product_type_selector<1,1,1>             { enum { ret = InnerProduct }; };
template<>                   struct ei_product_type_selector<Small,1,Small>     { enum { ret = UnrolledProduct }; };
template<>                   struct ei_product_type_selector<1,Small,Small>     { enum { ret = UnrolledProduct }; };
template<>                   struct ei_product_type_selector<Small,Small,Small> { enum { ret = UnrolledProduct }; };

// template<> struct ei_product_type_selector<Small,1,Small>     { enum { ret = GemvProduct }; };
// template<> struct ei_product_type_selector<1,Small,Small>     { enum { ret = GemvProduct }; };
// template<> struct ei_product_type_selector<Small,Small,Small> { enum { ret = GemmProduct }; };

template<> struct ei_product_type_selector<1,Large,Small> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<1,Large,Large> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<1,Small,Large> { enum { ret = GemvProduct }; };
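Read the selector as a two-step mapping: each of Rows, Cols and Depth is first collapsed to one of {1, Small, Large}, and the resulting triple then picks the product kind. A standalone sketch of that first step (illustrative only; ei_classify_dim and SizeCategory are invented names, 8 is the new default threshold):

// Illustration only, not part of the diff.
enum SizeCategory { OneDim = 1, SmallDim, LargeDim };

inline SizeCategory ei_classify_dim(int dim, int threshold = 8)
{
  // mirrors (Dim >= threshold ? Large : (Dim == 1 ? 1 : Small))
  return dim >= threshold ? LargeDim : (dim == 1 ? OneDim : SmallDim);
}
// e.g. a 4x4 * 4x4 product is (Small,Small,Small) -> UnrolledProduct,
// while a 1x100 * 100x100 product is (1,Large,Large) -> GemvProduct.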
@@ -129,48 +128,6 @@ struct ProductReturnType<Lhs,Rhs,UnrolledProduct>
};


/***********************************************************************
* Implementation of General Matrix Matrix Product
***********************************************************************/

template<typename Lhs, typename Rhs>
struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};

template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
  public:
    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

    template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
    {
      ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

      const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
      const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                                 * RhsBlasTraits::extractScalarFactor(m_rhs);

      ei_general_matrix_matrix_product<
        Scalar,
        (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate),
        (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate),
        (Dest::Flags&RowMajorBit)?RowMajor:ColMajor>
      ::run(
        this->rows(), this->cols(), lhs.cols(),
        (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
        (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
        (Scalar*)&(dst.coeffRef(0,0)), dst.stride(),
        actualAlpha);
    }
};
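For context, a usage sketch (not part of the diff): a plain dynamic-size product like the one below is classified as GemmProduct, so evaluating it ends up in the addTo() above, which forwards to ei_general_matrix_matrix_product::run() with the extracted scalar factor as alpha. Sizes are arbitrary.

// Usage sketch only.
#include <Eigen/Core>

int main()
{
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(64, 64);
  Eigen::MatrixXf B = Eigen::MatrixXf::Random(64, 64);
  Eigen::MatrixXf C = A * B; // routed through the blocked GEMM path
  return 0;
}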

/***********************************************************************
* Implementation of Inner Vector Vector Product
***********************************************************************/
@@ -403,362 +360,6 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,false>
  }
};

/***********************************************************************
* Implementation of products with small fixed sizes
***********************************************************************/

/* Since all the dimensions of the product are small, here we can rely
 * on the generic Assign mechanism to evaluate the product per coeff (or packet).
 *
 * Note that here the inner loops should always be unrolled.
 */

template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl;

template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;

template<typename LhsNested, typename RhsNested>
struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> >
{
  typedef typename ei_cleantype<LhsNested>::type _LhsNested;
  typedef typename ei_cleantype<RhsNested>::type _RhsNested;
  typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;

  enum {
    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,

    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                    && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                    && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs),

    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
          | EvalBeforeAssigningBit
          | EvalBeforeNestingBit
          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0)
          | (LhsFlags & RhsFlags & AlignedBit),

    CoeffReadCost = InnerSize == Dynamic ? Dynamic
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
     */
    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                      && (InnerSize % ei_packet_traits<Scalar>::size == 0)
  };
};

template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct>
  : ei_no_assignment_operator,
    public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> >
{
  public:

    EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct)

  private:

    typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested;
    typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested;

    enum {
      PacketSize = ei_packet_traits<Scalar>::size,
      InnerSize  = ei_traits<GeneralProduct>::InnerSize,
      Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
      CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner
    };

    typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
                                  Unroll ? InnerSize-1 : Dynamic,
                                  _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

  public:

    template<typename Lhs, typename Rhs>
    inline GeneralProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
      // We still allow to mix T and complex<T>.
      EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      ei_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

    EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); }
    EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); }

    EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const
    {
      Scalar res;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
    EIGEN_STRONG_INLINE const Scalar coeff(int index) const
    {
      Scalar res;
      const int row = RowsAtCompileTime == 1 ? 0 : index;
      const int col = RowsAtCompileTime == 1 ? index : 0;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const
    {
      PacketScalar res;
      ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                             Unroll ? InnerSize-1 : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar, LoadMode>
        ::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

  protected:
    const LhsNested m_lhs;
    const RhsNested m_rhs;
};
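For orientation, GeneralProduct<...,UnrolledProduct> is a lazy expression: coeff(row,col) computes a single entry of the product on demand through ScalarCoeffImpl. A plain-scalar equivalent of what one such call produces (illustration only, hypothetical helper name):

// Illustration only, not Eigen code.
template<typename Lhs, typename Rhs>
typename Lhs::Scalar product_coeff_reference(const Lhs& lhs, const Rhs& rhs, int row, int col)
{
  typename Lhs::Scalar res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  for (int k = 1; k < lhs.cols(); ++k)
    res += lhs.coeff(row, k) * rhs.coeff(k, col); // short dot product, nothing is cached
  return res;
}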

/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/

/**************************************
*** Scalar path - no vectorization ***
**************************************/

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
    for(int i = 1; i < lhs.cols(); ++i)
      res += lhs.coeff(row, i) * rhs.coeff(i, col);
  }
};

// prevent buggy user code from causing an infinite recursion
template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {}
};
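The NoVectorization specializations above are standard compile-time recursion on Index. The same pattern on a raw array, as a self-contained illustration (invented names, not Eigen code):

// Illustration only.
template<int Index>
struct unrolled_dot
{
  static double run(const double* a, const double* b)
  {
    return unrolled_dot<Index-1>::run(a, b) + a[Index] * b[Index];
  }
};

template<>
struct unrolled_dot<0>
{
  static double run(const double* a, const double* b) { return a[0] * b[0]; }
};
// unrolled_dot<3>::run(a, b) expands at compile time to
// a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3], with no runtime loop.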

/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller
{
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) ));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
  }
};

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::PacketScalar PacketScalar;
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    PacketScalar pres;
    ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
    res = ei_predux(pres);
  }
};

template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct ei_product_coeff_vectorized_dyn_selector
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col));
  }
};

// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col));
  }
};

template<typename Lhs, typename Rhs, int LhsRows>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
  EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs);
  }
};

template<typename Lhs, typename Rhs>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs, rhs);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
  }
};

/*******************
*** Packet path ***
*******************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
  }
};

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using a non initialized matrix");
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
  }
};

/***************************************************************************
* Implementation of matrix base methods
***************************************************************************/
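A scalar rendering (illustration only) of one step of the ColMajor packet path above: ei_pset1 broadcasts a single rhs coefficient and ei_pmadd multiply-accumulates it against a packet of lhs entries.

// Illustration only; the real code operates on SIMD packets, not loops.
void packet_column_update_reference(const float* lhsColumn, float rhsCoeff,
                                    float* acc, int packetSize)
{
  for (int k = 0; k < packetSize; ++k)
    acc[k] += lhsColumn[k] * rhsCoeff; // acc = pmadd(lhsColumn, pset1(rhsCoeff), acc)
}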
@@ -196,101 +196,6 @@ struct ei_triangular_assignment_selector<Derived1, Derived2, SelfAdjoint, Dynami
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_vector
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet");
    ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
      (
        lhs.rows(),                                     // size
        &lhs.coeff(0,0), lhs.stride(),                  // lhs info
        &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info
        &dst.coeffRef(0),                               // result info
        actualAlpha                                     // scale factor
      );
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_matrix
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit),
    LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit,
    RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit),
    RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit
  };

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_product_selfadjoint_matrix<Scalar,
      EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,
                        ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,
                        ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)),
      ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(),           // sizes
        &lhs.coeff(0,0), lhs.stride(),    // lhs info
        &rhs.coeff(0,0), rhs.stride(),    // rhs info
        &dst.coeffRef(0,0), dst.stride(), // result info
        actualAlpha                       // alpha
      );
  }
};
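Usage sketch only, written with the present-day Eigen 3 spelling of the API (which may differ from the tree at this commit): expressions like these create SelfadjointProductMatrix nodes whose addTo() forwards to the packed selfadjoint kernels wrapped above.

// Usage sketch only.
#include <Eigen/Core>

int main()
{
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(32, 32);
  Eigen::VectorXd v = Eigen::VectorXd::Random(32);
  Eigen::VectorXd y = M.selfadjointView<Eigen::Lower>() * v; // selfadjoint * vector wrapper
  Eigen::MatrixXd C = M.selfadjointView<Eigen::Lower>() * M; // selfadjoint * matrix wrapper
  return 0;
}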

/***************************************************************************
* Implementation of MatrixBase methods
***************************************************************************/
Eigen/src/Core/products/GeneralBlockPanelKernel.h (new file, 443 lines)
@@ -0,0 +1,443 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

#ifndef EIGEN_EXTERN_INSTANTIATIONS

// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
  {
    typedef typename ei_packet_traits<Scalar>::type PacketType;
    enum { PacketSize = ei_packet_traits<Scalar>::size };
    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    Conj cj;
    int packet_cols = (cols/nr) * nr;
    const int peeled_mc = (rows/mr)*mr;
    // loops on each cache friendly block of the result/rhs
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // loops on each register blocking of lhs/res
      for(int i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // TODO move the res loads to the stores

        // gets res block as register
        PacketType C0, C1, C2, C3, C4, C5, C6, C7;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);

        // performs "inner" product
        // TODO let's check whether the following peeled loop could not be
        // optimized via optimal prefetching from one loop to the other
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
        const int peeled_kc = (depth/4)*4;
        for(int k=0; k<peeled_kc; k+=4)
        {
          PacketType B0, B1, B2, B3, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          B1 = ei_pload(&blB[1*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
          C4 = cj.pmadd(A1, B0, C4);
          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
          B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[2*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[3*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[4*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[5*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);

          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          A0 = ei_pload(&blA[6*PacketSize]);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);
          A1 = ei_pload(&blA[7*PacketSize]);
          if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);

          blB += 4*nr*PacketSize;
          blA += 4*mr;
        }
        // process remaining peeled loop
        for(int k=peeled_kc; k<depth; k++)
        {
          PacketType B0, B1, B2, B3, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          B1 = ei_pload(&blB[1*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
          C4 = cj.pmadd(A1, B0, C4);
          if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
          C1 = cj.pmadd(A0, B1, C1);
          C5 = cj.pmadd(A1, B1, C5);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C6 = cj.pmadd(A1, B2, C6);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);
          if(nr==4) C7 = cj.pmadd(A1, B3, C7);

          blB += nr*PacketSize;
          blA += mr;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
        ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
      }
      for(int i=peeled_mc; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // gets a 1 x nr res block as registers
        Scalar C0(0), C1(0), C2(0), C3(0);
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
        for(int k=0; k<depth; k++)
        {
          Scalar B0, B1, B2, B3, A0;

          A0 = blA[k];
          B0 = blB[0*PacketSize];
          B1 = blB[1*PacketSize];
          C0 = cj.pmadd(A0, B0, C0);
          if(nr==4) B2 = blB[2*PacketSize];
          if(nr==4) B3 = blB[3*PacketSize];
          C1 = cj.pmadd(A0, B1, C1);
          if(nr==4) C2 = cj.pmadd(A0, B2, C2);
          if(nr==4) C3 = cj.pmadd(A0, B3, C3);

          blB += nr*PacketSize;
        }
        res[(j2+0)*resStride + i] += C0;
        res[(j2+1)*resStride + i] += C1;
        if(nr==4) res[(j2+2)*resStride + i] += C2;
        if(nr==4) res[(j2+3)*resStride + i] += C3;
      }
    }

    // process remaining rhs/res columns one at a time
    // => do the same but with nr==1
    for(int j2=packet_cols; j2<cols; j2++)
    {
      for(int i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // TODO move the res loads to the stores

        // gets res block as register
        PacketType C0, C4;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);

        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
        for(int k=0; k<depth; k++)
        {
          PacketType B0, A0, A1;

          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          C0 = cj.pmadd(A0, B0, C0);
          C4 = cj.pmadd(A1, B0, C4);

          blB += PacketSize;
          blA += mr;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
      }
      for(int i=peeled_mc; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        #ifdef EIGEN_VECTORIZE_SSE
        _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
        #endif

        // gets a 1 x 1 res block as registers
        Scalar C0(0);
        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
        for(int k=0; k<depth; k++)
          C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
        res[(j2+0)*resStride + i] += C0;
      }
    }
  }
};
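To make the kernel's contract explicit, a scalar reference of what it computes (illustration only): res += A*B for one rows x depth block of the lhs against a depth x cols panel of the rhs, accumulated into a column-major res with stride resStride. The packed buffers in the real kernel use a different memory layout (mr-wide slivers for blockA, coefficient-expanded packets for blockB); this sketch only states the arithmetic.

// Reference semantics only; A is rows x depth, B is depth x cols, both column-major.
template<typename Scalar>
void gebp_reference(Scalar* res, int resStride,
                    const Scalar* A, const Scalar* B,
                    int rows, int depth, int cols)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < rows; ++i)
    {
      Scalar acc = Scalar(0);
      for (int k = 0; k < depth; ++k)
        acc += A[k*rows + i] * B[j*depth + k];
      res[j*resStride + i] += acc;
    }
}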

// pack a block of the lhs
// The traversal order is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (rows/mr)*mr;
    for(int i=0; i<peeled_mc; i+=mr)
    {
      if(PanelMode) count += mr * offset;
      for(int k=0; k<depth; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += mr * (stride-offset-depth);
    }
    for(int i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(int k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
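A small demo (illustration only) of the traversal order documented above: packing a 6 x 3 lhs with mr == 4 visits the first four rows in mr-wide, column-by-column slivers and then the two leftover rows one at a time.

// Illustration only; prints the source coordinate written at each packed position.
#include <cstdio>

int main()
{
  const int rows = 6, depth = 3, mr = 4;
  int count = 0;
  const int peeled_mc = (rows/mr)*mr;      // = 4: one full mr-wide sliver
  for (int i = 0; i < peeled_mc; i += mr)
    for (int k = 0; k < depth; ++k)
      for (int w = 0; w < mr; ++w)
        std::printf("blockA[%d] = lhs(%d,%d)\n", count++, i+w, k);
  for (int i = peeled_mc; i < rows; ++i)   // leftover rows, one at a time
    for (int k = 0; k < depth; ++k)
      std::printf("blockA[%d] = lhs(%d,%d)\n", count++, i, k);
  return 0;
}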

// copy a complete panel of the rhs while expanding each coefficient into a packet form
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
          count += PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(b0[k]));
          count += PacketSize;
        }
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

// this version is optimized for row major matrices
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
        count += PacketSize;
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

#endif // EIGEN_EXTERN_INSTANTIATIONS

#endif // EIGEN_GENERAL_BLOCK_PANEL_H
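A plain-array analogue (illustration only) of the rhs packing for the nr == 1 tail: every coefficient is pre-scaled by alpha and duplicated PacketSize times, so the kernel can load it as an already-broadcast packet instead of calling ei_pset1 in the hot loop.

// Illustration only; the real code writes SIMD packets with ei_pstore/ei_pset1.
void pack_rhs_column_reference(float* blockB, const float* rhsColumn,
                               float alpha, int depth, int packetSize)
{
  int count = 0;
  for (int k = 0; k < depth; ++k)
    for (int p = 0; p < packetSize; ++p)
      blockB[count++] = alpha * rhsColumn[k];
}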
@@ -27,6 +27,7 @@

#ifndef EIGEN_EXTERN_INSTANTIATIONS

/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
@@ -51,6 +52,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
  }
};

/* Specialization for a col-major destination matrix
 * => Blocking algorithm following Goto's paper */
template<
  typename Scalar,
  int LhsStorageOrder, bool ConjugateLhs,
@@ -78,23 +81,30 @@ static void run(int rows, int cols, int depth,
    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);

    // => GEMM_VAR1
    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
    // (==GEMM_VAR1)
    for(int k2=0; k2<depth; k2+=kc)
    {
      const int actual_kc = std::min(k2+kc,depth)-k2;

      // we have selected one row panel of rhs and one column panel of lhs
      // pack rhs's panel into a sequential chunk of memory
      // and expand each coeff to a constant packet for further reuse
      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
      // Note that this panel will be read as many times as the number of blocks in the lhs's
      // vertical panel which is, in practice, a very low number.
      ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

      // => GEPP_VAR1
      // For each mc x kc block of the lhs's vertical panel...
      // (==GEPP_VAR1)
      for(int i2=0; i2<rows; i2+=mc)
      {
        const int actual_mc = std::min(i2+mc,rows)-i2;

        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
        // Note that this block will be read a very high number of times, which is equal to the number of
        // micro vertical panels of the large rhs's panel (e.g., cols/4 times).
        ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

        // Everything is packed, we can now call the block * panel kernel:
        ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
          (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
      }
@@ -106,417 +116,49 @@ static void run(int rows, int cols, int depth,

};
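The control structure of run() above, stripped of the packing and kernel details (illustration only; kc and mc are the cache-blocking sizes used above):

// Skeleton only; the bodies are sketched as comments.
void gemm_blocked_outline(int rows, int cols, int depth, int kc, int mc)
{
  (void)cols; // only used by the packing/kernel steps sketched below
  for (int k2 = 0; k2 < depth; k2 += kc)      // GEMM_VAR1: one horizontal rhs panel
  {
    // pack rhs(k2..k2+kc, 0..cols) into blockB          (kept hot in L2)
    for (int i2 = 0; i2 < rows; i2 += mc)     // GEPP_VAR1: one mc x kc lhs block
    {
      // pack lhs(i2..i2+mc, k2..k2+kc) into blockA      (kept hot in L1)
      // gebp kernel: res(i2.., 0..cols) += blockA * blockB
    }
  }
}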
|
||||
// optimized GEneral packed Block * packed Panel product kernel
|
||||
template<typename Scalar, int mr, int nr, typename Conj>
|
||||
struct ei_gebp_kernel
|
||||
{
|
||||
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
|
||||
{
|
||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||
enum { PacketSize = ei_packet_traits<Scalar>::size };
|
||||
if(strideA==-1) strideA = depth;
|
||||
if(strideB==-1) strideB = depth;
|
||||
Conj cj;
|
||||
int packet_cols = (cols/nr) * nr;
|
||||
const int peeled_mc = (rows/mr)*mr;
|
||||
// loops on each cache friendly block of the result/rhs
|
||||
for(int j2=0; j2<packet_cols; j2+=nr)
|
||||
{
|
||||
// loops on each register blocking of lhs/res
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C1, C2, C3, C4, C5, C6, C7;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
|
||||
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
|
||||
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
|
||||
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
|
||||
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
|
||||
|
||||
// performs "inner" product
|
||||
// TODO let's check wether the flowing peeled loop could not be
|
||||
// optimized via optimal prefetching from one loop to the other
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
const int peeled_kc = (depth/4)*4;
|
||||
for(int k=0; k<peeled_kc; k+=4)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[2*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[3*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[4*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[5*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
|
||||
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
A0 = ei_pload(&blA[6*PacketSize]);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
A1 = ei_pload(&blA[7*PacketSize]);
|
||||
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
|
||||
blB += 4*nr*PacketSize;
|
||||
blA += 4*mr;
|
||||
}
|
||||
// process remaining peeled loop
|
||||
for(int k=peeled_kc; k<depth; k++)
|
||||
{
|
||||
PacketType B0, B1, B2, B3, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
B1 = ei_pload(&blB[1*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
C5 = cj.pmadd(A1, B1, C5);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C6 = cj.pmadd(A1, B2, C6);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
|
||||
|
||||
blB += nr*PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i], C0);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i], C1);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
|
||||
ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
|
||||
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
|
||||
}
|
||||
for(int i=peeled_mc; i<rows; i++)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// gets a 1 x nr res block as registers
|
||||
Scalar C0(0), C1(0), C2(0), C3(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
Scalar B0, B1, B2, B3, A0;
|
||||
|
||||
A0 = blA[k];
|
||||
B0 = blB[0*PacketSize];
|
||||
B1 = blB[1*PacketSize];
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
if(nr==4) B2 = blB[2*PacketSize];
|
||||
if(nr==4) B3 = blB[3*PacketSize];
|
||||
C1 = cj.pmadd(A0, B1, C1);
|
||||
if(nr==4) C2 = cj.pmadd(A0, B2, C2);
|
||||
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
|
||||
|
||||
blB += nr*PacketSize;
|
||||
}
|
||||
res[(j2+0)*resStride + i] += C0;
|
||||
res[(j2+1)*resStride + i] += C1;
|
||||
if(nr==4) res[(j2+2)*resStride + i] += C2;
|
||||
if(nr==4) res[(j2+3)*resStride + i] += C3;
|
||||
}
|
||||
}
|
||||
|
||||
// process remaining rhs/res columns one at a time
|
||||
// => do the same but with nr==1
|
||||
for(int j2=packet_cols; j2<cols; j2++)
|
||||
{
|
||||
for(int i=0; i<peeled_mc; i+=mr)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// TODO move the res loads to the stores
|
||||
|
||||
// gets res block as register
|
||||
PacketType C0, C4;
|
||||
C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
|
||||
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
|
||||
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
for(int k=0; k<depth; k++)
|
||||
{
|
||||
PacketType B0, A0, A1;
|
||||
|
||||
A0 = ei_pload(&blA[0*PacketSize]);
|
||||
A1 = ei_pload(&blA[1*PacketSize]);
|
||||
B0 = ei_pload(&blB[0*PacketSize]);
|
||||
C0 = cj.pmadd(A0, B0, C0);
|
||||
C4 = cj.pmadd(A1, B0, C4);
|
||||
|
||||
blB += PacketSize;
|
||||
blA += mr;
|
||||
}
|
||||
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i], C0);
|
||||
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
|
||||
}
|
||||
for(int i=peeled_mc; i<rows; i++)
|
||||
{
|
||||
const Scalar* blA = &blockA[i*strideA+offsetA];
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
_mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
|
||||
#endif
|
||||
|
||||
// gets a 1 x 1 res block as registers
|
||||
Scalar C0(0);
|
||||
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
|
||||
for(int k=0; k<depth; k++)
|
||||
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
|
||||
res[(j2+0)*resStride + i] += C0;
|
||||
}
|
||||
}
|
||||
}
|
||||
};

// pack a block of the lhs
// The traversal is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
    int count = 0;
    const int peeled_mc = (rows/mr)*mr;
    for(int i=0; i<peeled_mc; i+=mr)
    {
      if(PanelMode) count += mr * offset;
      for(int k=0; k<depth; k++)
        for(int w=0; w<mr; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += mr * (stride-offset-depth);
    }
    for(int i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(int k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
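
As a plain C++ sketch of the traversal pictured above (illustrative only: column-major input assumed, no conjugation, no panel mode), packing the lhs amounts to interleaving mr consecutive rows so the kernel can walk the packed block linearly:

// Pack a rows x depth column-major matrix into mr-row panels, matching the
// ordering in the comment above (for mr==4: 0 4 8 12 / 1 5 9 13 / ...).
void pack_lhs_sketch(float* blockA, const float* lhs, int lhsStride,
                     int depth, int rows, int mr)
{
  int count = 0;
  const int peeled_mc = (rows/mr)*mr;
  for (int i = 0; i < peeled_mc; i += mr)       // full mr-row panels
    for (int k = 0; k < depth; ++k)
      for (int w = 0; w < mr; ++w)
        blockA[count++] = lhs[k*lhsStride + i + w];
  for (int i = peeled_mc; i < rows; ++i)        // leftover rows, one at a time
    for (int k = 0; k < depth; ++k)
      blockA[count++] = lhs[k*lhsStride + i];
}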

// copy a complete panel of the rhs while expanding each coefficient into a packet form
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
          if (nr==4)
          {
            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
          }
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
          count += PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          ei_pstore(&blockB[count], ei_pset1(b0[k]));
          count += PacketSize;
        }
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};
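
The "expansion into packet form" done above can be pictured with this minimal scalar sketch (not Eigen code): every rhs coefficient, optionally pre-scaled by alpha, is replicated PacketSize times so that the kernel can fetch it back with one aligned packet load instead of a runtime broadcast.

// Scalar stand-in for ei_pset1 + ei_pstore: broadcast one rhs column into
// packet-sized slots of blockB (single column case, i.e. the nr==1 tail).
void pack_rhs_column_sketch(float* blockB, const float* rhs,
                            int depth, int PacketSize, float alpha)
{
  int count = 0;
  for (int k = 0; k < depth; ++k)
  {
    for (int p = 0; p < PacketSize; ++p)   // replicate the coefficient
      blockB[count + p] = alpha * rhs[k];
    count += PacketSize;
  }
}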

// this version is optimized for row major matrices
template<typename Scalar, int nr, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols,
                  int stride=0, int offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    int packet_cols = (cols/nr) * nr;
    int count = 0;
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += PacketSize * nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
          count += nr*PacketSize;
        }
      }
      else
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
          count += nr*PacketSize;
        }
      }
      // skip what we have after
      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += PacketSize * offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
        count += PacketSize;
      }
      if(PanelMode) count += PacketSize * (stride-offset-depth);
    }
  }
};

#endif // EIGEN_EXTERN_INSTANTIATIONS

/*********************************************************************************
* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
* implementation of the high level wrapper to ei_general_matrix_matrix_product
**********************************************************************************/

template<typename Lhs, typename Rhs>
struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};

template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
  public:
    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

    template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
    {
      ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

      const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
      const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                                 * RhsBlasTraits::extractScalarFactor(m_rhs);

      ei_general_matrix_matrix_product<
        Scalar,
        (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate),
        (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate),
        (Dest::Flags&RowMajorBit)?RowMajor:ColMajor>
      ::run(
          this->rows(), this->cols(), lhs.cols(),
          (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
          (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
          (Scalar*)&(dst.coeffRef(0,0)), dst.stride(),
          actualAlpha);
    }
};
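
At the user level this wrapper is reached transparently. A hedged usage sketch (sizes are arbitrary; the scalar factor on the operand is what the BlasTraits extractScalarFactor calls above fold into actualAlpha):

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  MatrixXf a = MatrixXf::Random(64, 64), b = MatrixXf::Random(64, 64);
  MatrixXf c = MatrixXf::Zero(64, 64);
  // Large dynamic-size product: expected to dispatch to the GemmProduct path,
  // with the factor 2 extracted from the lhs expression rather than evaluated separately.
  c += (2.0f * a) * b;
  return 0;
}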

#endif // EIGEN_GENERAL_MATRIX_MATRIX_H

385
Eigen/src/Core/products/GeneralUnrolled.h
Normal file
@ -0,0 +1,385 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_UNROLLED_PRODUCT_H
#define EIGEN_GENERAL_UNROLLED_PRODUCT_H

/*********************************************************************************
* Specialization of GeneralProduct<> for products with small fixed sizes
*********************************************************************************/

/* Since all the dimensions of the product are small, here we can rely
 * on the generic Assign mechanism to evaluate the product per coeff (or packet).
 *
 * Note that here the inner-loops should always be unrolled.
 */

template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl;

template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl;

template<typename LhsNested, typename RhsNested>
struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> >
{
  typedef typename ei_cleantype<LhsNested>::type _LhsNested;
  typedef typename ei_cleantype<RhsNested>::type _RhsNested;
  typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;

  enum {
    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
    LhsFlags = _LhsNested::Flags,
    RhsFlags = _RhsNested::Flags,

    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                    && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                    && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0),

    EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs),

    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
          | EvalBeforeAssigningBit
          | EvalBeforeNestingBit
          | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0)
          | (LhsFlags & RhsFlags & AlignedBit),

    CoeffReadCost = InnerSize == Dynamic ? Dynamic
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, which doesn't affect the ABI.
     */
    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                      && (InnerSize % ei_packet_traits<Scalar>::size == 0)
  };
};

template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct>
  : ei_no_assignment_operator,
    public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> >
{
  public:

    EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct)

  private:

    typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested;
    typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested;

    enum {
      PacketSize = ei_packet_traits<Scalar>::size,
      InnerSize = ei_traits<GeneralProduct>::InnerSize,
      Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
      CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner
    };

    typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
                                  Unroll ? InnerSize-1 : Dynamic,
                                  _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

  public:

    template<typename Lhs, typename Rhs>
    inline GeneralProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
      // We still allow mixing T and complex<T>.
      EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
      ei_assert(lhs.cols() == rhs.rows()
        && "invalid matrix product"
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

    EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); }
    EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); }

    EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const
    {
      Scalar res;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
    EIGEN_STRONG_INLINE const Scalar coeff(int index) const
    {
      Scalar res;
      const int row = RowsAtCompileTime == 1 ? 0 : index;
      const int col = RowsAtCompileTime == 1 ? index : 0;
      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

    template<int LoadMode>
    EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const
    {
      PacketScalar res;
      ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                             Unroll ? InnerSize-1 : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar, LoadMode>
        ::run(row, col, m_lhs, m_rhs, res);
      return res;
    }

  protected:
    const LhsNested m_lhs;
    const RhsNested m_rhs;
};
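
Because this specialization exposes coeff()/packet() instead of a single run() routine, a small fixed-size product is evaluated by the ordinary assignment loop, one coefficient (or packet) at a time. A hedged usage sketch (assuming a 3x3 product falls below the Small/Large threshold so the UnrolledProduct path is selected):

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  Matrix3f a = Matrix3f::Random(), b = Matrix3f::Random();
  // All three dimensions are small and fixed, so the product is expected to be
  // evaluated coefficient-wise through the unrollers defined below.
  Matrix3f c = a * b;
  return (c(0,0) == c(0,0)) ? 0 : 1; // trivial use of the result
}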

/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/

/**************************************
*** Scalar path - no vectorization ***
**************************************/

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
    res += lhs.coeff(row, Index) * rhs.coeff(Index, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
    for(int i = 1; i < lhs.cols(); ++i)
      res += lhs.coeff(row, i) * rhs.coeff(i, col);
  }
};

// prevent buggy user code from causing an infinite recursion
template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {}
};
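
The Index-based recursion above is ordinary template meta-unrolling; stripped of Eigen's expression types it boils down to this self-contained sketch (illustrative names only):

// Compile-time unrolled dot product: unrolled_dot<3>::run(a, b) expands to
// a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3] with no runtime loop,
// mirroring the ei_product_coeff_impl<NoVectorization, Index, ...> recursion.
template<int Index>
struct unrolled_dot
{
  static float run(const float* a, const float* b)
  { return unrolled_dot<Index-1>::run(a, b) + a[Index]*b[Index]; }
};

template<>
struct unrolled_dot<0>
{
  static float run(const float* a, const float* b) { return a[0]*b[0]; }
};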

/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller
{
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) ));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar>
struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
  {
    pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
  }
};

template<int Index, typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::PacketScalar PacketScalar;
  enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    PacketScalar pres;
    ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
    ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
    res = ei_predux(pres);
  }
};

template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct ei_product_coeff_vectorized_dyn_selector
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col));
  }
};

// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
      LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col));
  }
};

template<typename Lhs, typename Rhs, int LhsRows>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
  EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs);
  }
};

template<typename Lhs, typename Rhs>
struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
  EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    res = ei_dot_impl<
      Lhs,
      Rhs,
      LinearVectorization, NoUnrolling>::run(lhs, rhs);
  }
};

template<typename Lhs, typename Rhs, typename RetScalar>
struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
  {
    ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
  }
};

/*******************
*** Packet path ***
*******************/

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
  }
};

template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
    res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
  {
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
  }
};

template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
{
  EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
  {
    ei_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
    res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
    for(int i = 1; i < lhs.cols(); ++i)
      res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
  }
};

#endif // EIGEN_GENERAL_UNROLLED_PRODUCT_H

@ -339,4 +339,56 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
  }
};

/***************************************************************************
* Wrapper to ei_product_selfadjoint_matrix
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit),
    LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit,
    RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit),
    RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit
  };

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_product_selfadjoint_matrix<Scalar,
      EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,
                        ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,
                        ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)),
      ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
    ::run(
      lhs.rows(), rhs.cols(),           // sizes
      &lhs.coeff(0,0), lhs.stride(),    // lhs info
      &rhs.coeff(0,0), rhs.stride(),    // rhs info
      &dst.coeffRef(0,0), dst.stride(), // result info
      actualAlpha                       // alpha
    );
  }
};

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H

@ -154,5 +154,48 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
  ei_aligned_stack_delete(Scalar, const_cast<Scalar*>(rhs), size);
}

/***************************************************************************
* Wrapper to ei_product_selfadjoint_vector
***************************************************************************/

template<typename Lhs, int LhsMode, typename Rhs>
struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
 : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};

template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void addTo(Dest& dst, Scalar alpha) const
  {
    ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet");
    ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
    (
      lhs.rows(),                                 // size
      &lhs.coeff(0,0), lhs.stride(),              // lhs info
      &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info
      &dst.coeffRef(0),                           // result info
      actualAlpha                                 // scale factor
    );
  }
};


#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H

@ -139,7 +139,7 @@ struct ei_product_blocking_traits
    // register block size along the N direction (must be either 2 or 4)
    nr = HalfRegisterCount/2,

    // register block size along the M direction (this cannot be modified)
    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * PacketSize,

    // max cache block size along the K direction
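
As a rough illustration of what these constants mean for the kernel's register tile (the concrete numbers below are assumptions for single-precision SSE, not something this hunk states):

#include <cstdio>

int main()
{
  const int PacketSize = 4;            // assumed: 4 floats per SSE register
  const int HalfRegisterCount = 8;     // assumed: half of 16 XMM registers on x86-64
  const int nr = HalfRegisterCount/2;  // register block along N -> 4
  const int mr = 2*PacketSize;         // register block along M -> 8
  std::printf("the kernel accumulates a %dx%d block of res in registers\n", mr, nr);
  return 0;
}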

@ -144,7 +144,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
  // const Scalar dst_var = dst_demean.rowwise().squaredNorm().sum() * one_over_n;

  // Eq. (38)
  const MatrixType sigma = (dst_demean * src_demean.transpose() * one_over_n).lazy();
  const MatrixType sigma = (one_over_n * dst_demean * src_demean.transpose()).lazy();

  SVD<MatrixType> svd(sigma);

@ -148,7 +148,6 @@ template<typename Scalar, int Mode> void transformations(void)
  Transform3 tmat3(mat3), tmat4(mat4);
  if(Mode!=int(AffineCompact))
    tmat4.matrix()(3,3) = Scalar(1);
  std::cerr << tmat3.matrix() << "\n\n" << tmat4.matrix() << "\n\n";
  VERIFY_IS_APPROX(tmat3.matrix(), tmat4.matrix());

  Scalar a3 = ei_random<Scalar>(-Scalar(M_PI), Scalar(M_PI));

@ -65,7 +65,12 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
  VERIFY_IS_APPROX((m1+m2)*s1, s1*m1+s1*m2);
  VERIFY_IS_APPROX((m1+m2)(r,c), (m1(r,c))+(m2(r,c)));
  VERIFY_IS_APPROX(m1.cwise() * m1.block(0,0,rows,cols), m1.cwise() * m1);
  VERIFY_IS_APPROX((m1*m1.transpose())*m2, m1*(m1.transpose()*m2));
  if (MatrixType::RowsAtCompileTime<EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD) {
    // If the matrices are too large, it is better to use the optimized GEMM
    // routines, which allocate temporaries. However, on some platforms
    // these temporaries are allocated on the stack using alloca.
    VERIFY_IS_APPROX((m1*m1.transpose())*m2, m1*(m1.transpose()*m2));
  }
}

void test_nomalloc()