* split CacheFriendlyProduct into multiple smaller files

* add an efficient selfadjoint * vector implementation (= blas symv) perf are inbetween MKL and GOTO => the interface is still missing (have to be rethougth)
2025-01-24 14:45:14 +08:00 · 2009-02-21 20:20:38 +00:00 · 2009-02-21 20:20:38 +00:00 · de014efdaf
commit de014efdaf
parent 3d86dcf473
6 changed files with 663 additions and 385 deletions
--- a/Eigen/Core
+++ b/Eigen/Core
@ -145,7 +145,9 @@ namespace Eigen {
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/Part.h"
-#include "src/Core/CacheFriendlyProduct.h"
+#include "src/Core/products/GeneralMatrixMatrix.h"
+#include "src/Core/products/GeneralMatrixVector.h"
+#include "src/Core/products/SelfadjointMatrixVector.h"

 } // namespace Eigen

--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@ -0,0 +1,409 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+template <int L2MemorySize,typename Scalar>
+struct ei_L2_block_traits {
+  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
+};
+
+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
+template<typename Scalar>
+static void ei_cache_friendly_product(
+  int _rows, int _cols, int depth,
+  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
+  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
+  bool resRowMajor, Scalar* res, int resStride)
+{
+  const Scalar* EIGEN_RESTRICT lhs;
+  const Scalar* EIGEN_RESTRICT rhs;
+  int lhsStride, rhsStride, rows, cols;
+  bool lhsRowMajor;
+
+  if (resRowMajor)
+  {
+    lhs = _rhs;
+    rhs = _lhs;
+    lhsStride = _rhsStride;
+    rhsStride = _lhsStride;
+    cols = _rows;
+    rows = _cols;
+    lhsRowMajor = !_rhsRowMajor;
+    ei_assert(_lhsRowMajor);
+  }
+  else
+  {
+    lhs = _lhs;
+    rhs = _rhs;
+    lhsStride = _lhsStride;
+    rhsStride = _rhsStride;
+    rows = _rows;
+    cols = _cols;
+    lhsRowMajor = _lhsRowMajor;
+    ei_assert(!_rhsRowMajor);
+  }
+
+  typedef typename ei_packet_traits<Scalar>::type PacketType;
+
+  enum {
+    PacketSize = sizeof(PacketType)/sizeof(Scalar),
+    #if (defined __i386__)
+    // i386 architecture provides only 8 xmm registers,
+    // so let's reduce the max number of rows processed at once.
+    MaxBlockRows = 4,
+    MaxBlockRows_ClampingMask = 0xFFFFFC,
+    #else
+    MaxBlockRows = 8,
+    MaxBlockRows_ClampingMask = 0xFFFFF8,
+    #endif
+    // maximal size of the blocks fitted in L2 cache
+    MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
+  };
+
+  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
+
+  const int remainingSize = depth % PacketSize;
+  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
+  const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
+  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
+  Scalar* EIGEN_RESTRICT block = 0;
+  const int allocBlockSize = l2BlockRows*size;
+  block = ei_aligned_stack_new(Scalar, allocBlockSize);
+  Scalar* EIGEN_RESTRICT rhsCopy
+    = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
+
+#ifndef EIGEN_USE_NEW_PRODUCT
+  // loops on each L2 cache friendly blocks of the result
+  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+  {
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask;    // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;         // number of remaining rows
+    //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
+
+    // build a cache friendly blocky matrix
+    int count = 0;
+
+    // copy l2blocksize rows of m_lhs to blocks of ps x bw
+    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+    {
+      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+      for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
+      {
+        // TODO merge the "if l2blockRemainingRows" using something like:
+        // const int blockRows = std::min(i+MaxBlockRows, rows) - i;
+
+        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+        {
+          // TODO write these loops using meta unrolling
+          // negligible for large matrices but useful for small ones
+          if (lhsRowMajor)
+          {
+            for (int w=0; w<MaxBlockRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(i+w)*lhsStride + (k+s)];
+          }
+          else
+          {
+            for (int w=0; w<MaxBlockRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(i+w) + (k+s)*lhsStride];
+          }
+        }
+      }
+      if (l2blockRemainingRows>0)
+      {
+        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+        {
+          if (lhsRowMajor)
+          {
+            for (int w=0; w<l2blockRemainingRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
+          }
+          else
+          {
+            for (int w=0; w<l2blockRemainingRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
+          }
+        }
+      }
+    }
+
+    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+    {
+      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
+
+      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+      {
+        // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
+        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+        // if not aligned, copy the rhs block
+        if (needRhsCopy)
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
+            memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
+          }
+
+        // for each bw x 1 result's block
+        for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
+        {
+          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
+          const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
+
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            const Scalar* EIGEN_RESTRICT rhsColumn;
+            if (needRhsCopy)
+              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+            else
+              rhsColumn = &(rhs[l1j*rhsStride]);
+
+            PacketType dst[MaxBlockRows];
+            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
+            if (MaxBlockRows==8)
+              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
+
+            PacketType tmp;
+
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+            {
+              tmp = ei_ploadu(&rhsColumn[k]);
+              PacketType A0, A1, A2, A3, A4, A5;
+              A0 = ei_pload(localB + k*MaxBlockRows);
+              A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
+              A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
+              A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
+              if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
+              if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
+              dst[0] = ei_pmadd(tmp, A0, dst[0]);
+              if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
+              dst[1] = ei_pmadd(tmp, A1, dst[1]);
+              if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
+              dst[2] = ei_pmadd(tmp, A2, dst[2]);
+              dst[3] = ei_pmadd(tmp, A3, dst[3]);
+              if (MaxBlockRows==8)
+              {
+                dst[4] = ei_pmadd(tmp, A4, dst[4]);
+                dst[5] = ei_pmadd(tmp, A5, dst[5]);
+                dst[6] = ei_pmadd(tmp, A0, dst[6]);
+                dst[7] = ei_pmadd(tmp, A1, dst[7]);
+              }
+            }
+
+            Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
+
+            if (PacketSize>1 && resIsAligned)
+            {
+              // the result is aligned: let's do packet reduction
+              ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
+              if (PacketSize==2)
+                ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
+              if (MaxBlockRows==8)
+              {
+                ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
+                if (PacketSize==2)
+                  ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
+              }
+            }
+            else
+            {
+              // not aligned => per coeff packet reduction
+              localRes[0] += ei_predux(dst[0]);
+              localRes[1] += ei_predux(dst[1]);
+              localRes[2] += ei_predux(dst[2]);
+              localRes[3] += ei_predux(dst[3]);
+              if (MaxBlockRows==8)
+              {
+                localRes[4] += ei_predux(dst[4]);
+                localRes[5] += ei_predux(dst[5]);
+                localRes[6] += ei_predux(dst[6]);
+                localRes[7] += ei_predux(dst[7]);
+              }
+            }
+          }
+        }
+        if (l2blockRemainingRows>0)
+        {
+          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
+          const Scalar* localB = &block[offsetblock];
+
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            const Scalar* EIGEN_RESTRICT rhsColumn;
+            if (needRhsCopy)
+              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+            else
+              rhsColumn = &(rhs[l1j*rhsStride]);
+
+            PacketType dst[MaxBlockRows];
+            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
+            if (MaxBlockRows==8)
+              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
+
+            // let's declare a few other temporary registers
+            PacketType tmp;
+
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+            {
+              tmp = ei_pload(&rhsColumn[k]);
+
+                                           dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows             ])), dst[0]);
+              if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+  PacketSize])), dst[1]);
+              if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
+              if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
+              if (MaxBlockRows==8)
+              {
+                if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
+                if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
+                if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
+                if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
+              }
+            }
+
+            Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
+
+            // process the remaining rows once at a time
+                                         localRes[0] += ei_predux(dst[0]);
+            if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
+            if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
+            if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
+            if (MaxBlockRows==8)
+            {
+              if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
+              if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
+              if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
+              if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
+            }
+
+          }
+        }
+      }
+    }
+  }
+  if (PacketSize>1 && remainingSize)
+  {
+    if (lhsRowMajor)
+    {
+      for (int j=0; j<cols; ++j)
+        for (int i=0; i<rows; ++i)
+        {
+          Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
+          // FIXME this loop get vectorized by the compiler !
+          for (int k=1; k<remainingSize; ++k)
+            tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
+          res[i+j*resStride] += tmp;
+        }
+    }
+    else
+    {
+      for (int j=0; j<cols; ++j)
+        for (int i=0; i<rows; ++i)
+        {
+          Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
+          for (int k=1; k<remainingSize; ++k)
+            tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
+          res[i+j*resStride] += tmp;
+        }
+    }
+  }
+
+#else
+  // loops on each L2 cache friendly blocks of the result
+
+  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+  {
+    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+    {
+      // We have selected a block of lhs
+      // Packs this block into 'block'
+      for(int j=0; j<l2BlockCols; ++j)
+      {
+        int count = 0;
+        for(int i=0; i<l2BlockRows; ++i)
+        {
+          block[ (j*l2BlockCols) + i] = lhs[(j+l2j)*rows+l2i+count++];
+        }
+      }
+
+      // loops on each L2 cache firendly block of the result/rhs
+      for(int l2k=0; l2k<cols; l2k+=l2BlockCols)
+      {
+        for(int j=0; j<l2BlockCols; ++j)
+        {
+          for(int i=0; i<l2BlockRows; i+=PacketSize)
+          {
+            PacketType A0, A1, A2, A3, A4, A5;
+
+            // Load the packets from rhs and reorder them
+
+            // Here we need some vector reordering
+            // Right now its hardcoded to packets of 4 elements
+            A0 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)]);
+            A1 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+1]);
+            A2 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+2]);
+            A3 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+3]);
+
+            for(int k=0; k<l2BlockRows; k+=PacketSize)
+            {
+              PacketType L0, L1, L2, L3;
+
+              // We perform "cross products" of vectors to avoid
+              // reductions (horizontal ops) afterwards
+              A4 = ei_pload(&res[(j+l2k)*rows+l2i+k]);
+              L0 = ei_pload(&block[ k + (i + 0)*l2BlockRows ]);
+              L1 = ei_pload(&block[ k + (i + 1)*l2BlockRows ]);
+              A4 = ei_pmadd(L0, A0, A4);
+              L2 = ei_pload(&block[ k + (i + 2)*l2BlockRows ]);
+              A4 = ei_pmadd(L1, A1, A4);
+              L3 = ei_pload(&block[ k + (i + 3)*l2BlockRows ]);
+              A4 = ei_pmadd(L2, A2, A4);
+              A4 = ei_pmadd(L3, A3, A4);
+
+              ei_pstore(&res[(j+l2k)*rows+l2i+k], A4);
+            }
+          }
+        }
+      }
+    }
+  }
+#endif
+
+  ei_aligned_stack_delete(Scalar, block, allocBlockSize);
+  ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
+}
+
+#endif // EIGEN_EXTERN_INSTANTIATIONS
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@ -22,389 +22,8 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.

-#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
-#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
-
-template <int L2MemorySize,typename Scalar>
-struct ei_L2_block_traits {
-  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
-};
-
-#ifndef EIGEN_EXTERN_INSTANTIATIONS
-
-template<typename Scalar>
-static void ei_cache_friendly_product(
-  int _rows, int _cols, int depth,
-  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
-  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
-  bool resRowMajor, Scalar* res, int resStride)
-{
-  const Scalar* EIGEN_RESTRICT lhs;
-  const Scalar* EIGEN_RESTRICT rhs;
-  int lhsStride, rhsStride, rows, cols;
-  bool lhsRowMajor;
-
-  if (resRowMajor)
-  {
-    lhs = _rhs;
-    rhs = _lhs;
-    lhsStride = _rhsStride;
-    rhsStride = _lhsStride;
-    cols = _rows;
-    rows = _cols;
-    lhsRowMajor = !_rhsRowMajor;
-    ei_assert(_lhsRowMajor);
-  }
-  else
-  {
-    lhs = _lhs;
-    rhs = _rhs;
-    lhsStride = _lhsStride;
-    rhsStride = _rhsStride;
-    rows = _rows;
-    cols = _cols;
-    lhsRowMajor = _lhsRowMajor;
-    ei_assert(!_rhsRowMajor);
-  }
-
-  typedef typename ei_packet_traits<Scalar>::type PacketType;
-
-  enum {
-    PacketSize = sizeof(PacketType)/sizeof(Scalar),
-    #if (defined __i386__)
-    // i386 architecture provides only 8 xmm registers,
-    // so let's reduce the max number of rows processed at once.
-    MaxBlockRows = 4,
-    MaxBlockRows_ClampingMask = 0xFFFFFC,
-    #else
-    MaxBlockRows = 8,
-    MaxBlockRows_ClampingMask = 0xFFFFF8,
-    #endif
-    // maximal size of the blocks fitted in L2 cache
-    MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
-  };
-
-  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
-
-  const int remainingSize = depth % PacketSize;
-  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
-  const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
-  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
-  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
-  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
-  Scalar* EIGEN_RESTRICT block = 0;
-  const int allocBlockSize = l2BlockRows*size;
-  block = ei_aligned_stack_new(Scalar, allocBlockSize);
-  Scalar* EIGEN_RESTRICT rhsCopy
-    = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
-
-#ifndef EIGEN_USE_NEW_PRODUCT
-  // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
-  {
-    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask;    // end of the rows aligned to bw
-    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;         // number of remaining rows
-    //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
-
-    // build a cache friendly blocky matrix
-    int count = 0;
-
-    // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-    {
-      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-      for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
-      {
-        // TODO merge the "if l2blockRemainingRows" using something like:
-        // const int blockRows = std::min(i+MaxBlockRows, rows) - i;
-
-        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-        {
-          // TODO write these loops using meta unrolling
-          // negligible for large matrices but useful for small ones
-          if (lhsRowMajor)
-          {
-            for (int w=0; w<MaxBlockRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(i+w)*lhsStride + (k+s)];
-          }
-          else
-          {
-            for (int w=0; w<MaxBlockRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(i+w) + (k+s)*lhsStride];
-          }
-        }
-      }
-      if (l2blockRemainingRows>0)
-      {
-        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-        {
-          if (lhsRowMajor)
-          {
-            for (int w=0; w<l2blockRemainingRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
-          }
-          else
-          {
-            for (int w=0; w<l2blockRemainingRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
-          }
-        }
-      }
-    }
-
-    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
-    {
-      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
-
-      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-      {
-        // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
-        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-        // if not aligned, copy the rhs block
-        if (needRhsCopy)
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
-            memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
-          }
-
-        // for each bw x 1 result's block
-        for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
-        {
-          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
-          const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
-
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            const Scalar* EIGEN_RESTRICT rhsColumn;
-            if (needRhsCopy)
-              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
-            else
-              rhsColumn = &(rhs[l1j*rhsStride]);
-
-            PacketType dst[MaxBlockRows];
-            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
-            if (MaxBlockRows==8)
-              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
-
-            PacketType tmp;
-
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              tmp = ei_ploadu(&rhsColumn[k]);
-              PacketType A0, A1, A2, A3, A4, A5;
-              A0 = ei_pload(localB + k*MaxBlockRows);
-              A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
-              A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
-              A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
-              if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
-              if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
-              dst[0] = ei_pmadd(tmp, A0, dst[0]);
-              if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
-              dst[1] = ei_pmadd(tmp, A1, dst[1]);
-              if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
-              dst[2] = ei_pmadd(tmp, A2, dst[2]);
-              dst[3] = ei_pmadd(tmp, A3, dst[3]);
-              if (MaxBlockRows==8)
-              {
-                dst[4] = ei_pmadd(tmp, A4, dst[4]);
-                dst[5] = ei_pmadd(tmp, A5, dst[5]);
-                dst[6] = ei_pmadd(tmp, A0, dst[6]);
-                dst[7] = ei_pmadd(tmp, A1, dst[7]);
-              }
-            }
-
-            Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
-
-            if (PacketSize>1 && resIsAligned)
-            {
-              // the result is aligned: let's do packet reduction
-              ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
-              if (PacketSize==2)
-                ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
-              if (MaxBlockRows==8)
-              {
-                ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
-                if (PacketSize==2)
-                  ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
-              }
-            }
-            else
-            {
-              // not aligned => per coeff packet reduction
-              localRes[0] += ei_predux(dst[0]);
-              localRes[1] += ei_predux(dst[1]);
-              localRes[2] += ei_predux(dst[2]);
-              localRes[3] += ei_predux(dst[3]);
-              if (MaxBlockRows==8)
-              {
-                localRes[4] += ei_predux(dst[4]);
-                localRes[5] += ei_predux(dst[5]);
-                localRes[6] += ei_predux(dst[6]);
-                localRes[7] += ei_predux(dst[7]);
-              }
-            }
-          }
-        }
-        if (l2blockRemainingRows>0)
-        {
-          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
-          const Scalar* localB = &block[offsetblock];
-
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            const Scalar* EIGEN_RESTRICT rhsColumn;
-            if (needRhsCopy)
-              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
-            else
-              rhsColumn = &(rhs[l1j*rhsStride]);
-
-            PacketType dst[MaxBlockRows];
-            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
-            if (MaxBlockRows==8)
-              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
-
-            // let's declare a few other temporary registers
-            PacketType tmp;
-
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              tmp = ei_pload(&rhsColumn[k]);
-
-                                           dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows             ])), dst[0]);
-              if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+  PacketSize])), dst[1]);
-              if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
-              if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
-              if (MaxBlockRows==8)
-              {
-                if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
-                if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
-                if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
-                if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
-              }
-            }
-
-            Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
-
-            // process the remaining rows once at a time
-                                         localRes[0] += ei_predux(dst[0]);
-            if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
-            if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
-            if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
-            if (MaxBlockRows==8)
-            {
-              if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
-              if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
-              if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
-              if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
-            }
-
-          }
-        }
-      }
-    }
-  }
-  if (PacketSize>1 && remainingSize)
-  {
-    if (lhsRowMajor)
-    {
-      for (int j=0; j<cols; ++j)
-        for (int i=0; i<rows; ++i)
-        {
-          Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
-          // FIXME this loop get vectorized by the compiler !
-          for (int k=1; k<remainingSize; ++k)
-            tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
-          res[i+j*resStride] += tmp;
-        }
-    }
-    else
-    {
-      for (int j=0; j<cols; ++j)
-        for (int i=0; i<rows; ++i)
-        {
-          Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
-          for (int k=1; k<remainingSize; ++k)
-            tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
-          res[i+j*resStride] += tmp;
-        }
-    }
-  }
-
-#else
-  // loops on each L2 cache friendly blocks of the result
-
-  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
-  {
-    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
-    {
-      // We have selected a block of lhs
-      // Packs this block into 'block'
-      for(int j=0; j<l2BlockCols; ++j)
-      {
-        int count = 0;
-        for(int i=0; i<l2BlockRows; ++i)
-        {
-          block[ (j*l2BlockCols) + i] = lhs[(j+l2j)*rows+l2i+count++];
-        }
-      }
-
-      // loops on each L2 cache firendly block of the result/rhs
-      for(int l2k=0; l2k<cols; l2k+=l2BlockCols)
-      {
-        for(int j=0; j<l2BlockCols; ++j)
-        {
-          for(int i=0; i<l2BlockRows; i+=PacketSize)
-          {
-            PacketType A0, A1, A2, A3, A4, A5;
-
-            // Load the packets from rhs and reorder them
-
-            // Here we need some vector reordering
-            // Right now its hardcoded to packets of 4 elements
-            A0 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)]);
-            A1 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+1]);
-            A2 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+2]);
-            A3 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+3]);
-
-            for(int k=0; k<l2BlockRows; k+=PacketSize)
-            {
-              PacketType L0, L1, L2, L3;
-
-              // We perform "cross products" of vectors to avoid
-              // reductions (horizontal ops) afterwards
-              A4 = ei_pload(&res[(j+l2k)*rows+l2i+k]);
-              L0 = ei_pload(&block[ k + (i + 0)*l2BlockRows ]);
-              L1 = ei_pload(&block[ k + (i + 1)*l2BlockRows ]);
-              A4 = ei_pmadd(L0, A0, A4);
-              L2 = ei_pload(&block[ k + (i + 2)*l2BlockRows ]);
-              A4 = ei_pmadd(L1, A1, A4);
-              L3 = ei_pload(&block[ k + (i + 3)*l2BlockRows ]);
-              A4 = ei_pmadd(L2, A2, A4);
-              A4 = ei_pmadd(L3, A3, A4);
-
-              ei_pstore(&res[(j+l2k)*rows+l2i+k], A4);
-            }
-          }
-        }
-      }
-    }
-  }
-#endif
-
-  ei_aligned_stack_delete(Scalar, block, allocBlockSize);
-  ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
-}
-
-#endif // EIGEN_EXTERN_INSTANTIATIONS
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H

 /* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at onces that allows to both reduce
@ -813,4 +432,4 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
  #undef _EIGEN_ACCUMULATE_PACKETS
 }

-#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+template<bool Conjugate> struct ei_conj_if {
+  template<typename Scalar> Scalar operator() (const Scalar& x) const { return ei_conj(x); }
+};
+
+template<> struct ei_conj_if<false> {
+  template<typename Scalar> Scalar& operator() (Scalar& x) const { return x; }
+};
+
+/* Optimized col-major selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 2 and to reduce
+ * the instruction dependency.
+ */
+template<typename Scalar, int StorageOrder, int UpLo>
+static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
+  int size,
+  const Scalar* lhs, int lhsStride,
+  const Scalar* rhs, //int rhsIncr,
+  Scalar* res)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+  
+  enum {
+    IsRowMajor = StorageOrder==RowMajorBit ? 1 : 0,
+    IsLower = UpLo == LowerTriangularBit ? 1 : 0,
+    FirstTriangular = IsRowMajor == IsLower
+  };
+  
+  ei_conj_if<NumTraits<Scalar>::IsComplex && IsRowMajor> conj0;
+  ei_conj_if<NumTraits<Scalar>::IsComplex && !IsRowMajor> conj1;
+
+  for (int i=0;i<size;i++)
+    res[i] = 0;
+  
+  int bound = std::max(0,size-8) & 0xfffffffE;
+  if (FirstTriangular)
+    bound = size - bound;
+
+  for (int j=FirstTriangular ? bound : 0;
+       j<(FirstTriangular ? size : bound);j+=2)
+  {
+    register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
+    register const Scalar* __restrict__ A1 = lhs + (j+1)*lhsStride;
+    
+    Scalar t0 = rhs[j];
+    Packet ptmp0 = ei_pset1(t0);
+    Scalar t1 = rhs[j+1];
+    Packet ptmp1 = ei_pset1(t1);
+    
+    Scalar t2 = 0;
+    Packet ptmp2 = ei_pset1(t2);
+    Scalar t3 = 0;
+    Packet ptmp3 = ei_pset1(t3);
+    
+    size_t starti = FirstTriangular ? 0 : j+2;
+    size_t endi   = FirstTriangular ? j : size;
+    size_t alignedEnd = starti;
+    size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti);
+    alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+    res[j]   += t0 * conj0(A0[j]);
+    if(FirstTriangular)
+    {
+      res[j+1]   += t1 * conj0(A1[j+1]);
+      res[j] += t1 * conj0(A1[j]);
+      t3 += conj1(A1[j]) * rhs[j];
+    }
+    else
+    {
+      res[j+1] += t0 * conj0(A0[j+1]) + t1 * conj0(A1[j+1]);
+      t2 += conj1(A0[j+1]) * rhs[j+1];
+    }
+    
+    for (size_t i=starti; i<alignedStart; ++i)
+    {
+      res[i] += t0 * A0[i] + t1 * A1[i];
+      t2 += ei_conj(A0[i]) * rhs[i];
+      t3 += ei_conj(A1[i]) * rhs[i];
+    }
+    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    {
+      Packet A0i = ei_ploadu(&A0[i]);
+      Packet A1i = ei_ploadu(&A1[i]);
+      Packet Bi = ei_ploadu(&rhs[i]); // FIXME should be aligned in most cases
+      Packet Xi = ei_pload(&res[i]);
+      
+      Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, conj0(A0i))), ei_pmul(ptmp1, conj0(A1i)));
+      ptmp2 = ei_padd(ptmp2, ei_pmul(conj1(A0i), Bi));
+      ptmp3 = ei_padd(ptmp3, ei_pmul(conj1(A1i), Bi));
+      ei_pstore(&res[i],Xi);
+    }
+    for (size_t i=alignedEnd; i<endi; i++)
+    {
+      res[i] += t0 * conj0(A0[i]) + t1 * conj0(A1[i]);
+      t2 += conj1(A0[i]) * rhs[i];
+      t3 += conj1(A1[i]) * rhs[i];
+    }
+    
+    res[j]   += t2 + ei_predux(ptmp2);
+    res[j+1] += t3 + ei_predux(ptmp3);
+  }
+  for (int j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+  {
+    register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
+    
+    Scalar t1 = rhs[j];
+    Scalar t2 = 0;
+    res[j] += t1 * conj0(A0[j]);
+    for (int i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) {
+      res[i] += t1 * conj0(A0[i]);
+      t2 += conj1(A0[i]) * rhs[i];
+    }
+    res[j] += t2;
+  }
+  /*
+  // colmajor - upper
+  for (int j=0;j<size;j++)
+  {
+    register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
+    
+    Scalar t1 = rhs[j];
+    Scalar t2 = 0;
+    for (int i=0; i<j; i+=PacketSize) {
+      res[i] += t1 * A0[i];
+      t2 += A0[i] * rhs[i];
+    }
+    res[j] += t1 * A0[j];
+    res[j] += t2;
+  }
+  
+  // rowmajor - lower
+  for (int j=0;j<size;j++)
+  {
+    register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
+    
+    Scalar t1 = rhs[j];
+    Scalar t2 = 0;
+    for (int i=0; i<j; i+=PacketSize) {
+      res[i] += t1 * A0[i];
+      t2 += A0[i] * rhs[i];
+    }
+    res[j] += t1 * A0[j];
+    res[j] += t2;
+  }
+  */
+}
+
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -114,6 +114,7 @@ ei_add_test(cwiseop)
 ei_add_test(redux)
 ei_add_test(product_small)
 ei_add_test(product_large ${EI_OFLAG})
+ei_add_test(product_selfadjoint)
 ei_add_test(adjoint)
 ei_add_test(submatrices)
 ei_add_test(miscmatrices)
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+
+template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+
+  int rows = m.rows();
+  int cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols);
+  VectorType v1 = VectorType::Random(rows),
+             v2 = VectorType::Random(rows);
+  
+  m1 = m1.adjoint()*m1;
+  
+  // col-lower
+  m2.setZero();
+  m2.template part<LowerTriangular>() = m1;
+  ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,LowerTriangularBit>
+    (cols,m2.data(),cols, v1.data(), v2.data());
+  VERIFY_IS_APPROX(v2, m1 * v1);
+
+  // col-upper
+  m2.setZero();
+  m2.template part<UpperTriangular>() = m1;
+  ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,UpperTriangularBit>(cols,m2.data(),cols, v1.data(), v2.data());
+  VERIFY_IS_APPROX(v2, m1 * v1);
+
+}
+
+void test_product_selfadjoint()
+{
+  for(int i = 0; i < g_repeat ; i++) {
+    CALL_SUBTEST( product_selfadjoint(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST( product_selfadjoint(Matrix<float, 2, 2>()) );
+    CALL_SUBTEST( product_selfadjoint(Matrix3d()) );
+    CALL_SUBTEST( product_selfadjoint(MatrixXcf(4, 4)) );
+    CALL_SUBTEST( product_selfadjoint(MatrixXcd(21,21)) );
+    CALL_SUBTEST( product_selfadjoint(MatrixXd(17,17)) );
+    CALL_SUBTEST( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(18,18)) );
+    CALL_SUBTEST( product_selfadjoint(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(19, 19)) );
+  }
+}