* rework Map, allow vectorization

* rework PacketMath and DummyPacketMath, make these actual template
specializations instead of just overriding by non-template inline
functions
* introduce ei_ploadt and ei_pstoret, make use of them in Map and Matrix
* remove Matrix::map() methods, use Map constructors instead.
This commit is contained in:
Benoit Jacob 2008-06-27 01:22:35 +00:00
parent e5d301dc96
commit e27b2b95cf
15 changed files with 220 additions and 216 deletions

View File

@ -348,7 +348,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
{
const int row = rowMajor ? i : index;
const int col = rowMajor ? index : i;
dst.template writePacket<UnAligned>(row, col, src.template packet<UnAligned>(row, col));
dst.template writePacket<Unaligned>(row, col, src.template packet<Unaligned>(row, col));
}
// do the non-vectorizable part of the assignment

View File

@ -168,26 +168,26 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
template<int LoadMode>
inline PacketScalar packet(int row, int col) const
{
return m_matrix.template packet<UnAligned>(row + m_startRow.value(), col + m_startCol.value());
return m_matrix.template packet<Unaligned>(row + m_startRow.value(), col + m_startCol.value());
}
template<int LoadMode>
inline void writePacket(int row, int col, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
m_matrix.const_cast_derived().template writePacket<Unaligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}
template<int LoadMode>
inline PacketScalar packet(int index) const
{
return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
return m_matrix.template packet<Unaligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline void writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<UnAligned>
m_matrix.const_cast_derived().template writePacket<Unaligned>
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
}
@ -195,10 +195,10 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
protected:
const typename MatrixType::Nested m_matrix;
ei_int_if_dynamic<MatrixType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
ei_int_if_dynamic<MatrixType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
ei_int_if_dynamic<RowsAtCompileTime> m_blockRows;
ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
const ei_int_if_dynamic<MatrixType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
const ei_int_if_dynamic<MatrixType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows;
const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
};
/** \returns a dynamic-size expression of a block in *this.

View File

@ -214,7 +214,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
@ -232,7 +232,7 @@ MatrixBase<Derived>::packet(int row, int col) const
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
@ -250,7 +250,7 @@ inline void MatrixBase<Derived>::writePacket
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
@ -267,7 +267,7 @@ MatrixBase<Derived>::packet(int index) const
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/

View File

@ -30,50 +30,93 @@
// of generic vectorized code. However, at runtime, they should never be
// called, TODO so should we raise an assertion or not?
/** \internal \returns a + b (coeff-wise) */
template <typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
template <typename Packet> inline Packet
ei_padd(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns a - b (coeff-wise) */
template <typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
template <typename Packet> inline Packet
ei_psub(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns a * b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
template <typename Packet> inline Packet
ei_pmul(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns a / b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pdiv(const Scalar& a, const Scalar& b) { return a / b; }
/** \internal \returns a * b - c (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
{ return ei_padd(ei_pmul(a, b),c); }
template <typename Packet> inline Packet
ei_pdiv(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
template <typename Packet> inline Packet
ei_pmin(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
template <typename Packet> inline Packet
ei_pmax(const Packet&,
const Packet&) { Packet ret; return ret; }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template <typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
template <typename Scalar> inline typename ei_packet_traits<Scalar>::type
ei_pload(const Scalar*) { typename ei_packet_traits<Scalar>::type ret; return ret; }
/** \internal \returns a packet version of \a *from, (un-aligned load) */
template <typename Scalar> inline Scalar ei_ploadu(const Scalar* from) { return *from; }
template <typename Scalar> inline typename ei_packet_traits<Scalar>::type
ei_ploadu(const Scalar*) { typename ei_packet_traits<Scalar>::type ret; return ret; }
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template <typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
template <typename Scalar> inline typename ei_packet_traits<Scalar>::type
ei_pset1(const Scalar&) { typename ei_packet_traits<Scalar>::type ret; return ret; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
template <typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
template <typename Scalar, typename Packet> inline void ei_pstore(Scalar*, const Packet&) {}
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
template <typename Scalar> inline void ei_pstoreu(Scalar* to, const Scalar& from) { (*to) = from; }
template <typename Scalar, typename Packet> inline void ei_pstoreu(Scalar*, const Packet&) {}
/** \internal \returns the first element of a packet */
template <typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
template <typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pfirst(const Packet&)
{ typename ei_unpacket_traits<Packet>::type ret; return ret; }
/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
template <typename Scalar> inline Scalar ei_preduxp(const Scalar* vecs) { return vecs[0]; }
template <typename Packet> inline Packet
ei_preduxp(const Packet*) { Packet ret; return ret; }
/** \internal \returns the sum of the elements of \a a*/
template <typename Scalar> inline Scalar ei_predux(const Scalar& a) { return a; }
template <typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux(const Packet&)
{ typename ei_unpacket_traits<Packet>::type ret; return ret; }
////////////
/** \internal \returns a * b + c (coeff-wise) */
template <typename Packet> inline Packet
ei_pmadd(const Packet& a,
const Packet& b,
const Packet& c)
{ return ei_padd(ei_pmul(a, b),c); }
/** \internal \returns a packet version of \a *from. If LoadMode equals Aligned, \a from must be 16 bytes aligned */
template <typename Scalar, int LoadMode> inline typename ei_packet_traits<Scalar>::type ei_ploadt(const Scalar* from)
{
if(LoadMode == Aligned)
return ei_pload(from);
else
return ei_ploadu(from);
}
/** \internal copy the packet \a from to \a *to. If StoreMode equals Aligned, \a to must be 16 bytes aligned */
template <typename Scalar, typename Packet, int LoadMode> inline void ei_pstoret(Scalar* to, const Packet& from)
{
if(LoadMode == Aligned)
ei_pstore(to, from);
else
ei_pstoreu(to, from);
}
#endif // EIGEN_DUMMY_PACKET_MATH_H

View File

@ -65,9 +65,6 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
/** \internal */
inline const ExpressionType& _expression() const { return m_matrix; }
inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); }
inline int stride() const { return m_matrix.stride(); }

View File

@ -29,17 +29,19 @@
*
* \brief A matrix or vector expression mapping an existing array of data.
*
* \param Alignment can be either Aligned or Unaligned. Tells whether the array is suitably aligned for
* vectorization on the present CPU architecture. Defaults to Unaligned.
*
* This class represents a matrix or vector expression mapping an existing array of data.
* It can be used to let Eigen interface without any overhead with non-Eigen data structures,
* such as plain C arrays or structures from other libraries.
*
* This class is the return type of Matrix::map() and most of the time this is the only
* way it is used.
* This class is the return type of Matrix::map() but can also be used directly.
*
* \sa Matrix::map()
*/
template<typename MatrixType>
struct ei_traits<Map<MatrixType> >
template<typename MatrixType, int Alignment>
struct ei_traits<Map<MatrixType, Alignment> >
{
typedef typename MatrixType::Scalar Scalar;
enum {
@ -47,35 +49,37 @@ struct ei_traits<Map<MatrixType> >
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = MatrixType::Flags & (HereditaryBits | DirectAccessBit),
Flags = MatrixType::Flags
& (HereditaryBits | LinearAccessBit | DirectAccessBit)
& (Alignment == Aligned ? PacketAccessBit : 0),
CoeffReadCost = NumTraits<Scalar>::ReadCost
};
};
template<typename MatrixType> class Map
: public MatrixBase<Map<MatrixType> >
template<typename MatrixType, int Alignment> class Map
: public MatrixBase<Map<MatrixType, Alignment> >
{
public:
EIGEN_GENERIC_PUBLIC_INTERFACE(Map)
inline int rows() const { return m_rows; }
inline int cols() const { return m_cols; }
inline int rows() const { return m_rows.value(); }
inline int cols() const { return m_cols.value(); }
inline const Scalar& coeff(int row, int col) const
{
if(Flags & RowMajorBit)
return m_data[col + row * m_cols];
return m_data[col + row * m_cols.value()];
else // column-major
return m_data[row + col * m_rows];
return m_data[row + col * m_rows.value()];
}
inline Scalar& coeffRef(int row, int col)
{
if(Flags & RowMajorBit)
return const_cast<Scalar*>(m_data)[col + row * m_cols];
return const_cast<Scalar*>(m_data)[col + row * m_cols.value()];
else // column-major
return const_cast<Scalar*>(m_data)[row + col * m_rows];
return const_cast<Scalar*>(m_data)[row + col * m_rows.value()];
}
inline const Scalar& coeff(int index) const
@ -88,107 +92,69 @@ template<typename MatrixType> class Map
return m_data[index];
}
public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
template<int LoadMode>
inline PacketScalar packet(int row, int col) const
{
ei_assert(rows > 0
&& (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
&& cols > 0
&& (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>
(m_data + (Flags & RowMajorBit
? col + row * m_cols.value()
: row + col * m_rows.value()));
}
template<int LoadMode>
inline PacketScalar packet(int index) const
{
return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>(m_data + index);
}
template<int StoreMode>
inline void writePacket(int row, int col, const PacketScalar& x)
{
ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>
(m_data + (Flags & RowMajorBit
? col + row * m_cols.value()
: row + col * m_rows.value()), x);
}
template<int StoreMode>
inline void writePacket(int index, const PacketScalar& x)
{
ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>(m_data + index, x);
}
inline Map(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
{
ei_assert(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic);
ei_assert(RowsAtCompileTime > 0 && ColsAtCompileTime > 0);
}
inline Map(const Scalar* data, int size)
: m_data(data),
m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime),
m_cols(ColsAtCompileTime == Dynamic ? size : ColsAtCompileTime)
{
ei_assert(size > 0);
ei_assert((RowsAtCompileTime == 1
&& (ColsAtCompileTime == Dynamic || ColsAtCompileTime == size))
|| (ColsAtCompileTime == 1
&& (RowsAtCompileTime == Dynamic || RowsAtCompileTime == size)));
}
inline Map(const Scalar* data, int rows, int cols)
: m_data(data), m_rows(rows), m_cols(cols)
{
ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
&& cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
protected:
const Scalar* m_data;
const int m_rows, m_cols;
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
};
/** This is the const version of map(Scalar*,int,int). */
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline const Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(const Scalar* data, int rows, int cols)
{
return Map<Matrix>(data, rows, cols);
}
/** This is the const version of map(Scalar*,int). */
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline const Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(const Scalar* data, int size)
{
ei_assert(_Cols == 1 || _Rows ==1);
if(_Cols == 1)
return Map<Matrix>(data, size, 1);
else
return Map<Matrix>(data, 1, size);
}
/** This is the const version of map(Scalar*). */
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline const Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(const Scalar* data)
{
return Map<Matrix>(data, _Rows, _Cols);
}
/** \returns an expression of a matrix or vector mapping the given data.
*
* \param data The array of data to map
* \param rows The number of rows of the expression to construct
* \param cols The number of columns of the expression to construct
*
* Example: \include MatrixBase_map_int_int.cpp
* Output: \verbinclude MatrixBase_map_int_int.out
*
* \sa map(const Scalar*, int, int), map(Scalar*, int), map(Scalar*), class Map
*/
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(Scalar* data, int rows, int cols)
{
return Map<Matrix>(data, rows, cols);
}
/** \returns an expression of a vector mapping the given data.
*
* \param data The array of data to map
* \param size The size (number of coefficients) of the expression to construct
*
* \only_for_vectors
*
* Example: \include MatrixBase_map_int.cpp
* Output: \verbinclude MatrixBase_map_int.out
*
* \sa map(const Scalar*, int), map(Scalar*, int, int), map(Scalar*), class Map
*/
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(Scalar* data, int size)
{
ei_assert(_Cols == 1 || _Rows ==1);
if(_Cols == 1)
return Map<Matrix>(data, size, 1);
else
return Map<Matrix>(data, 1, size);
}
/** \returns an expression of a fixed-size matrix or vector mapping the given data.
*
* \param data The array of data to map
*
* Example: \include MatrixBase_map.cpp
* Output: \verbinclude MatrixBase_map.out
*
* \sa map(const Scalar*), map(Scalar*, int), map(Scalar*, int, int), class Map
*/
template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
inline Map<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>::map(Scalar* data)
{
return Map<Matrix>(data, _Rows, _Cols);
}
/** Constructor copying an existing array of data. Only useful for dynamic-size matrices:
* for fixed-size matrices, it is redundant to pass the \a rows and \a cols parameters.
* \param data The array of data to copy
@ -202,7 +168,7 @@ inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>
::Matrix(const Scalar *data, int rows, int cols)
: m_storage(rows*cols, rows, cols)
{
*this = map(data, rows, cols);
*this = Map<Matrix>(data, rows, cols);
}
/** Constructor copying an existing array of data. Only useful for dynamic-size vectors:
@ -220,7 +186,7 @@ inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>
::Matrix(const Scalar *data, int size)
: m_storage(size, RowsAtCompileTime == 1 ? 1 : size, ColsAtCompileTime == 1 ? 1 : size)
{
*this = map(data, size);
*this = Map<Matrix>(data, size);
}
/** Constructor copying an existing array of data.
@ -237,7 +203,7 @@ template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, uns
inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>
::Matrix(const Scalar *data)
{
*this = map(data);
*this = Map<Matrix>(data);
}
#endif // EIGEN_MAP_H

View File

@ -102,12 +102,13 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
{
public:
EIGEN_GENERIC_PUBLIC_INTERFACE(Matrix)
friend class Eigen::Map<Matrix, Unaligned>;
friend class Eigen::Map<Matrix, Aligned>;
protected:
ei_matrix_storage<Scalar, MaxSizeAtCompileTime, RowsAtCompileTime, ColsAtCompileTime> m_storage;
public:
friend class Map<Matrix>;
inline int rows() const { return m_storage.rows(); }
inline int cols() const { return m_storage.cols(); }
@ -149,50 +150,31 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
template<int LoadMode>
inline PacketScalar packet(int row, int col) const
{
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
return ei_ploadt<Scalar, LoadMode>
(m_storage.data() + (Flags & RowMajorBit
? col + row * m_storage.cols()
: row + col * m_storage.rows()));
}
template<int LoadMode>
inline PacketScalar packet(int index) const
{
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + index);
else
return ei_ploadu(m_storage.data() + index);
return ei_ploadt<Scalar, LoadMode>(m_storage.data() + index);
}
template<int StoreMode>
inline void writePacket(int row, int col, const PacketScalar& x)
{
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
else
ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
ei_pstoret<Scalar, PacketScalar, StoreMode>
(m_storage.data() + (Flags & RowMajorBit
? col + row * m_storage.cols()
: row + col * m_storage.rows()), x);
}
template<int StoreMode>
inline void writePacket(int index, const PacketScalar& x)
{
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + index, x);
else
ei_pstoreu(m_storage.data() + index, x);
ei_pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, x);
}
public:
@ -253,19 +235,13 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
EIGEN_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Matrix, *=)
EIGEN_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Matrix, /=)
static const Map<Matrix> map(const Scalar* array, int rows, int cols);
static const Map<Matrix> map(const Scalar* array, int size);
static const Map<Matrix> map(const Scalar* array);
static Map<Matrix> map(Scalar* array, int rows, int cols);
static Map<Matrix> map(Scalar* array, int size);
static Map<Matrix> map(Scalar* array);
/** Default constructor, does nothing. Only for fixed-size matrices.
* For dynamic-size matrices and vectors, this constructor is forbidden (guarded by
* an assertion) because it would leave the matrix without an allocated data buffer.
*/
inline explicit Matrix()
{
ei_assert(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic);
ei_assert(RowsAtCompileTime > 0 && ColsAtCompileTime > 0);
}

View File

@ -72,6 +72,11 @@ inline vector int ei_pmax(const vector int a, const vector int b) { r
inline vector float ei_pload(const float* from) { return vec_ld(0, from); }
inline vector int ei_pload(const int* from) { return vec_ld(0, from); }
inline vector float ei_ploadu(const float*)
{ EIGEN_STATIC_ASSERT(unaligned_load_and_store_operations_unimplemented_on_AltiVec) }
inline vector int ei_ploadu(const int* )
{ EIGEN_STATIC_ASSERT(unaligned_load_and_store_operations_unimplemented_on_AltiVec) }
inline vector float ei_pset1(const float& from)
{
static float __attribute__(aligned(16)) af[4];
@ -93,6 +98,11 @@ inline vector int ei_pset1(const int& from)
inline void ei_pstore(float* to, const vector float from) { vec_st(from, 0, to); }
inline void ei_pstore(int* to, const vector int from) { vec_st(from, 0, to); }
inline void ei_pstoreu(float*, const vector float)
{ EIGEN_STATIC_ASSERT(unaligned_load_and_store_operations_unimplemented_on_AltiVec) }
inline void ei_pstoreu(int* , const vector int )
{ EIGEN_STATIC_ASSERT(unaligned_load_and_store_operations_unimplemented_on_AltiVec) }
inline float ei_pfirst(const vector float a)
{
static float __attribute__(aligned(16)) af[4];

View File

@ -33,6 +33,10 @@ template<> struct ei_packet_traits<float> { typedef __m128 type; enum {size=4}
template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
template<> struct ei_packet_traits<int> { typedef __m128i type; enum {size=4}; };
template<> struct ei_unpacket_traits<__m128> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<__m128d> { typedef double type; enum {size=2}; };
template<> struct ei_unpacket_traits<__m128i> { typedef int type; enum {size=4}; };
template<> inline __m128 ei_padd(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); }
template<> inline __m128d ei_padd(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); }
template<> inline __m128i ei_padd(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); }
@ -79,29 +83,29 @@ template<> inline __m128i ei_pmax(const __m128i& a, const __m128i& b)
return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
}
inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); }
inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); }
inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
template<> inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); }
template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); }
template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
template<> inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); }
inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); }
inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); }
template<> inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); }
template<> inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); }
template<> inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); }
inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); }
inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); }
inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); }
template<> inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); }
template<> inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
template<> inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
template<> inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
template<> inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
template<> inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
template<> inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
template<> inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
#ifdef __SSE3__
// TODO implement SSE2 versions as well as integer versions

View File

@ -167,7 +167,7 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;
enum { Aligned=0, UnAligned=1 };
enum { Aligned=0, Unaligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
enum DirectionType { Vertical, Horizontal };

View File

@ -51,7 +51,7 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
template<typename Lhs, typename Rhs, int ProductMode> class Product;
template<typename CoeffsVectorType> class DiagonalMatrix;
template<typename MatrixType> class DiagonalCoeffs;
template<typename MatrixType> class Map;
template<typename MatrixType, int Alignment = Unaligned> class Map;
template<int Direction, typename UnaryOp, typename MatrixType> class PartialRedux;
template<typename MatrixType, unsigned int Mode> class Part;
template<typename MatrixType, unsigned int Mode> class Extract;

View File

@ -45,7 +45,7 @@
#define EIGEN_DEFAULT_MATRIX_FLAGS EIGEN_DEFAULT_MATRIX_STORAGE_ORDER
/** Define a hint size when dealling with large matrices and L2 cache friendlyness
/** Define a hint size when dealing with large matrices and L2 cache friendliness
* More precisely, its square value represents the amount of bytes which can be assumed to stay in L2 cache.
*/
#ifndef EIGEN_TUNE_FOR_L2_CACHE_SIZE
@ -136,15 +136,15 @@ typedef typename Base::PacketScalar PacketScalar; \
typedef typename Eigen::ei_nested<Derived>::type Nested; \
typedef typename Eigen::ei_eval<Derived>::type Eval; \
typedef typename Eigen::Inverse<Eval> InverseType; \
enum { RowsAtCompileTime = Base::RowsAtCompileTime, \
ColsAtCompileTime = Base::ColsAtCompileTime, \
MaxRowsAtCompileTime = Base::MaxRowsAtCompileTime, \
MaxColsAtCompileTime = Base::MaxColsAtCompileTime, \
enum { RowsAtCompileTime = Eigen::ei_traits<Derived>::RowsAtCompileTime, \
ColsAtCompileTime = Eigen::ei_traits<Derived>::ColsAtCompileTime, \
MaxRowsAtCompileTime = Eigen::ei_traits<Derived>::MaxRowsAtCompileTime, \
MaxColsAtCompileTime = Eigen::ei_traits<Derived>::MaxColsAtCompileTime, \
Flags = Eigen::ei_traits<Derived>::Flags, \
CoeffReadCost = Eigen::ei_traits<Derived>::CoeffReadCost, \
SizeAtCompileTime = Base::SizeAtCompileTime, \
MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
IsVectorAtCompileTime = Base::IsVectorAtCompileTime, \
Flags = Base::Flags, \
CoeffReadCost = Base::CoeffReadCost };
IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
_EIGEN_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::MatrixBase<Derived>) \

View File

@ -147,6 +147,13 @@ template<typename T> struct ei_packet_traits
enum {size=1};
};
template<typename T> struct ei_unpacket_traits
{
typedef T type;
enum {size=1};
};
template<typename Scalar, int Rows, int Cols, int MaxRows, int MaxCols, unsigned int SuggestedFlags>
class ei_corrected_matrix_flags
{

View File

@ -58,7 +58,8 @@
you_tried_calling_a_vector_method_on_a_matrix,
you_mixed_vectors_of_different_sizes,
you_mixed_matrices_of_different_sizes,
you_did_a_programming_error
you_did_a_programming_error,
unaligned_load_and_store_operations_unimplemented_on_AltiVec
};
};

View File

@ -31,16 +31,16 @@ template<typename VectorType> void tmap(const VectorType& m)
int size = m.size();
// test Map.h
Scalar* array1 = new Scalar[size];
Scalar* array2 = new Scalar[size];
VectorType::map(array1, size) = VectorType::random(size);
VectorType::map(array2, size) = VectorType::map(array1, size);
VectorType ma1 = VectorType::map(array1, size);
VectorType ma2 = VectorType::map(array2, size);
Scalar* array1 = ei_aligned_malloc<Scalar>(size);
Scalar* array2 = ei_aligned_malloc<Scalar>(size);
Map<VectorType, Aligned>(array1, size) = VectorType::random(size);
Map<VectorType>(array2, size) = Map<VectorType>(array1, size);
VectorType ma1 = Map<VectorType>(array1, size);
VectorType ma2 = Map<VectorType, Aligned>(array2, size);
VERIFY_IS_APPROX(ma1, ma2);
VERIFY_IS_APPROX(ma1, VectorType(array2, size));
delete[] array1;
delete[] array2;
ei_aligned_free(array1);
ei_aligned_free(array2);
}
void test_map()