First part of a big refactoring of alignment control to enable the handling of arbitrarily aligned buffers. It includes:

- AlignedBit flag is deprecated. Alignment is now specified by the evaluator through the 'Alignment' enum, e.g., evaluator<Xpr>::Alignment. Its value is in Bytes. - Add several enums to specify alignment: Aligned8, Aligned16, Aligned32, Aligned64, Aligned128. AlignedMax corresponds to EIGEN_MAX_ALIGN_BYTES. Such enums are used to define the above Alignment value, and as the 'Options' template parameter of Map<> and Ref<>. - The Aligned enum is now deprecated. It is now an alias for Aligned16. - Currently, traits<Matrix<>>, traits<Array<>>, traits<Ref<>>, traits<Map<>>, and traits<Block<>> also expose the Alignment enum.
2025-03-25 18:50:40 +08:00 · 2015-08-06 15:31:07 +02:00 · 2015-08-06 15:31:07 +02:00 · 1f5024332e
commit 1f5024332e
parent 65186ef18d
24 changed files with 231 additions and 200 deletions
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -28,18 +28,19 @@ template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
 struct copy_using_evaluator_traits
 {
  typedef typename DstEvaluator::XprType Dst;
-  
+  typedef typename Dst::Scalar DstScalar;
  enum {
    DstFlags = DstEvaluator::Flags,
-    SrcFlags = SrcEvaluator::Flags
+    SrcFlags = SrcEvaluator::Flags,
+    RequiredAlignment = packet_traits<DstScalar>::size*sizeof(DstScalar) // FIXME ask packet_traits for the true alignment requirement
  };
  
 public:
  enum {
-    DstIsAligned = DstFlags & AlignedBit,
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
    DstHasDirectAccess = DstFlags & DirectAccessBit,
-    SrcIsAligned = SrcFlags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };

 private:
@ -51,7 +52,7 @@ private:
              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
              : int(Dst::MaxRowsAtCompileTime),
    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Dst::Scalar>::size
+    PacketSize = packet_traits<DstScalar>::size
  };

  enum {
@ -62,10 +63,10 @@ private:
                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
                  && (functor_traits<AssignFunc>::PacketAccess),
    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
+                       && int(JointAlignment)>=int(RequiredAlignment),
    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
+                       && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
@ -107,8 +108,8 @@ public:
                                             : int(NoUnrolling)
                  )
              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) 
-                                                                    : int(NoUnrolling) )
+                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
+                                                                                             : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                              : int(NoUnrolling) )
@ -124,8 +125,9 @@ public:
    EIGEN_DEBUG_VAR(DstFlags)
    EIGEN_DEBUG_VAR(SrcFlags)
    std::cerr.unsetf(std::ios::hex);
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(RequiredAlignment)
    EIGEN_DEBUG_VAR(JointAlignment)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
@ -360,11 +362,13 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    typedef typename Kernel::Scalar Scalar;
+    typedef packet_traits<Scalar> PacketTraits;
    enum {
      packetSize = PacketTraits::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
+      dstAlignment = PacketTraits::AlignedOnScalar ? int(Kernel::AssignmentTraits::RequiredAlignment)
+                                                   : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
@ -475,9 +479,10 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
    typedef packet_traits<Scalar> PacketTraits;
    enum {
      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstIsAligned = Kernel::AssignmentTraits::DstIsAligned,
-      dstAlignment = alignable ? Aligned : int(dstIsAligned)
+      alignable = PacketTraits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(Kernel::AssignmentTraits::RequiredAlignment),
+      dstAlignment = alignable ? int(Kernel::AssignmentTraits::RequiredAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@ -81,14 +81,16 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                             ? int(outer_stride_at_compile_time<XprType>::ret)
                             : int(inner_stride_at_compile_time<XprType>::ret),
-    // IsAligned is needed by MapBase's assertions
-    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
-    IsAligned = 0,
+
    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
    // FIXME DirectAccessBit should not be handled by expressions
+    // 
+    // Alignment is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
  };
 };

--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@ -111,6 +111,10 @@ struct evaluator_base
  typedef typename traits<ExpressionType>::StorageIndex StorageIndex;
  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
  typedef traits<ExpressionType> ExpressionTraits;
+  
+  enum {
+    Alignment = 0
+  };
 };

 // -------------------- Matrix and Array --------------------
@ -137,8 +141,8 @@ struct evaluator<PlainObjectBase<Derived> >
    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
    
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    Flags = compute_matrix_evaluator_flags< Scalar,Derived::RowsAtCompileTime,Derived::ColsAtCompileTime,
-                                            Derived::Options,Derived::MaxRowsAtCompileTime,Derived::MaxColsAtCompileTime>::ret
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
  };
  
  EIGEN_DEVICE_FUNC evaluator()
@ -255,7 +259,8 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
  
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
-    Flags = evaluator<ArgType>::Flags ^ RowMajorBit
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
@ -331,7 +336,8 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
          &  (  HereditaryBits
              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
-          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit) // FIXME EvalBeforeNestingBit should be needed anymore
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit), // FIXME EvalBeforeNestingBit should be needed anymore
+    Alignment = 0 // FIXME alignment should not matter here, perhaps we could set it to AlignMax??
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
@ -378,9 +384,9 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
    
-    Flags = evaluator<ArgType>::Flags & (
-              HereditaryBits | LinearAccessBit | AlignedBit
-            | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
@ -447,13 +453,13 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
        HereditaryBits
      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
           )
        )
     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
  };

  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
@ -506,7 +512,9 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
    
-    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit))
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+    
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
@ -641,7 +649,6 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
    HasNoInnerStride = InnerStrideAtCompileTime == 1,
    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) && ((int(MapOptions)&Aligned)==Aligned),
    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
    
    // TODO: should check for smaller packet types once we can handle multi-sized packet types
@ -653,10 +660,13 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
                           || ( OuterStrideAtCompileTime!=Dynamic
                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ),
    Flags0 = evaluator<PlainObjectType>::Flags,
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
+    //Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
+    Flags1 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags0) : int(Flags0 & ~LinearAccessBit),
+    Flags = KeepsPacketAccess ? int(Flags1) : (int(Flags1) & ~PacketAccessBit),
+    
+    //IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) && ((int(MapOptions)&int(AlignedMask))>0),
+    Alignment = int(MapOptions)&int(AlignedMask)
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
@ -673,7 +683,8 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
  
  enum {
-    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
@ -717,17 +728,17 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
                       && (InnerStrideAtCompileTime == 1)
                        ? PacketAccessBit : 0,
    
-    // TODO: should check for smaller packet types once we can handle multi-sized packet types
-    AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
-    
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0,
    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
                                           DirectAccessBit |
-                                           MaskPacketAccessBit |
-                                           MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+    
+    // TODO: should check for smaller packet types once we can handle multi-sized packet types
+    AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignBytes : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
  };
  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
@ -833,11 +844,8 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
  {
-    // TODO: should check for smaller packet types once we can handle multi-sized packet types
-    const int AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar);
-    EIGEN_ONLY_USED_FOR_DEBUG(AlignBytes)
    // FIXME this should be an internal assertion
-    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned");
+    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
  }
 };

@ -856,7 +864,9 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
                  + EIGEN_SIZE_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
                                   evaluator<ElseMatrixType>::CoeffReadCost),

-    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
+    
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
  };

  inline EIGEN_DEVICE_FUNC  explicit evaluator(const XprType& select)
@ -908,7 +918,9 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
  enum {
    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
    
-    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit)
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+    
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
@ -992,7 +1004,9 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
    
-    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits)
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits),
+    
+    Alignment = 0 // FIXME this could be improved
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType expr)
@ -1028,7 +1042,8 @@ struct evaluator_wrapper_base
  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    Flags = evaluator<ArgType>::Flags
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
@ -1144,7 +1159,9 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
                 ? LinearAccessBit : 0,

-    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess)
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+    
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
  };
  typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;

@ -1226,7 +1243,9 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
    
-    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit
+    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit,
+    
+    Alignment = 0
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@ -602,11 +602,11 @@ struct first_aligned_impl<Derived, false>
  * documentation.
  */
 template<typename Derived>
-static inline Index first_aligned(const Derived& m)
+static inline Index first_aligned(const DenseBase<Derived>& m)
 {
  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+          <Derived, (evaluator<Derived>::Alignment > 0 ) || !(Derived::Flags & DirectAccessBit)> // FIXME Alignment!
+          ::run(m.derived());
 }

 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -450,22 +450,22 @@ pmadd(const Packet&  a,
 { return padd(pmul(a, b),c); }

 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::size*sizeof(typename unpacket_traits<Packet>::type))
    return pload<Packet>(from);
  else
    return ploadu<Packet>(from);
 }

 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::size*sizeof(typename unpacket_traits<Packet>::type))
    pstore(to, from);
  else
    pstoreu(to, from);
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@ -19,7 +19,7 @@ namespace Eigen {
  * \brief A matrix or vector expression mapping an existing array of data.
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@ -77,7 +77,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
                             ? int(PlainObjectType::OuterStrideAtCompileTime)
                             : int(StrideType::OuterStrideAtCompileTime),
-    IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) && ((int(MapOptions)&Aligned)==Aligned),
+    Alignment = int(MapOptions)&int(AlignedMask),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
  };
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@ -160,9 +160,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    EIGEN_DEVICE_FUNC
    void checkSanity() const
    {
-      // TODO "IsAligned" should be replaced to handle arbitrary alignment
 #if EIGEN_MAX_ALIGN_BYTES>0
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_MAX_ALIGN_BYTES) == 0) && "data is not aligned");
+      eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits<Derived>::Alignment)) == 0) && "data is not aligned");
 #endif
    }

--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@ -139,6 +139,18 @@ namespace internal {
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
 {
+private:
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = packet_traits<_Scalar>::size * sizeof(_Scalar), // FIXME ask packet_traits for the true required alignment
+      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
+    };
+    
+public:
  typedef _Scalar Scalar;
  typedef Dense StorageKind;
  typedef Eigen::Index StorageIndex;
@ -149,11 +161,13 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
    MaxRowsAtCompileTime = _MaxRows,
    MaxColsAtCompileTime = _MaxCols,
    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
-    EvaluatorFlags = compute_matrix_evaluator_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
    Options = _Options,
    InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+    
+    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
  };
 };
 }
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@ -116,20 +116,20 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    typedef Eigen::Map<Derived, Unaligned>  MapType;
    friend  class Eigen::Map<const Derived, Unaligned>;
    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+    friend  class Eigen::Map<Derived, AlignedMax>;
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    friend  class Eigen::Map<const Derived, AlignedMax>;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
    template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };

  protected:
    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;

  public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::EvaluatorFlags & AlignedBit) != 0 };
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)

    EIGEN_DEVICE_FUNC
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@ -430,24 +430,22 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    LhsFlags = LhsEtorType::Flags,
    RhsFlags = RhsEtorType::Flags,
    
+    LhsAlignment = LhsEtorType::Alignment,
+    RhsAlignment = RhsEtorType::Alignment,
+    
+    LhsIsAligned = int(LhsAlignment) >= int(sizeof(Scalar)*PacketSize), // FIXME compare to required alignment
+    RhsIsAligned = int(RhsAlignment) >= int(sizeof(Scalar)*PacketSize),
+    
    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,
      
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic
-                        || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (RhsFlags&AlignedBit)
-                            )
-                        ),
+                    && (ColsAtCompileTime == Dynamic || ( (ColsAtCompileTime % PacketSize) == 0 && RhsIsAligned ) ),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic
-                        || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (LhsFlags&AlignedBit)
-                            )
-                        ),
+                    && (RowsAtCompileTime == Dynamic || ( (RowsAtCompileTime % PacketSize) == 0 && LhsIsAligned ) ),

    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@ -455,11 +453,13 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
          | (EvalToRowMajor ? RowMajorBit : 0)
-          | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
-          | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
          // TODO enable vectorization for mixed types
          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
          
+    Alignment = CanVectorizeLhs ? LhsAlignment
+              : CanVectorizeRhs ? RhsAlignment
+              : 0,
+          
    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
@ -469,7 +469,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                        && LhsRowMajor
                        && (!RhsRowMajor)
                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (LhsFlags & RhsFlags & AlignedBit)
+                        && (LhsIsAligned && RhsIsAligned)
                        && (InnerSize % packet_traits<Scalar>::size == 0)
  };
  
@ -706,7 +706,8 @@ public:
    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
-    Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0)
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment
  };
  
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@ -732,7 +733,7 @@ protected:
  {
    enum {
      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-      DiagonalPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagFlags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
    };
    return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
                          m_diagImpl.template packet<DiagonalPacketLoadMode>(id));
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@ -165,7 +165,7 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
    index = Start * packet_traits<typename Derived::Scalar>::size,
    outer = index / int(Derived::InnerSizeAtCompileTime),
    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+    alignment = Derived::Alignment
  };

  typedef typename Derived::Scalar Scalar;
@ -222,10 +222,10 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    const Index size = mat.size();
    
    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
+    const Index alignedStart = internal::first_aligned(mat.nestedExpression());
    enum {
-      alignment = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(sizeof(Scalar)*packetSize) : int(Unaligned), // FIXME take into account alignment requirement
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
    };
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
@ -352,7 +352,8 @@ public:
    IsRowMajor = XprType::IsRowMajor,
    SizeAtCompileTime = XprType::SizeAtCompileTime,
    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost
+    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
+    Alignment = evaluator<XprType>::Alignment
  };
  
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
@ -385,6 +386,8 @@ public:
  PacketReturnType packetByOuterInner(Index outer, Index inner) const
  { return m_evaluator.template packet<LoadMode>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  
+  const XprType & nestedExpression() const { return m_xpr; }
+  
 protected:
  typename internal::evaluator<XprType>::nestedType m_evaluator;
  const XprType &m_xpr;
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@ -18,7 +18,7 @@ namespace Eigen {
  * \brief A matrix or vector expression mapping an existing expression
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
  *                   but accepts a variable outer stride (leading dimension).
@ -92,7 +92,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
  typedef _StrideType StrideType;
  enum {
    Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
  };

  template<typename Derived> struct match {
@ -104,7 +105,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
      OuterStrideMatch = Derived::IsVectorAtCompileTime
                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
    };
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@ -162,21 +162,27 @@ MatrixBase<Derived>::stableNorm() const
  RealScalar scale(0);
  RealScalar invScale(1);
  RealScalar ssq(0); // sum of square
+  
+  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
+  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
+  DerivedCopy copy(derived());
+  
  enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
+    CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
  };
-  typedef typename internal::conditional<Alignment, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, Aligned>,
-                                                    typename Base::ConstSegmentReturnType>::type SegmentWrapper;
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
+                                                   typename DerivedCopyClean
+                                                   ::ConstSegmentReturnType>::type SegmentWrapper;
  Index n = size();
  
  if(n==1)
    return abs(this->coeff(0));
  
-  Index bi = internal::first_aligned(derived());
+  Index bi = internal::first_aligned(copy);
  if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(SegmentWrapper(this->segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
  return scale * sqrt(ssq);
 }

--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@ -233,7 +233,7 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
    typedef typename MatrixType::Scalar Scalar;
    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
    const Index PacketSize = internal::packet_traits<Scalar>::size;
-    const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
    PacketBlock<Packet> A;
    for (Index i=0; i<PacketSize; ++i)
      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@ -140,7 +140,7 @@ const unsigned int LvalueBit = 0x20;
  */
 const unsigned int DirectAccessBit = 0x40;

-/** \ingroup flags
+/* \ingroup flags
  *
  * means the first coefficient packet is guaranteed to be aligned.
  * An expression cannot has the AlignedBit without the PacketAccessBit flag.
@ -215,12 +215,31 @@ enum {
 };

 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
+  * Enum for indicating whether a buffer is aligned or not. */
 enum { 
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0, 
-  /** Object is aligned for vectorization. */
-  Aligned=1 
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };

 /** \ingroup enums
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@ -192,43 +192,6 @@ class compute_matrix_flags
    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
 };

-template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-class compute_matrix_evaluator_flags
-{
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-      
-      // TODO: should check for smaller packet types once we can handle multi-sized packet types
-      align_bytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0))
-#else
-             0
-#endif
-
-          ||
-
-#if EIGEN_MAX_ALIGN_BYTES!=0
-             is_dynamic_size_storage
-#else
-             0
-#endif
-
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
-
-  public:
-    enum { ret = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit };
-};
-
 template<int _Rows, int _Cols> struct size_at_compile_time
 {
  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@ -217,8 +217,8 @@ struct traits<Quaternion<_Scalar,_Options> >
  typedef _Scalar Scalar;
  typedef Matrix<_Scalar,4,1,_Options> Coefficients;
  enum{
-    IsAligned = (internal::traits<Coefficients>::EvaluatorFlags & AlignedBit) != 0,
-    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
+    Alignment = internal::traits<Coefficients>::Alignment,
+    Flags = LvalueBit
  };
 };
 }
@ -228,7 +228,7 @@ class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
 {
 public:
  typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
-  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+  enum { NeedsAlignment = internal::traits<Quaternion>::Alignment>0 };

  typedef _Scalar Scalar;

@ -277,7 +277,7 @@ public:
  inline Coefficients& coeffs() { return m_coeffs;}
  inline const Coefficients& coeffs() const { return m_coeffs;}

-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsAlignment)

 protected:
  Coefficients m_coeffs;
@ -441,7 +441,7 @@ QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) c
   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
  return internal::quat_product<Architecture::Target, Derived, OtherDerived,
                         typename internal::traits<Derived>::Scalar,
-                         (internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned)?Aligned:Unaligned>::run(*this, other);
+                         EIGEN_PLAIN_ENUM_MIN(internal::traits<Derived>::Alignment, internal::traits<OtherDerived>::Alignment)>::run(*this, other);
 }

 /** \sa operator*(Quaternion) */
@ -668,7 +668,7 @@ QuaternionBase<Derived>::conjugate() const
 {
  return internal::quat_conj<Architecture::Target, Derived,
                         typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned?Aligned:Unaligned>::run(*this);
+                         internal::traits<Derived>::Alignment>::run(*this);
                         
 }

--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@ -16,14 +16,14 @@ namespace Eigen {
 namespace internal {

 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
 {
  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
  {
    Quaternion<float> res;
    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
-    __m128 a = _a.coeffs().template packet<Aligned>(0);
-    __m128 b = _b.coeffs().template packet<Aligned>(0);
+    __m128 a = _a.coeffs().template packet<Aligned16>(0);
+    __m128 b = _b.coeffs().template packet<Aligned16>(0);
    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
    pstore(&res.x(),
@ -55,8 +55,8 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
  static inline typename plain_matrix_type<VectorLhs>::type
  run(const VectorLhs& lhs, const VectorRhs& rhs)
  {
-    __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
-    __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 a = lhs.template packet<traits<VectorLhs>::Alignment>(0);
+    __m128 b = rhs.template packet<traits<VectorRhs>::Alignment>(0);
    __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
    __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
    typename plain_matrix_type<VectorLhs>::type res;
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@ -263,7 +263,7 @@ namespace internal {
  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
  */
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
+void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }

 /** \jacobi_module
@ -298,18 +298,18 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR

 namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
 {
  typedef typename VectorX::Scalar Scalar;
  enum { PacketSize = packet_traits<Scalar>::size };
  typedef typename packet_traits<Scalar>::type Packet;
-  eigen_assert(_x.size() == _y.size());
-  Index size = _x.size();
-  Index incrx = _x.innerStride();
-  Index incry = _y.innerStride();
+  eigen_assert(xpr_x.size() == xpr_y.size());
+  Index size = xpr_x.size();
+  Index incrx = xpr_x.derived().innerStride();
+  Index incry = xpr_y.derived().innerStride();

-  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
+  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
  
  OtherScalar c = j.c();
  OtherScalar s = j.s();
@ -392,7 +392,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
  /*** fixed-size vectorized path ***/
  else if(VectorX::SizeAtCompileTime != Dynamic &&
          (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (VectorX::Flags & VectorY::Flags & AlignedBit))
+          (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
  {
    const Packet pc = pset1<Packet>(c);
    const Packet ps = pset1<Packet>(s);
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@ -35,8 +35,8 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
 {
  enum {
-    MatrixAlignment     = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment     = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
  };
  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
@ -165,8 +165,8 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
 {
  enum {
-    MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
  };
  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
--- a/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h
@ -41,7 +41,7 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, Diagonal
  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
  typedef evaluator<XprType> type;
  typedef evaluator<XprType> nestedType;
-  enum { CoeffReadCost = Dynamic, Flags = Rhs::Flags&RowMajorBit }; // FIXME CoeffReadCost & Flags
+  enum { CoeffReadCost = Dynamic, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
  
  typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct> Base;
  explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
@ -54,14 +54,14 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseSh
  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
  typedef evaluator<XprType> type;
  typedef evaluator<XprType> nestedType;
-  enum { CoeffReadCost = Dynamic, Flags = Lhs::Flags&RowMajorBit }; // FIXME CoeffReadCost & Flags
+  enum { CoeffReadCost = Dynamic, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
  
  typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> Base;
  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {}
 };

 template<typename SparseXprType, typename DiagonalCoeffType>
-struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct> 
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct>
 {
 protected:
  typedef typename evaluator<SparseXprType>::InnerIterator SparseXprInnerIterator;
@ -92,7 +92,7 @@ protected:


 template<typename SparseXprType, typename DiagCoeffType>
-struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct> 
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct>
 {
  typedef typename SparseXprType::Scalar Scalar;
  
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@ -25,15 +25,15 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
  Scalar* array1 = internal::aligned_new<Scalar>(size);
  Scalar* array2 = internal::aligned_new<Scalar>(size);
  Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = (std::size_t(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
  Scalar  array4[EIGEN_TESTMAP_MAX_SIZE];

-  Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
-  Map<VectorType, Aligned>(array2, size) = Map<VectorType,Aligned>(array1, size);
+  Map<VectorType, AlignedMax>(array1, size) = VectorType::Random(size);
+  Map<VectorType, AlignedMax>(array2, size) = Map<VectorType,AlignedMax>(array1, size);
  Map<VectorType>(array3unaligned, size) = Map<VectorType>(array1, size);
-  Map<VectorType>(array4, size)          = Map<VectorType,Aligned>(array1, size);
-  VectorType ma1 = Map<VectorType, Aligned>(array1, size);
-  VectorType ma2 = Map<VectorType, Aligned>(array2, size);
+  Map<VectorType>(array4, size)          = Map<VectorType,AlignedMax>(array1, size);
+  VectorType ma1 = Map<VectorType, AlignedMax>(array1, size);
+  VectorType ma2 = Map<VectorType, AlignedMax>(array2, size);
  VectorType ma3 = Map<VectorType>(array3unaligned, size);
  VectorType ma4 = Map<VectorType>(array4, size);
  VERIFY_IS_EQUAL(ma1, ma2);
@ -41,7 +41,7 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
  VERIFY_IS_EQUAL(ma1, ma4);
  #ifdef EIGEN_VECTORIZE
  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((Map<VectorType,Aligned>(array3unaligned, size)))
+    VERIFY_RAISES_ASSERT((Map<VectorType,AlignedMax>(array3unaligned, size)))
  #endif

  internal::aligned_delete(array1, size);
@ -71,7 +71,7 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
    for(int i = 0; i < size; i++) array4[i] = Scalar(1);
  
  Map<MatrixType> map1(array1, rows, cols);
-  Map<MatrixType, Aligned> map2(array2, rows, cols);
+  Map<MatrixType, AlignedMax> map2(array2, rows, cols);
  Map<MatrixType> map3(array3unaligned, rows, cols);
  Map<MatrixType> map4(array4, rows, cols);
  
@ -154,9 +154,9 @@ template<typename PlainObjectType> void check_const_correctness(const PlainObjec
  // verify that map-to-const don't have LvalueBit
  typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType;
  VERIFY( !(internal::traits<Map<ConstPlainObjectType> >::Flags & LvalueBit) );
-  VERIFY( !(internal::traits<Map<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) );
+  VERIFY( !(internal::traits<Map<ConstPlainObjectType, AlignedMax> >::Flags & LvalueBit) );
  VERIFY( !(Map<ConstPlainObjectType>::Flags & LvalueBit) );
-  VERIFY( !(Map<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
+  VERIFY( !(Map<ConstPlainObjectType, AlignedMax>::Flags & LvalueBit) );
 }

 template<typename Scalar>
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@ -162,12 +162,12 @@ void unalignedassert()
  }
  for(int b=8; b<EIGEN_MAX_ALIGN_BYTES; b+=8)
  {
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(b));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(b));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(b));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(b));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(b));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cd>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(b));
+    if(b<64)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(b));
+    if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(b));
+    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cd>(b));
  }
 #endif
 }
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@ -35,7 +35,6 @@ std::string demangle_flags(int f)
  if(f&LinearAccessBit)             res += " | Linear";
  if(f&LvalueBit)                   res += " | Lvalue";
  if(f&DirectAccessBit)             res += " | Direct";
-  if(f&AlignedBit)                  res += " | Aligned";
  if(f&NestByRefBit)                res += " | NestByRef";
  if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
  
@ -204,12 +203,12 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
      LinearVectorizedTraversal,CompleteUnrolling));

    VERIFY((test_assign<
-            Map<Matrix22, Aligned, OuterStride<3*PacketSize> >,
+            Map<Matrix22, AlignedMax, OuterStride<3*PacketSize> >,
            Matrix22
            >(InnerVectorizedTraversal,CompleteUnrolling)));

    VERIFY((test_assign<
-            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, Aligned, InnerStride<3*PacketSize> >,
+            Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
            Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
            >(DefaultTraversal,CompleteUnrolling)));