Update to latest default branch

Ville Kallioniemi 2016-01-21 23:08:54 -07:00
commit 9b6c72958a
122 changed files with 1684 additions and 1127 deletions

View File

@ -12,7 +12,6 @@
#include "src/Core/util/DisableStupidWarnings.h"
#include <complex.h>
extern "C" {
#include <pastix_nompi.h>
#include <pastix.h>

View File

@ -28,8 +28,8 @@ namespace internal {
*
* \brief Robust Cholesky decomposition of a matrix with pivoting
*
* \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
* \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* The other triangular part won't be read.
*
* Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite

View File

@ -22,8 +22,8 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
*
* \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
@ -436,10 +436,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* \param bAndX represents both the right-hand side matrix b and result x.
*
* \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
*
* This version avoids a copy when the right hand side matrix b is not
* needed anymore.
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
* \sa LLT::solve(), MatrixBase::llt()
*/
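For illustration, a minimal sketch of the in-place solve this comment describes (assumes the public Eigen 3 LLT API and <Eigen/Dense>; not part of the diff):

#include <Eigen/Dense>
void demo_llt_solve_in_place() {
  Eigen::Matrix2d A;
  A << 4.0, 1.0,
       1.0, 3.0;                       // symmetric positive definite
  Eigen::Vector2d b(1.0, 2.0);
  Eigen::LLT<Eigen::Matrix2d> llt(A);
  Eigen::Vector2d x = llt.solve(b);    // out-of-place: b is preserved
  llt.solveInPlace(b);                 // b now holds the solution, avoiding the copy
}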

View File

@ -12,7 +12,16 @@
namespace Eigen {
/** \class Array
namespace internal {
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
{
typedef ArrayXpr XprKind;
typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
};
}
/** \class Array
* \ingroup Core_Module
*
* \brief General-purpose arrays with easy API for coefficient-wise operations
@ -26,21 +35,12 @@ namespace Eigen {
*
* See documentation of class Matrix for detailed information on the template parameters
* storage layout.
*
*
* This class can be extended with the help of the plugin mechanism described on the page
* \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
*
* \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
* \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
*/
namespace internal {
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
{
typedef ArrayXpr XprKind;
typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
};
}
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
class Array
: public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >

View File

@ -682,9 +682,9 @@ template< typename DstXprType, typename SrcXprType, typename Functor,
struct Assignment;
// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition.
// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated.
// So this intermediate function removes everything related to AssumeAliasing such that Assignment
// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated.
// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
// does not has to bother about these annoying details.
template<typename Dst, typename Src>
@ -698,21 +698,21 @@ EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
}
// Deal with AssumeAliasing
// Deal with "assume-aliasing"
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
typename plain_matrix_type<Src>::type tmp(src);
call_assignment_no_alias(dst, tmp, func);
}
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
call_assignment_no_alias(dst, src, func);
}
// by-pass AssumeAliasing
// by-pass "assume-aliasing"
// When there is no aliasing, we require that 'dst' has been properly resized
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
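A rough sketch of what "assume-aliasing" means from the user side (assumes the usual dense API; not part of this change):

#include <Eigen/Dense>
void demo_assume_aliasing() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd C(3, 3);
  C = A * B;            // products "assume aliasing": the result goes through a temporary first
  C.noalias() = A * B;  // the NoAlias overload above bypasses that temporary
}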

View File

@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase<Derived>
*
* \brief Represents a rectangular matrix with a banded storage
*
* \param _Scalar Numeric type, i.e. float, double, int
* \param Rows Number of rows, or \b Dynamic
* \param Cols Number of columns, or \b Dynamic
* \param Supers Number of super diagonal
* \param Subs Number of sub diagonal
* \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
* The former controls \ref TopicStorageOrders "storage order", and defaults to
* column-major. The latter controls whether the matrix represents a selfadjoint
* matrix in which case either Supers of Subs have to be null.
* \tparam _Scalar Numeric type, i.e. float, double, int
* \tparam _Rows Number of rows, or \b Dynamic
* \tparam _Cols Number of columns, or \b Dynamic
* \tparam _Supers Number of super diagonal
* \tparam _Subs Number of sub diagonal
* \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
* The former controls \ref TopicStorageOrders "storage order", and defaults to
* column-major. The latter controls whether the matrix represents a selfadjoint
* matrix in which case either Supers of Subs have to be null.
*
* \sa class TridiagonalMatrix
*/
@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
*
* \brief Represents a tridiagonal matrix with a compact banded storage
*
* \param _Scalar Numeric type, i.e. float, double, int
* \param Size Number of rows and cols, or \b Dynamic
* \param _Options Can be 0 or \b SelfAdjoint
* \tparam Scalar Numeric type, i.e. float, double, int
* \tparam Size Number of rows and cols, or \b Dynamic
* \tparam Options Can be 0 or \b SelfAdjoint
*
* \sa class BandMatrix
*/

View File

@ -13,41 +13,6 @@
namespace Eigen {
/** \class Block
* \ingroup Core_Module
*
* \brief Expression of a fixed-size or dynamic-size block
*
* \param XprType the type of the expression in which we are taking a block
* \param BlockRows the number of rows of the block we are taking at compile time (optional)
* \param BlockCols the number of columns of the block we are taking at compile time (optional)
* \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
* to set of columns of a column major matrix (optional). The parameter allows to determine
* at compile time whether aligned access is possible on the block expression.
*
* This class represents an expression of either a fixed-size or dynamic-size block. It is the return
* type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
* most of the time this is the only way it is used.
*
* However, if you want to directly maniputate block expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
* Here is an example illustrating the dynamic case:
* \include class_Block.cpp
* Output: \verbinclude class_Block.out
*
* \note Even though this expression has dynamic size, in the case where \a XprType
* has fixed size, this expression inherits a fixed maximal size which means that evaluating
* it does not cause a dynamic memory allocation.
*
* Here is an example illustrating the fixed-size case:
* \include class_FixedBlock.cpp
* Output: \verbinclude class_FixedBlock.out
*
* \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
*/
namespace internal {
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
@ -101,6 +66,40 @@ template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool In
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
/** \class Block
* \ingroup Core_Module
*
* \brief Expression of a fixed-size or dynamic-size block
*
* \tparam XprType the type of the expression in which we are taking a block
* \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
* \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
* \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
* to set of columns of a column major matrix (optional). The parameter allows to determine
* at compile time whether aligned access is possible on the block expression.
*
* This class represents an expression of either a fixed-size or dynamic-size block. It is the return
* type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
* most of the time this is the only way it is used.
*
* However, if you want to directly maniputate block expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
* Here is an example illustrating the dynamic case:
* \include class_Block.cpp
* Output: \verbinclude class_Block.out
*
* \note Even though this expression has dynamic size, in the case where \a XprType
* has fixed size, this expression inherits a fixed maximal size which means that evaluating
* it does not cause a dynamic memory allocation.
*
* Here is an example illustrating the fixed-size case:
* \include class_FixedBlock.cpp
* Output: \verbinclude class_FixedBlock.out
*
* \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
*/
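A small usage sketch of the two cases described above (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_block() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(5, 5);
  Eigen::MatrixXd dyn = m.block(1, 2, 2, 3);   // dynamic-size 2x3 block starting at (1,2)
  Eigen::Matrix2d fix = m.block<2, 2>(0, 0);   // fixed-size block, sizes as template arguments
}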
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
: public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
{

View File

@ -63,10 +63,6 @@ struct evaluator_traits_base
// by default, get evaluator kind and shape from storage
typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
// 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
// temporary; 0 if not.
static const int AssumeAliasing = 0;
};
// Default evaluator traits
@ -75,6 +71,10 @@ struct evaluator_traits : public evaluator_traits_base<T>
{
};
template<typename T, typename Shape = typename evaluator_traits<T>::Shape >
struct evaluator_assume_aliasing {
static const bool value = false;
};
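Under the new scheme an expression type opts in by partially specializing this trait instead of overriding evaluator_traits; a hypothetical sketch (MyXpr is a placeholder, mirroring the real specializations added later in this commit):

namespace Eigen { namespace internal {
template<typename Lhs, typename Rhs> struct MyXpr;  // hypothetical expression type
template<typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<MyXpr<Lhs, Rhs> > {
  static const bool value = true;  // plain assignment from MyXpr goes through a temporary
};
}}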
// By default, we assume a unary expression:
template<typename T>

View File

@ -13,26 +13,6 @@
namespace Eigen {
/** \class CwiseBinaryOp
* \ingroup Core_Module
*
* \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
*
* \param BinaryOp template functor implementing the operator
* \param Lhs the type of the left-hand side
* \param Rhs the type of the right-hand side
*
* This class represents an expression where a coefficient-wise binary operator is applied to two expressions.
* It is the return type of binary operators, by which we mean only those binary operators where
* both the left-hand side and the right-hand side are Eigen expressions.
* For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
*
* Most of the time, this is the only way that it is used, so you typically don't have to name
* CwiseBinaryOp types explicitly.
*
* \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
*/
namespace internal {
template<typename BinaryOp, typename Lhs, typename Rhs>
struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
@ -74,6 +54,25 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
class CwiseBinaryOpImpl;
/** \class CwiseBinaryOp
* \ingroup Core_Module
*
* \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
*
* \tparam BinaryOp template functor implementing the operator
* \tparam LhsType the type of the left-hand side
* \tparam RhsType the type of the right-hand side
*
* This class represents an expression where a coefficient-wise binary operator is applied to two expressions.
* It is the return type of binary operators, by which we mean only those binary operators where
* both the left-hand side and the right-hand side are Eigen expressions.
* For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
*
* Most of the time, this is the only way that it is used, so you typically don't have to name
* CwiseBinaryOp types explicitly.
*
* \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
*/
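A brief usage sketch (assuming <Eigen/Dense>; not part of the diff):

#include <Eigen/Dense>
void demo_cwise_binary() {
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.0, 3.0);
  Eigen::ArrayXd b = Eigen::ArrayXd::Constant(4, 2.0);
  Eigen::ArrayXd sum = a + b;     // a + b is a CwiseBinaryOp, evaluated on assignment
  Eigen::ArrayXd mn  = a.min(b);  // another coefficient-wise binary expression
}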
template<typename BinaryOp, typename LhsType, typename RhsType>
class CwiseBinaryOp :
public CwiseBinaryOpImpl<

View File

@ -12,24 +12,6 @@
namespace Eigen {
/** \class CwiseNullaryOp
* \ingroup Core_Module
*
* \brief Generic expression of a matrix where all coefficients are defined by a functor
*
* \param NullaryOp template functor implementing the operator
* \param PlainObjectType the underlying plain matrix/array type
*
* This class represents an expression of a generic nullary operator.
* It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
* and most of the time this is the only way it is used.
*
* However, if you want to write a function returning such an expression, you
* will need to use this class.
*
* \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
*/
namespace internal {
template<typename NullaryOp, typename PlainObjectType>
struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
@ -40,6 +22,23 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
};
}
/** \class CwiseNullaryOp
* \ingroup Core_Module
*
* \brief Generic expression of a matrix where all coefficients are defined by a functor
*
* \tparam NullaryOp template functor implementing the operator
* \tparam PlainObjectType the underlying plain matrix/array type
*
* This class represents an expression of a generic nullary operator.
* It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
* and most of the time this is the only way it is used.
*
* However, if you want to write a function returning such an expression, you
* will need to use this class.
*
* \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
*/
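For illustration, the nullary expressions this documents (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_cwise_nullary() {
  Eigen::MatrixXd z = Eigen::MatrixXd::Zero(3, 3);           // all of these are CwiseNullaryOp
  Eigen::MatrixXd c = Eigen::MatrixXd::Constant(2, 4, 7.0);  // expressions; nothing is stored
  Eigen::Matrix3d i = Eigen::Matrix3d::Identity();           // until they are assigned
  Eigen::VectorXd r = Eigen::VectorXd::Random(5);
}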
template<typename NullaryOp, typename PlainObjectType>
class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
{

View File

@ -13,26 +13,6 @@
namespace Eigen {
/** \class CwiseUnaryOp
* \ingroup Core_Module
*
* \brief Generic expression where a coefficient-wise unary operator is applied to an expression
*
* \param UnaryOp template functor implementing the operator
* \param XprType the type of the expression to which we are applying the unary operator
*
* This class represents an expression where a unary operator is applied to an expression.
* It is the return type of all operations taking exactly 1 input expression, regardless of the
* presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
* is considered unary, because only the right-hand side is an expression, and its
* return type is a specialization of CwiseUnaryOp.
*
* Most of the time, this is the only way that it is used, so you typically don't have to name
* CwiseUnaryOp types explicitly.
*
* \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
*/
namespace internal {
template<typename UnaryOp, typename XprType>
struct traits<CwiseUnaryOp<UnaryOp, XprType> >
@ -52,6 +32,25 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
template<typename UnaryOp, typename XprType, typename StorageKind>
class CwiseUnaryOpImpl;
/** \class CwiseUnaryOp
* \ingroup Core_Module
*
* \brief Generic expression where a coefficient-wise unary operator is applied to an expression
*
* \tparam UnaryOp template functor implementing the operator
* \tparam XprType the type of the expression to which we are applying the unary operator
*
* This class represents an expression where a unary operator is applied to an expression.
* It is the return type of all operations taking exactly 1 input expression, regardless of the
* presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
* is considered unary, because only the right-hand side is an expression, and its
* return type is a specialization of CwiseUnaryOp.
*
* Most of the time, this is the only way that it is used, so you typically don't have to name
* CwiseUnaryOp types explicitly.
*
* \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
*/
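A short sketch of the unary case mentioned above (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_cwise_unary() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd s = 3.0 * m;          // 3.0*m is a CwiseUnaryOp (scalar multiple of m)
  Eigen::MatrixXd n = (-m).cwiseAbs();  // unary expressions compose before evaluation
}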
template<typename UnaryOp, typename XprType>
class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
{

View File

@ -12,20 +12,6 @@
namespace Eigen {
/** \class CwiseUnaryView
* \ingroup Core_Module
*
* \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
*
* \param ViewOp template functor implementing the view
* \param MatrixType the type of the matrix we are applying the unary operator
*
* This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
* It is the return type of real() and imag(), and most of the time this is the only way it is used.
*
* \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
*/
namespace internal {
template<typename ViewOp, typename MatrixType>
struct traits<CwiseUnaryView<ViewOp, MatrixType> >
@ -55,6 +41,19 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
template<typename ViewOp, typename MatrixType, typename StorageKind>
class CwiseUnaryViewImpl;
/** \class CwiseUnaryView
* \ingroup Core_Module
*
* \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
*
* \tparam ViewOp template functor implementing the view
* \tparam MatrixType the type of the matrix we are applying the unary operator
*
* This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
* It is the return type of real() and imag(), and most of the time this is the only way it is used.
*
* \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
*/
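A minimal sketch of the lvalue behaviour of real()/imag() (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_cwise_unary_view() {
  Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(2, 2);
  Eigen::MatrixXd re = m.real();            // read through the view
  m.imag() = Eigen::MatrixXd::Zero(2, 2);   // the view is an lvalue: writes back into m
}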
template<typename ViewOp, typename MatrixType>
class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
{

View File

@ -191,19 +191,31 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
y() const { return (*this)[1]; }
y() const
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
return (*this)[1];
}
/** equivalent to operator[](2). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
z() const { return (*this)[2]; }
z() const
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
return (*this)[2];
}
/** equivalent to operator[](3). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
w() const { return (*this)[3]; }
w() const
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
return (*this)[3];
}
/** \internal
* \returns the packet of coefficients starting at the given row and column. It is your responsibility
@ -424,19 +436,31 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
y() { return (*this)[1]; }
y()
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
return (*this)[1];
}
/** equivalent to operator[](2). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
z() { return (*this)[2]; }
z()
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
return (*this)[2];
}
/** equivalent to operator[](3). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
w() { return (*this)[3]; }
w()
{
EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
return (*this)[3];
}
};
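With the added static asserts, out-of-range x()/y()/z()/w() accesses are rejected at compile time; a sketch (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_xyzw_access() {
  Eigen::Vector3f v(1.f, 2.f, 3.f);
  float zc = v.z();    // OK: Vector3f has at least 3 coefficients
  v.y() = 5.f;         // write access is checked the same way
  // Eigen::Vector2f u; u.z();  // would now fail with OUT_OF_RANGE_ACCESS
}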
/** \brief Base class providing direct read-only coefficient access to matrices and arrays.

View File

@ -99,11 +99,13 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
EIGEN_USING_STD_MATH(sqrt)
return sqrt(squaredNorm());
return numext::sqrt(squaredNorm());
}
/** \returns an expression of the quotient of *this by its own norm.
/** \returns an expression of the quotient of \c *this by its own norm.
*
* \warning If the input vector is too small (i.e., this->norm()==0),
* then this function returns a copy of the input.
*
* \only_for_vectors
*
@ -115,19 +117,29 @@ MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;
_Nested n(derived());
return n / n.norm();
RealScalar z = n.squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
if(z>RealScalar(0))
return n / numext::sqrt(z);
else
return n;
}
/** Normalizes the vector, i.e. divides it by its own norm.
*
* \only_for_vectors
*
* \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
*
* \sa norm(), normalized()
*/
template<typename Derived>
inline void MatrixBase<Derived>::normalize()
{
*this /= norm();
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
if(z>RealScalar(0))
derived() /= numext::sqrt(z);
}
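A sketch of the new zero-norm behaviour described by the warnings above (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_normalize() {
  Eigen::Vector2d v(3.0, 4.0);
  Eigen::Vector2d u = v.normalized();      // (0.6, 0.8)
  v.normalize();                           // v itself becomes unit length
  Eigen::Vector2d zero = Eigen::Vector2d::Zero();
  Eigen::Vector2d w = zero.normalized();   // now returns the zero vector instead of NaNs
  zero.normalize();                        // and leaves *this unchanged
}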
//---------- implementation of other norms ----------

View File

@ -80,7 +80,7 @@ struct IOFormat
*
* \brief Pseudo expression providing matrix output with given format
*
* \param ExpressionType the type of the object on which IO stream operations are performed
* \tparam ExpressionType the type of the object on which IO stream operations are performed
*
* This class represents an expression with stream operators controlled by a given IOFormat.
* It is the return type of DenseBase::format()
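For reference, a minimal sketch of the format() usage this documents (assuming <iostream> and <Eigen/Dense>):

#include <iostream>
#include <Eigen/Dense>
void demo_ioformat() {
  Eigen::Matrix2d m;
  m << 1, 2,
       3, 4;
  Eigen::IOFormat fmt(4, 0, ", ", "\n", "[", "]");  // precision, flags, coeff/row separators, row delimiters
  std::cout << m.format(fmt) << std::endl;
}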

View File

@ -13,6 +13,28 @@
namespace Eigen {
namespace internal {
template<typename PlainObjectType, int MapOptions, typename StrideType>
struct traits<Map<PlainObjectType, MapOptions, StrideType> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> TraitsBase;
enum {
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
};
private:
enum { Options }; // Expressions don't have Options
};
}
/** \class Map
* \ingroup Core_Module
*
@ -63,29 +85,6 @@ namespace Eigen {
*
* \sa PlainObjectBase::Map(), \ref TopicStorageOrders
*/
namespace internal {
template<typename PlainObjectType, int MapOptions, typename StrideType>
struct traits<Map<PlainObjectType, MapOptions, StrideType> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> TraitsBase;
enum {
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
};
private:
enum { Options }; // Expressions don't have Options
};
}
template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
: public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
{

View File

@ -26,7 +26,7 @@ long double abs(long double x) { return (fabsl(x)); }
namespace internal {
/** \internal \struct global_math_functions_filtering_base
/** \internal \class global_math_functions_filtering_base
*
* What it does:
* Defines a typedef 'type' as follows:
@ -954,8 +954,8 @@ T (ceil)(const T& x)
return ceil(x);
}
// Log base 2 for 32 bits positive integers.
// Conveniently returns 0 for x==0.
/** Log base 2 for 32 bits positive integers.
* Conveniently returns 0 for x==0. */
inline int log2(int x)
{
eigen_assert(x>=0);
@ -969,6 +969,22 @@ inline int log2(int x)
return table[(v * 0x07C4ACDDU) >> 27];
}
/** \returns the square root of \a x.
*
* It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode,
* but slightly faster for float/double and some compilers (e.g., gcc), thanks to
* specializations when SSE is enabled.
*
* It's usage is justified in performance critical functions, like norm/normalize.
*/
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T sqrt(const T &x)
{
EIGEN_USING_STD_MATH(sqrt);
return sqrt(x);
}
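Callers can use it as a drop-in for std::sqrt; a sketch (assuming <Eigen/Dense> is included):

#include <Eigen/Dense>
void demo_numext_sqrt() {
  float  a = Eigen::numext::sqrt(2.0f);  // may hit the SSE specialization added later in this commit
  double b = Eigen::numext::sqrt(2.0);   // otherwise falls back to the std/ADL sqrt
}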
} // end namespace numext
namespace internal {

View File

@ -13,6 +13,45 @@
namespace Eigen {
namespace internal {
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
{
private:
enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
enum {
row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
default_alignment = compute_default_alignment<_Scalar,max_size>::value,
actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
required_alignment = unpacket_traits<PacketScalar>::alignment,
packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
};
public:
typedef _Scalar Scalar;
typedef Dense StorageKind;
typedef Eigen::Index StorageIndex;
typedef MatrixXpr XprKind;
enum {
RowsAtCompileTime = _Rows,
ColsAtCompileTime = _Cols,
MaxRowsAtCompileTime = _MaxRows,
MaxColsAtCompileTime = _MaxCols,
Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
Options = _Options,
InnerStrideAtCompileTime = 1,
OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
// FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
Alignment = actual_alignment
};
};
}
/** \class Matrix
* \ingroup Core_Module
*
@ -98,7 +137,7 @@ namespace Eigen {
* </dl>
*
* <i><b>ABI and storage layout</b></i>
*
*
* The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
* <table class="manual">
* <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
@ -130,50 +169,11 @@ namespace Eigen {
* </table>
* Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two
* smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
*
* \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
* \ref TopicStorageOrders
*
* \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
* \ref TopicStorageOrders
*/
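For illustration, some common spellings of the template parameters discussed above (a sketch, assuming <Eigen/Dense>):

#include <Eigen/Dense>
typedef Eigen::Matrix<float, 4, 4> Mat4f;                         // fixed 4x4, column-major by default
typedef Eigen::Matrix<double, Eigen::Dynamic, 1> VecXd;           // dynamic-size column vector
typedef Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic,
                      Eigen::RowMajor> RowMajorXi;                // dynamic, row-major storage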
namespace internal {
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
{
private:
enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
enum {
row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
default_alignment = compute_default_alignment<_Scalar,max_size>::value,
actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
required_alignment = unpacket_traits<PacketScalar>::alignment,
packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
};
public:
typedef _Scalar Scalar;
typedef Dense StorageKind;
typedef Eigen::Index StorageIndex;
typedef MatrixXpr XprKind;
enum {
RowsAtCompileTime = _Rows,
ColsAtCompileTime = _Cols,
MaxRowsAtCompileTime = _MaxRows,
MaxColsAtCompileTime = _MaxCols,
Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
Options = _Options,
InnerStrideAtCompileTime = 1,
OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
// FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
Alignment = actual_alignment
};
};
}
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
class Matrix
: public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >

View File

@ -13,25 +13,24 @@
namespace Eigen {
/** \class NestByValue
* \ingroup Core_Module
*
* \brief Expression which must be nested by value
*
* \param ExpressionType the type of the object of which we are requiring nesting-by-value
*
* This class is the return type of MatrixBase::nestByValue()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::nestByValue()
*/
namespace internal {
template<typename ExpressionType>
struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
{};
}
/** \class NestByValue
* \ingroup Core_Module
*
* \brief Expression which must be nested by value
*
* \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
*
* This class is the return type of MatrixBase::nestByValue()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::nestByValue()
*/
template<typename ExpressionType> class NestByValue
: public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
{

View File

@ -17,7 +17,7 @@ namespace Eigen {
*
* \brief Pseudo expression providing an operator = assuming no aliasing
*
* \param ExpressionType the type of the object on which to do the lazy assignment
* \tparam ExpressionType the type of the object on which to do the lazy assignment
*
* This class represents an expression with special assignment operators
* assuming no aliasing between the target expression and the source expression.

View File

@ -17,7 +17,7 @@ namespace Eigen {
*
* \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
*
* \param T the numeric type at hand
* \tparam T the numeric type at hand
*
* This class stores enums, typedefs and static methods giving information about a numeric type.
*
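A short sketch of the kind of information NumTraits exposes (assuming <Eigen/Dense> and <complex>):

#include <complex>
#include <Eigen/Dense>
void demo_numtraits() {
  double eps  = Eigen::NumTraits<double>::epsilon();          // machine epsilon
  double prec = Eigen::NumTraits<double>::dummy_precision();  // default comparison tolerance
  bool   cplx = Eigen::NumTraits<std::complex<float> >::IsComplex;
  typedef Eigen::NumTraits<std::complex<double> >::Real Real; // double
}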

View File

@ -13,12 +13,18 @@
namespace Eigen {
namespace internal {
enum PermPermProduct_t {PermPermProduct};
} // end namespace internal
/** \class PermutationBase
* \ingroup Core_Module
*
* \brief Base class for permutations
*
* \param Derived the derived class
* \tparam Derived the derived class
*
* This class is the base class for all expressions representing a permutation matrix,
* internally stored as a vector of integers.
@ -36,13 +42,6 @@ namespace Eigen {
*
* \sa class PermutationMatrix, class PermutationWrapper
*/
namespace internal {
enum PermPermProduct_t {PermPermProduct};
} // end namespace internal
template<typename Derived>
class PermutationBase : public EigenBase<Derived>
{
@ -280,20 +279,6 @@ class PermutationBase : public EigenBase<Derived>
};
/** \class PermutationMatrix
* \ingroup Core_Module
*
* \brief Permutation matrix
*
* \param SizeAtCompileTime the number of rows/cols, or Dynamic
* \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
* \param StorageIndex the integer type of the indices
*
* This class represents a permutation matrix, internally stored as a vector of integers.
*
* \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
*/
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
@ -306,6 +291,19 @@ struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _Storag
};
}
/** \class PermutationMatrix
* \ingroup Core_Module
*
* \brief Permutation matrix
*
* \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
* \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
* \tparam _StorageIndex the integer type of the indices
*
* This class represents a permutation matrix, internally stored as a vector of integers.
*
* \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
*/
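A small usage sketch of the permutation classes documented here (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_permutation() {
  Eigen::PermutationMatrix<3> P;
  P.setIdentity();
  P.applyTranspositionOnTheRight(0, 2);        // swap two indices
  Eigen::Matrix3d M = Eigen::Matrix3d::Random();
  Eigen::Matrix3d byRows = P * M;              // applied on the left: permutes rows
  Eigen::Matrix3d byCols = M * P;              // applied on the right: permutes columns
}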
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
{
@ -482,18 +480,6 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd
IndicesType m_indices;
};
/** \class PermutationWrapper
* \ingroup Core_Module
*
* \brief Class to view a vector of integers as a permutation matrix
*
* \param _IndicesType the type of the vector of integer (can be any compatible expression)
*
* This class allows to view any vector expression of integers as a permutation matrix.
*
* \sa class PermutationBase, class PermutationMatrix
*/
template<typename _IndicesType> class TranspositionsWrapper;
namespace internal {
template<typename _IndicesType>
@ -513,6 +499,17 @@ struct traits<PermutationWrapper<_IndicesType> >
};
}
/** \class PermutationWrapper
* \ingroup Core_Module
*
* \brief Class to view a vector of integers as a permutation matrix
*
* \tparam _IndicesType the type of the vector of integer (can be any compatible expression)
*
* This class allows to view any vector expression of integers as a permutation matrix.
*
* \sa class PermutationBase, class PermutationMatrix
*/
template<typename _IndicesType>
class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
{

View File

@ -14,22 +14,6 @@ namespace Eigen {
template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
/** \class Product
* \ingroup Core_Module
*
* \brief Expression of the product of two arbitrary matrices or vectors
*
* \param Lhs the type of the left-hand side expression
* \param Rhs the type of the right-hand side expression
*
* This class represents an expression of the product of two arbitrary matrices.
*
* The other template parameters are:
* \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct
*
*/
namespace internal {
// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
@ -102,7 +86,20 @@ struct traits<Product<Lhs, Rhs, Option> >
} // end namespace internal
/** \class Product
* \ingroup Core_Module
*
* \brief Expression of the product of two arbitrary matrices or vectors
*
* \tparam _Lhs the type of the left-hand side expression
* \tparam _Rhs the type of the right-hand side expression
*
* This class represents an expression of the product of two arbitrary matrices.
*
* The other template parameters are:
* \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct
*
*/
template<typename _Lhs, typename _Rhs, int Option>
class Product : public ProductImpl<_Lhs,_Rhs,Option,
typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,

View File

@ -38,10 +38,9 @@ struct evaluator<Product<Lhs, Rhs, Options> >
// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
// TODO we should apply that rule only if that's really helpful
template<typename Lhs, typename Rhs, typename Scalar>
struct evaluator_traits<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > >
: evaluator_traits_base<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > >
struct evaluator_assume_aliasing<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > >
{
enum { AssumeAliasing = 1 };
static const bool value = true;
};
template<typename Lhs, typename Rhs, typename Scalar>
struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > >
@ -81,17 +80,8 @@ template< typename Lhs, typename Rhs,
struct generic_product_impl;
template<typename Lhs, typename Rhs>
struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> >
: evaluator_traits_base<Product<Lhs, Rhs, DefaultProduct> >
{
enum { AssumeAliasing = 1 };
};
template<typename Lhs, typename Rhs>
struct evaluator_traits<Product<Lhs, Rhs, AliasFreeProduct> >
: evaluator_traits_base<Product<Lhs, Rhs, AliasFreeProduct> >
{
enum { AssumeAliasing = 0 };
struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
static const bool value = true;
};
// This is the default evaluator implementation for products:
@ -189,6 +179,13 @@ struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBi
//----------------------------------------
// Catch "Dense ?= xpr + Product<>" expression to save one temporary
// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
// TODO enable it for "Dense ?= xpr - Product<>" as well.
template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};
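In user code this corresponds to expressions of the form below, where the specialization can save one temporary for the product (a sketch, assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_xpr_plus_product() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd C = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd D(4, 4);
  D = C + A * B;   // "Dense = xpr + Product<>": the product need not be materialized separately
  D += A * B;      // plain product assignment, handled by the product evaluators as before
}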
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Scalar, typename Func1, typename Func2>
struct assignment_from_xpr_plus_product

View File

@ -12,76 +12,6 @@
namespace Eigen {
/** \class Ref
* \ingroup Core_Module
*
* \brief A matrix or vector expression mapping an existing expression
*
* \tparam PlainObjectType the equivalent matrix type of the mapped data
* \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
* The default is \c #Unaligned.
* \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
* but accepts a variable outer stride (leading dimension).
* This can be overridden by specifying strides.
* The type passed here must be a specialization of the Stride template, see examples below.
*
* This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
* A Ref<> object can represent either a const expression or a l-value:
* \code
* // in-out argument:
* void foo1(Ref<VectorXf> x);
*
* // read-only const argument:
* void foo2(const Ref<const VectorXf>& x);
* \endcode
*
* In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
* By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
* Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
* the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
* can be greater than the number of rows.
*
* In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
* Here are some examples:
* \code
* MatrixXf A;
* VectorXf a;
* foo1(a.head()); // OK
* foo1(A.col()); // OK
* foo1(A.row()); // Compilation error because here innerstride!=1
* foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
* foo2(A.row().transpose()); // The row is copied into a contiguous temporary
* foo2(2*a); // The expression is evaluated into a temporary
* foo2(A.col().segment(2,4)); // No temporary
* \endcode
*
* The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
* Here is an example accepting an innerstride!=1:
* \code
* // in-out argument:
* void foo3(Ref<VectorXf,0,InnerStride<> > x);
* foo3(A.row()); // OK
* \endcode
* The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
* expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
* template function, e.g.:
* \code
* // in the .h:
* void foo(const Ref<MatrixXf>& A);
* void foo(const Ref<MatrixXf,0,Stride<> >& A);
*
* // in the .cpp:
* template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
* ... // crazy code goes here
* }
* void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
* void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
* \endcode
*
*
* \sa PlainObjectBase::Map(), \ref TopicStorageOrders
*/
namespace internal {
template<typename _PlainObjectType, int _Options, typename _StrideType>
@ -182,7 +112,75 @@ protected:
StrideBase m_stride;
};
/** \class Ref
* \ingroup Core_Module
*
* \brief A matrix or vector expression mapping an existing expression
*
* \tparam PlainObjectType the equivalent matrix type of the mapped data
* \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
* The default is \c #Unaligned.
* \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
* but accepts a variable outer stride (leading dimension).
* This can be overridden by specifying strides.
* The type passed here must be a specialization of the Stride template, see examples below.
*
* This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
* A Ref<> object can represent either a const expression or a l-value:
* \code
* // in-out argument:
* void foo1(Ref<VectorXf> x);
*
* // read-only const argument:
* void foo2(const Ref<const VectorXf>& x);
* \endcode
*
* In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
* By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
* Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
* the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
* can be greater than the number of rows.
*
* In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
* Here are some examples:
* \code
* MatrixXf A;
* VectorXf a;
* foo1(a.head()); // OK
* foo1(A.col()); // OK
* foo1(A.row()); // Compilation error because here innerstride!=1
* foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
* foo2(A.row().transpose()); // The row is copied into a contiguous temporary
* foo2(2*a); // The expression is evaluated into a temporary
* foo2(A.col().segment(2,4)); // No temporary
* \endcode
*
* The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
* Here is an example accepting an innerstride!=1:
* \code
* // in-out argument:
* void foo3(Ref<VectorXf,0,InnerStride<> > x);
* foo3(A.row()); // OK
* \endcode
* The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
* expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
* template function, e.g.:
* \code
* // in the .h:
* void foo(const Ref<MatrixXf>& A);
* void foo(const Ref<MatrixXf,0,Stride<> >& A);
*
* // in the .cpp:
* template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
* ... // crazy code goes here
* }
* void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
* void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
* \endcode
*
*
* \sa PlainObjectBase::Map(), \ref TopicStorageOrders
*/
template<typename PlainObjectType, int Options, typename StrideType> class Ref
: public RefBase<Ref<PlainObjectType, Options, StrideType> >
{
@ -209,6 +207,7 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
#else
/** Implicit constructor from any dense expression */
template<typename Derived>
inline Ref(DenseBase<Derived>& expr)
#endif

View File

@ -12,21 +12,6 @@
namespace Eigen {
/**
* \class Replicate
* \ingroup Core_Module
*
* \brief Expression of the multiple replication of a matrix or vector
*
* \param MatrixType the type of the object we are replicating
*
* This class represents an expression of the multiple replication of a matrix or vector.
* It is the return type of DenseBase::replicate() and most of the time
* this is the only way it is used.
*
* \sa DenseBase::replicate()
*/
namespace internal {
template<typename MatrixType,int RowFactor,int ColFactor>
struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
@ -57,6 +42,22 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
};
}
/**
* \class Replicate
* \ingroup Core_Module
*
* \brief Expression of the multiple replication of a matrix or vector
*
* \tparam MatrixType the type of the object we are replicating
* \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
* \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
*
* This class represents an expression of the multiple replication of a matrix or vector.
* It is the return type of DenseBase::replicate() and most of the time
* this is the only way it is used.
*
* \sa DenseBase::replicate()
*/
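A brief replicate() sketch matching the factors described above (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_replicate() {
  Eigen::RowVector3i v(1, 2, 3);
  Eigen::MatrixXi tiled = v.replicate(2, 2);    // 2x6: v repeated 2x vertically, 2x horizontally
  Eigen::MatrixXi fixed = v.replicate<3, 1>();  // 3x3: compile-time replication factors
}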
template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
: public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
{

View File

@ -13,11 +13,6 @@
namespace Eigen {
/** \class ReturnByValue
* \ingroup Core_Module
*
*/
namespace internal {
template<typename Derived>
@ -48,6 +43,10 @@ struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
} // end namespace internal
/** \class ReturnByValue
* \ingroup Core_Module
*
*/
template<typename Derived> class ReturnByValue
: public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
{

View File

@ -14,20 +14,6 @@
namespace Eigen {
/** \class Reverse
* \ingroup Core_Module
*
* \brief Expression of the reverse of a vector or matrix
*
* \param MatrixType the type of the object of which we are taking the reverse
*
* This class represents an expression of the reverse of a vector.
* It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::reverse(), VectorwiseOp::reverse()
*/
namespace internal {
template<typename MatrixType, int Direction>
@ -60,6 +46,20 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>
} // end namespace internal
/** \class Reverse
* \ingroup Core_Module
*
* \brief Expression of the reverse of a vector or matrix
*
* \tparam MatrixType the type of the object of which we are taking the reverse
* \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
*
* This class represents an expression of the reverse of a vector.
* It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::reverse(), VectorwiseOp::reverse()
*/
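A short sketch of the reverse expressions mentioned above (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_reverse() {
  Eigen::Vector4i v(1, 2, 3, 4);
  Eigen::Vector4i r = v.reverse();            // 4, 3, 2, 1
  Eigen::MatrixXi m = Eigen::MatrixXi::Random(3, 4);
  Eigen::MatrixXi w = m.rowwise().reverse();  // reverses the coefficients within each row
}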
template<typename MatrixType, int Direction> class Reverse
: public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
{

View File

@ -203,8 +203,6 @@ struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
{
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef SelfAdjointShape Shape;
static const int AssumeAliasing = 0;
};
template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>

View File

@ -31,8 +31,8 @@ namespace Eigen {
* arguments to the constructor.
*
* Indeed, this class takes two template parameters:
* \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
* \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
* \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
* \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
*
* Here is an example:
* \include Map_general_stride.cpp

View File

@ -13,20 +13,6 @@
namespace Eigen {
/** \class Transpose
* \ingroup Core_Module
*
* \brief Expression of the transpose of a matrix
*
* \param MatrixType the type of the object of which we are taking the transpose
*
* This class represents an expression of the transpose of a matrix.
* It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::transpose(), MatrixBase::adjoint()
*/
namespace internal {
template<typename MatrixType>
struct traits<Transpose<MatrixType> > : public traits<MatrixType>
@ -50,6 +36,19 @@ struct traits<Transpose<MatrixType> > : public traits<MatrixType>
template<typename MatrixType, typename StorageKind> class TransposeImpl;
/** \class Transpose
* \ingroup Core_Module
*
* \brief Expression of the transpose of a matrix
*
* \tparam MatrixType the type of the object of which we are taking the transpose
*
* This class represents an expression of the transpose of a matrix.
* It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
* and most of the time this is the only way it is used.
*
* \sa MatrixBase::transpose(), MatrixBase::adjoint()
*/
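A minimal sketch of the transpose/adjoint expressions this documents (assuming <Eigen/Dense>):

#include <Eigen/Dense>
void demo_transpose() {
  Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(2, 3);
  Eigen::MatrixXcd t = m.transpose();   // 3x2 expression, evaluated on assignment
  Eigen::MatrixXcd a = m.adjoint();     // conjugate transpose
  Eigen::MatrixXcd s = m;
  s.transposeInPlace();                 // safe alternative to s = s.transpose()
}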
template<typename MatrixType> class Transpose
: public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
{

View File

@ -12,35 +12,6 @@
namespace Eigen {
/** \class Transpositions
* \ingroup Core_Module
*
* \brief Represents a sequence of transpositions (row/column interchange)
*
* \param SizeAtCompileTime the number of transpositions, or Dynamic
* \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
*
* This class represents a permutation transformation as a sequence of \em n transpositions
* \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
* Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
* the rows \c i and \c indices[i] of the matrix \c M.
* A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
*
* Compared to the class PermutationMatrix, such a sequence of transpositions is what is
* computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
*
* To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
* \code
* Transpositions tr;
* MatrixXf mat;
* mat = tr * mat;
* \endcode
* In this example, we detect that the matrix appears on both side, and so the transpositions
* are applied in-place without any temporary or extra copy.
*
* \sa class PermutationMatrix
*/
template<typename Derived>
class TranspositionsBase
{
@ -154,6 +125,35 @@ struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageInde
};
}
/** \class Transpositions
* \ingroup Core_Module
*
* \brief Represents a sequence of transpositions (row/column interchange)
*
* \tparam SizeAtCompileTime the number of transpositions, or Dynamic
* \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
*
* This class represents a permutation transformation as a sequence of \em n transpositions
* \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
* Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
* the rows \c i and \c indices[i] of the matrix \c M.
* A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
*
* Compared to the class PermutationMatrix, such a sequence of transpositions is what is
* computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
*
* To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
* \code
* Transpositions tr;
* MatrixXf mat;
* mat = tr * mat;
* \endcode
* In this example, we detect that the matrix appears on both side, and so the transpositions
* are applied in-place without any temporary or extra copy.
*
* \sa class PermutationMatrix
*/
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
{
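A minimal usage sketch of the in-place interchanges described above, assuming a dynamically sized Transpositions object (the chosen indices are illustrative):

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  Matrix3f mat = Matrix3f::Random();
  Transpositions<Dynamic> tr(3);
  tr.setIdentity();
  tr[0] = 2;                  // T_0 interchanges rows 0 and 2
  mat = tr * mat;             // applied on the left: rows swapped in place, no temporary
  Matrix3f c = mat * tr;      // applied on the right: column interchange
  return 0;
}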


@ -704,10 +704,6 @@ struct evaluator_traits<TriangularView<MatrixType,Mode> >
{
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
// 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
// temporary; 0 if not.
static const int AssumeAliasing = 0;
};
template<typename MatrixType, unsigned int Mode>


@ -13,13 +13,23 @@
namespace Eigen {
namespace internal {
template<typename VectorType, int Size>
struct traits<VectorBlock<VectorType, Size> >
: public traits<Block<VectorType,
traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
{
};
}
/** \class VectorBlock
* \ingroup Core_Module
*
* \brief Expression of a fixed-size or dynamic-size sub-vector
*
* \param VectorType the type of the object in which we are taking a sub-vector
* \param Size size of the sub-vector we are taking at compile time (optional)
* \tparam VectorType the type of the object in which we are taking a sub-vector
* \tparam Size size of the sub-vector we are taking at compile time (optional)
*
* This class represents an expression of either a fixed-size or dynamic-size sub-vector.
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
@ -43,17 +53,6 @@ namespace Eigen {
*
* \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
*/
namespace internal {
template<typename VectorType, int Size>
struct traits<VectorBlock<VectorType, Size> >
: public traits<Block<VectorType,
traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
{
};
}
template<typename VectorType, int Size> class VectorBlock
: public Block<VectorType,
internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,


@ -141,8 +141,8 @@ struct member_redux {
*
* \brief Pseudo expression providing partial reduction operations
*
* \param ExpressionType the type of the object on which to do partial reductions
* \param Direction indicates the direction of the redux (#Vertical or #Horizontal)
* \tparam ExpressionType the type of the object on which to do partial reductions
* \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
*
* This class represents a pseudo expression with partial reduction features.
* It is the return type of DenseBase::colwise() and DenseBase::rowwise()
@ -187,11 +187,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
protected:
/** \internal
* \returns the i-th subvector according to the \c Direction */
typedef typename internal::conditional<isVertical,
typename ExpressionType::ColXpr,
typename ExpressionType::RowXpr>::type SubVector;
/** \internal
* \returns the i-th subvector according to the \c Direction */
EIGEN_DEVICE_FUNC
SubVector subVector(Index i)
{
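A minimal sketch of the partial reductions this pseudo expression provides (expected results in the comments):

#include <Eigen/Dense>
#include <iostream>
using namespace Eigen;

int main()
{
  Matrix3f m;
  m << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;
  std::cout << m.colwise().sum()      << "\n";   // 1x3 row vector: 12 15 18
  std::cout << m.rowwise().maxCoeff() << "\n";   // 3x1 column vector: 3 6 9
  return 0;
}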


@ -518,6 +518,28 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
} // end namespace internal
namespace numext {
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sqrt(const float &x)
{
return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double sqrt(const double &x)
{
#if EIGEN_COMP_GNUC
return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
#else
return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
#endif
}
} // end namespace numext
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_SSE_H


@ -145,12 +145,9 @@ static void run(Index rows, Index cols, Index depth,
// Release all the sub blocks A'_i of A' for the current thread,
// i.e., we simply decrement the number of users by 1
#pragma omp critical
{
for(Index i=0; i<threads; ++i)
#pragma omp atomic
info[i].users -= 1;
}
}
}
else


@ -123,18 +123,18 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
template<typename Scalar, typename Index>
class BlasVectorMapper {
public:
EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_data[i];
}
template <typename Packet, int AlignmentType>
EIGEN_ALWAYS_INLINE Packet load(Index i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
template <typename Packet>
bool aligned(Index i) const {
EIGEN_DEVICE_FUNC bool aligned(Index i) const {
return (size_t(m_data+i)%sizeof(Packet))==0;
}
@ -148,25 +148,25 @@ class BlasLinearMapper {
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return ploadt<HalfPacket, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
}
@ -184,18 +184,18 @@ class blas_data_mapper {
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
}
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(&operator()(i, j));
}
@ -205,28 +205,28 @@ class blas_data_mapper {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return ploadt<Packet, AlignmentType>(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
const Index stride() const { return m_stride; }
const Scalar* data() const { return m_data; }
EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
Index firstAligned(Index size) const {
EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
if (size_t(m_data)%sizeof(Scalar)) {
return -1;
}


@ -224,7 +224,7 @@ enum {
/** \ingroup enums
* Enum for indicating whether a buffer is aligned or not. */
enum Foo {
enum {
Unaligned=0, /**< Data pointer has no specific alignment. */
Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */
Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */


@ -524,7 +524,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align
* \sa first_default_aligned()
*/
template<int Alignment, typename Scalar, typename Index>
inline Index first_aligned(const Scalar* array, Index size)
EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
{
static const Index ScalarSize = sizeof(Scalar);
static const Index AlignmentSize = Alignment / ScalarSize;
@ -544,14 +544,15 @@ inline Index first_aligned(const Scalar* array, Index size)
}
else
{
return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
return (first < size) ? first : size;
}
}
/** \internal Returns the index of the first element of the array that is well aligned with respect to the largest packet requirement.
* \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
template<typename Scalar, typename Index>
inline Index first_default_aligned(const Scalar* array, Index size)
EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size)
{
typedef typename packet_traits<Scalar>::type DefaultPacketType;
return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
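A small sketch of what this computation yields, assuming 4-byte floats and a 16-byte alignment requirement (first_aligned is internal API, so this is illustrative only):

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  alignas(16) float buffer[32];
  float* p = buffer + 1;                               // 4 bytes past a 16-byte boundary
  // AlignmentSize = 16/4 = 4 and AlignmentMask = 3, so the first aligned element is
  // at index (4 - 1) & 3 == 3, unless the array is shorter than that.
  Index i = internal::first_aligned<16>(p, Index(8));  // i == 3
  Index j = internal::first_aligned<16>(p, Index(2));  // j == 2 (clamped to the size)
  (void)i; (void)j;
  return 0;
}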


@ -50,6 +50,7 @@
THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
OUT_OF_RANGE_ACCESS,
YOU_MADE_A_PROGRAMMING_MISTAKE,
EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,


@ -390,9 +390,9 @@ struct transfer_constness
* a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
* many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
*
* \param T the type of the expression being nested.
* \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
* \param PlainObject the type of the temporary if needed.
* \tparam T the type of the expression being nested.
* \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
* \tparam PlainObject the type of the temporary if needed.
*/
template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval
{
@ -575,7 +575,7 @@ template <int ProductTag> struct product_promote_storage_type<Dense,
template <int ProductTag> struct product_promote_storage_type<PermutationStorage, Dense, ProductTag> { typedef Dense ret; };
/** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
* \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
* \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
*/
template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
struct plain_row_type


@ -375,8 +375,12 @@ namespace internal {
* Performs a QR step on a tridiagonal symmetric matrix represented as a
* pair of two vectors \a diag and \a subdiag.
*
* \param matA the input selfadjoint matrix
* \param hCoeffs returned Householder coefficients
* \param diag the diagonal part of the input selfadjoint tridiagonal matrix
* \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix
* \param start starting index of the submatrix to work on
* \param end last+1 index of the submatrix to work on
* \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0
* \param n size of the input matrix
*
* For compilation efficiency reasons, this procedure does not use eigen expression
* for its arguments.
@ -467,9 +471,10 @@ namespace internal {
* \brief Compute the eigendecomposition from a tridiagonal matrix
*
* \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
* \param[in] subdiag : The subdiagonal part of the matrix.
* \param[in,out] : On input, the maximum number of iterations, on output, the effective number of iterations.
* \param[out] eivec : The matrix to store the eigenvectors... if needed. allocated on input
* \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
* \param[in] maxIterations : the maximum number of iterations
* \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not
* \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
* \returns \c Success or \c NoConvergence
*/
template<typename MatrixType, typename DiagType, typename SubDiagType>
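A sketch of the public entry point that forwards to this routine, assuming the SelfAdjointEigenSolver::computeFromTridiagonal() interface:

#include <Eigen/Eigenvalues>
using namespace Eigen;

int main()
{
  VectorXd diag    = VectorXd::Random(6);   // main diagonal of the tridiagonal matrix
  VectorXd subdiag = VectorXd::Random(5);   // sub-diagonal, one entry shorter
  SelfAdjointEigenSolver<MatrixXd> eig;
  eig.computeFromTridiagonal(diag, subdiag, ComputeEigenvectors);
  if (eig.info() == Success) {
    VectorXd evals = eig.eigenvalues();     // sorted eigenvalues
    MatrixXd evecs = eig.eigenvectors();    // matching eigenvectors
  }
  return 0;
}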


@ -304,7 +304,6 @@ struct evaluator_traits<Homogeneous<ArgType,Direction> >
{
typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
typedef HomogeneousShape Shape;
static const int AssumeAliasing = 0;
};
template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };


@ -22,8 +22,8 @@ namespace Eigen {
* A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
* For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
*
* \param _Scalar the scalar type, i.e., the type of the coefficients
* \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
* \tparam _Scalar the scalar type, i.e., the type of the coefficients
* \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
* Notice that the dimension of the hyperplane is _AmbientDim-1.
*
 * This class represents a hyperplane as the zero set of the implicit equation
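A minimal sketch, assuming a plane in 3-space built from a unit normal and a point on the plane (expected values in the comments):

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  // The plane z = 2: normal (0,0,1) through the point (0,0,2).
  Hyperplane<float,3> plane(Vector3f::UnitZ(), Vector3f(0.f, 0.f, 2.f));
  float    d = plane.signedDistance(Vector3f(1.f, 1.f, 5.f));   // 3
  Vector3f q = plane.projection(Vector3f(1.f, 1.f, 5.f));       // (1, 1, 2)
  return 0;
}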


@ -23,8 +23,8 @@ namespace Eigen {
* direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
* the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
*
* \param _Scalar the scalar type, i.e., the type of the coefficients
* \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
* \tparam _Scalar the scalar type, i.e., the type of the coefficients
* \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
*/
template <typename _Scalar, int _AmbientDim, int _Options>
class ParametrizedLine
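A minimal sketch, assuming the Through() factory for a 2D line (expected values in the comments):

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  // Line through (0,0) and (2,0); Through() normalizes the direction.
  ParametrizedLine<float,2> line =
      ParametrizedLine<float,2>::Through(Vector2f(0.f, 0.f), Vector2f(2.f, 0.f));
  Vector2f p = line.pointAt(3.f);                  // origin + 3 * direction == (3, 0)
  float    d = line.distance(Vector2f(1.f, 2.f));  // 2
  return 0;
}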


@ -18,7 +18,7 @@ namespace Eigen {
*
* \brief Represents a rotation/orientation in a 2 dimensional space.
*
* \param _Scalar the scalar type, i.e., the type of the coefficients
* \tparam _Scalar the scalar type, i.e., the type of the coefficients
*
 * This class is equivalent to a single scalar representing a counterclockwise rotation
 * as a single angle in radians. It provides some additional features such as the automatic
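A minimal sketch, composing two planar rotations and applying the result to a vector:

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  Rotation2D<float> r1(0.25f * 3.14159265f);   // 45 degrees
  Rotation2D<float> r2(0.50f * 3.14159265f);   // 90 degrees
  Rotation2Df r = r2 * r1;                     // composition: 135 degrees
  Vector2f v = r * Vector2f::UnitX();          // rotated unit vector
  Matrix2f R = r.toRotationMatrix();           // explicit 2x2 rotation matrix
  return 0;
}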


@ -22,8 +22,8 @@ struct rotation_base_generic_product_selector;
*
* \brief Common base class for compact rotation representations
*
* \param Derived is the derived type, i.e., a rotation type
* \param _Dim the dimension of the space
* \tparam Derived is the derived type, i.e., a rotation type
* \tparam _Dim the dimension of the space
*/
template<typename Derived, int _Dim>
class RotationBase
@ -164,8 +164,8 @@ namespace internal {
*
* Helper function to return an arbitrary rotation object to a rotation matrix.
*
* \param Scalar the numeric type of the matrix coefficients
* \param Dim the dimension of the current space
* \tparam Scalar the numeric type of the matrix coefficients
* \tparam Dim the dimension of the current space
*
* It returns a Dim x Dim fixed size matrix.
*


@ -18,7 +18,7 @@ namespace Eigen {
*
* \brief Represents a generic uniform scaling transformation
*
* \param _Scalar the scalar type, i.e., the type of the coefficients.
* \tparam _Scalar the scalar type, i.e., the type of the coefficients.
*
 * This class represents a uniform scaling transformation. It is the return
* type of Scaling(Scalar), and most of the time this is the only way it


@ -18,8 +18,8 @@ namespace Eigen {
*
* \brief Represents a translation transformation
*
* \param _Scalar the scalar type, i.e., the type of the coefficients.
* \param _Dim the dimension of the space, can be a compile time value or Dynamic
* \tparam _Scalar the scalar type, i.e., the type of the coefficients.
* \tparam _Dim the dimension of the space, can be a compile time value or Dynamic
*
 * \note This class is not meant to be used to store a translation transformation,
 * but rather to simplify the construction and update of Transform objects.
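A minimal sketch of that intended use, composing a translation with a rotation into an affine Transform:

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  Translation3f shift(1.f, 2.f, 3.f);
  AngleAxisf    rot(0.5f, Vector3f::UnitZ());
  Affine3f      T = shift * rot;             // rotate first, then translate
  Vector3f      p = T * Vector3f::Zero();    // == (1, 2, 3)
  return 0;
}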


@ -21,7 +21,7 @@ namespace Eigen {
* References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
* Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999
*
* \tparam _MatrixType The type of the sparse matrix. It is advised to give a row-oriented sparse matrix
* \tparam Scalar the scalar type of the input matrices
* \tparam _UpLo The triangular part that will be used for the computations. It can be Lower
* or Upper. Default is Lower.
* \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<int>,


@ -29,7 +29,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
*
* \brief LU decomposition of a matrix with complete pivoting, and related features
*
* \param MatrixType the type of the matrix of which we are computing the LU decomposition
* \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
*
* This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
* decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
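A minimal usage sketch, assuming the usual solve/rank/kernel members of this decomposition:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(4, 4);
  VectorXd b = VectorXd::Random(4);
  FullPivLU<MatrixXd> lu(A);
  VectorXd x = lu.solve(b);    // a solution of A x = b, if one exists
  Index    r = lu.rank();      // numerical rank revealed by the complete pivoting
  MatrixXd K = lu.kernel();    // basis of the null space of A
  return 0;
}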


@ -34,7 +34,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
*
* \brief LU decomposition of a matrix with partial pivoting, and related features
*
* \param MatrixType the type of the matrix of which we are computing the LU decomposition
* \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition
*
* This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
* is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P


@ -84,8 +84,11 @@ StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const S
/** \internal
* \ingroup OrderingMethods_Module
* Approximate minimum degree ordering algorithm.
* \returns the permutation P reducing the fill-in of the input matrix \a C
* The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional.
*
* \param[in] C the input selfadjoint matrix stored in compressed column major format.
* \param[out] perm the permutation P reducing the fill-in of the input matrix \a C
*
* Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries.
* On exit the values of C are destroyed */
template<typename Scalar, typename StorageIndex>
void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm)


@ -516,7 +516,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */
Col [col].start = p [col] ;
Col [col].length = p [col+1] - p [col] ;
if (Col [col].length < 0)
if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200
{
/* column pointers must be non-decreasing */
stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;


@ -19,20 +19,21 @@ namespace internal {
/** \internal
* \ingroup OrderingMethods_Module
* \returns the symmetric pattern A^T+A from the input matrix A.
* \param[in] A the input non-symmetric matrix
* \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A.
* FIXME: The values should not be considered here
*/
template<typename MatrixType>
void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
{
MatrixType C;
C = mat.transpose(); // NOTE: Could be costly
C = A.transpose(); // NOTE: Could be costly
for (int i = 0; i < C.rows(); i++)
{
for (typename MatrixType::InnerIterator it(C, i); it; ++it)
it.valueRef() = 0.0;
}
symmat = C + mat;
symmat = C + A;
}
}


@ -184,7 +184,7 @@ class PastixBase : public SparseSolverBase<Derived>
* The statistics related to the different phases of factorization and solve are saved here as well
* \sa analyzePattern() factorize()
*/
Array<RealScalar,IPARM_SIZE,1>& dparm()
Array<double,DPARM_SIZE,1>& dparm()
{
return m_dparm;
}
@ -244,8 +244,8 @@ class PastixBase : public SparseSolverBase<Derived>
mutable ComputationInfo m_info;
mutable pastix_data_t *m_pastixdata; // Data structure for pastix
mutable int m_comm; // The MPI communicator identifier
mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
mutable Array<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
mutable Array<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector
mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector
mutable int m_size; // Size of the matrix
@ -268,7 +268,7 @@ void PastixBase<Derived>::init()
0, 0, 0, 1, m_iparm.data(), m_dparm.data());
m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO;
m_iparm[IPARM_VERBOSE] = 2;
m_iparm[IPARM_VERBOSE] = API_VERBOSE_NOT;
m_iparm[IPARM_ORDERING] = API_ORDER_SCOTCH;
m_iparm[IPARM_INCOMPLETE] = API_NO;
m_iparm[IPARM_OOC_LIMIT] = 2000;
@ -581,6 +581,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
{
out.resize(matrix.rows(), matrix.cols());
// Pastix supports only lower, column-major matrices
out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
internal::c_to_fortran_numbering(out);
@ -666,6 +667,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
{
// Pastix supports only lower, column-major matrices
out.resize(matrix.rows(), matrix.cols());
out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
internal::c_to_fortran_numbering(out);
}


@ -28,7 +28,7 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
*
* \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
*
* \param MatrixType the type of the matrix of which we are computing the QR decomposition
* \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
*
* This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
* such that
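A minimal usage sketch, assuming a tall, possibly rank-deficient matrix and the usual solve/rank members:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(6, 4);
  VectorXd b = VectorXd::Random(6);
  ColPivHouseholderQR<MatrixXd> qr(A);
  VectorXd x = qr.solve(b);    // least-squares solution of A x = b
  Index    r = qr.rank();      // numerical rank revealed by the column pivoting
  return 0;
}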


@ -37,7 +37,7 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
*
* \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
*
* \param MatrixType the type of the matrix of which we are computing the QR decomposition
* \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
*
* This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
* such that


@ -21,7 +21,7 @@ namespace Eigen {
*
* \brief Householder QR decomposition of a matrix
*
* \param MatrixType the type of the matrix of which we are computing the QR decomposition
* \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition
*
* This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
* such that


@ -47,9 +47,8 @@ struct traits<BDCSVD<_MatrixType> >
*
* \brief class Bidiagonal Divide and Conquer SVD
*
* \param MatrixType the type of the matrix of which we are computing the SVD decomposition
* We plan to have a very similar interface to JacobiSVD on this class.
* It should be used to speed up the calcul of SVD for big matrices.
* \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
*
*/
template<typename _MatrixType>
class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >


@ -449,8 +449,8 @@ struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
*
* \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
*
* \param MatrixType the type of the matrix of which we are computing the SVD decomposition
* \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
* \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
 * \tparam QRPreconditioner this optional parameter allows specifying the type of QR decomposition that will be used internally
* for the R-SVD step for non-square matrices. See discussion of possible values below.
*
* SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
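A minimal usage sketch, assuming thin unitaries and the usual singularValues()/solve() members:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXf A = MatrixXf::Random(5, 3);
  JacobiSVD<MatrixXf> svd(A, ComputeThinU | ComputeThinV);
  VectorXf s = svd.singularValues();                // sorted in decreasing order
  VectorXf b = VectorXf::Random(5);
  VectorXf x = svd.solve(b);                        // least-squares solution of A x = b
  MatrixXf Ar = svd.matrixU() * s.asDiagonal() * svd.matrixV().transpose();  // ~= A
  return 0;
}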


@ -39,18 +39,16 @@ namespace internal {
} // end namespace internal
/** \ingroup SparseCholesky_Module
* \brief A direct sparse Cholesky factorizations
* \brief A base class for direct sparse Cholesky factorizations
*
* These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are
* selfadjoint and positive definite. The factorization allows for solving A.X = B where
* This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are
* selfadjoint and positive definite. These factorizations allow for solving A.X = B where
* X and B can be either dense or sparse.
*
* In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
* such that the factorized matrix is P A P^-1.
*
* \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
* \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
* or Upper. Default is Lower.
* \tparam Derived the type of the derived class, that is the actual factorization type.
*
*/
template<typename Derived>
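A minimal sketch using one of the derived factorizations (SimplicialLDLT is assumed here) on a small SPD tridiagonal matrix:

#include <Eigen/SparseCore>
#include <Eigen/SparseCholesky>
using namespace Eigen;

int main()
{
  const int n = 5;
  SparseMatrix<double> A(n, n);                   // 2,-1 tridiagonal stencil: SPD
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i + 1 < n) { A.insert(i, i + 1) = -1.0; A.insert(i + 1, i) = -1.0; }
  }
  A.makeCompressed();
  VectorXd b = VectorXd::Ones(n);
  SimplicialLDLT<SparseMatrix<double> > ldlt(A);  // analyzePattern + factorize
  if (ldlt.info() == Success) {
    VectorXd x = ldlt.solve(b);
  }
  return 0;
}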


@ -22,6 +22,16 @@ struct traits<SparseCompressedBase<Derived> > : traits<Derived>
} // end namespace internal
/** \ingroup SparseCore_Module
* \class SparseCompressedBase
* \brief Common base class for sparse [compressed]-{row|column}-storage format.
*
* This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as:
* - SparseMatrix
* - Ref<SparseMatrixType,Options>
* - Map<SparseMatrixType>
*
*/
template<typename Derived>
class SparseCompressedBase
: public SparseMatrixBase<Derived>


@ -37,11 +37,15 @@ struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, St
};
} // end namespace internal
template<typename Derived,
int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
> class SparseMapBase;
/** \ingroup SparseCore_Module
 * \class SparseMapBase
 * \brief Common base class for Map and Ref instances of sparse matrices and vectors.
*/
template<typename Derived>
class SparseMapBase<Derived,ReadOnlyAccessors>
: public SparseCompressedBase<Derived>
@ -71,22 +75,33 @@ class SparseMapBase<Derived,ReadOnlyAccessors>
public:
/** \copydoc SparseMatrixBase::rows() */
inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
/** \copydoc SparseMatrixBase::cols() */
inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
/** \copydoc SparseMatrixBase::innerSize() */
inline Index innerSize() const { return m_innerSize; }
/** \copydoc SparseMatrixBase::outerSize() */
inline Index outerSize() const { return m_outerSize; }
/** \copydoc SparseCompressedBase::nonZeros */
inline Index nonZeros() const { return m_zero_nnz[1]; }
/** \copydoc SparseCompressedBase::isCompressed */
bool isCompressed() const { return m_innerNonZeros==0; }
//----------------------------------------
// direct access interface
/** \copydoc SparseMatrix::valuePtr */
inline const Scalar* valuePtr() const { return m_values; }
/** \copydoc SparseMatrix::innerIndexPtr */
inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
/** \copydoc SparseMatrix::outerIndexPtr */
inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
/** \copydoc SparseMatrix::innerNonZeroPtr */
inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
//----------------------------------------
/** \copydoc SparseMatrix::coeff */
inline Scalar coeff(Index row, Index col) const
{
const Index outer = IsRowMajor ? row : col;
@ -125,6 +140,10 @@ class SparseMapBase<Derived,ReadOnlyAccessors>
inline SparseMapBase() {}
};
/** \ingroup SparseCore_Module
 * \class SparseMapBase
 * \brief Common base class for writable Map and Ref instances of sparse matrices and vectors.
*/
template<typename Derived>
class SparseMapBase<Derived,WriteAccessors>
: public SparseMapBase<Derived,ReadOnlyAccessors>
@ -185,9 +204,23 @@ class SparseMapBase<Derived,WriteAccessors>
inline SparseMapBase() {}
};
/** \ingroup SparseCore_Module
*
* \brief Specialization of class Map for SparseMatrix-like storage.
*
* \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
*
* \sa class Map, class SparseMatrix, class Ref<SparseMatrixType,Options>
*/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
: public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
#else
template<typename SparseMatrixType>
class Map<SparseMatrixType>
: public SparseMapBase<Derived,WriteAccessors>
#endif
{
public:
typedef SparseMapBase<Map> Base;
@ -196,6 +229,12 @@ class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
public:
/** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
* stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
* If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
*
* More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages".
*/
inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
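A minimal sketch of this constructor, assuming hand-built compressed column storage arrays describing the 3x3 matrix [[1,0,2],[0,3,0],[0,0,4]]:

#include <Eigen/SparseCore>
using namespace Eigen;

int main()
{
  int    outer[4]  = {0, 1, 2, 4};       // start of each column in the arrays below
  int    inner[4]  = {0, 1, 0, 2};       // row index of each stored value
  double values[4] = {1.0, 3.0, 2.0, 4.0};
  Map<SparseMatrix<double> > view(3, 3, 4, outer, inner, values);  // no copy is made
  double v = view.coeff(0, 2);           // == 2
  return 0;
}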


@ -108,20 +108,25 @@ protected:
/**
* \ingroup Sparse_Module
* \ingroup SparseCore_Module
*
* \brief A sparse matrix expression referencing an existing sparse expression
*
* \tparam PlainObjectType the equivalent sparse matrix type of the referenced data
* \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix.
 * \tparam Options specifies whether a standard compressed format is required; \c Options can be \c #StandardCompressedFormat or \c 0.
* The default is \c 0.
* \tparam StrideType Only used for dense Ref
*
* \sa class Ref
*/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
: public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
#else
template<typename SparseMatrixType, int Options>
class Ref<SparseMatrixType, Options>
: public SparseMapBase<Derived,WriteAccessors> // yes, that's weird to use Derived here, but that works!
#endif
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
typedef internal::traits<Ref> Traits;
@ -155,6 +160,7 @@ class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
template<typename Derived>
inline Ref(const SparseCompressedBase<Derived>& expr)
#else
/** Implicit constructor from any sparse expression (2D matrix or 1D vector) */
template<typename Derived>
inline Ref(SparseCompressedBase<Derived>& expr)
#endif
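A minimal sketch of how such a Ref is typically consumed (the helper name sumOfValues is illustrative): a function taking Ref<const SparseMatrix<double> > binds to any compatible sparse object without copying when it is already stored in the expected compressed format:

#include <Eigen/SparseCore>
using namespace Eigen;

double sumOfValues(const Ref<const SparseMatrix<double> >& m)
{
  double s = 0;
  for (Index k = 0; k < m.nonZeros(); ++k)
    s += m.valuePtr()[k];
  return s;
}

int main()
{
  SparseMatrix<double> A(2, 2);
  A.insert(0, 0) = 1.0;
  A.insert(1, 1) = 2.0;
  A.makeCompressed();
  double s = sumOfValues(A);   // binds to A's arrays directly; s == 3
  return 0;
}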
@ -225,19 +231,23 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
/**
* \ingroup Sparse_Module
* \ingroup SparseCore_Module
*
* \brief A sparse vector expression referencing an existing sparse vector expression
*
* \tparam PlainObjectType the equivalent sparse matrix type of the referenced data
* \tparam Options Not used for SparseVector.
* \tparam StrideType Only used for dense Ref
* \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector.
*
* \sa class Ref
*/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType >
: public internal::SparseRefBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
#else
template<typename SparseVectorType>
class Ref<SparseVectorType>
: public SparseMapBase<Derived,WriteAccessors>
#endif
{
typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
typedef internal::traits<Ref> Traits;
@ -259,6 +269,7 @@ class Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType >
template<typename Derived>
inline Ref(const SparseCompressedBase<Derived>& expr)
#else
/** Implicit constructor from any 1D sparse vector expression */
template<typename Derived>
inline Ref(SparseCompressedBase<Derived>& expr)
#endif


@ -211,8 +211,6 @@ struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
{
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef SparseSelfAdjointShape Shape;
static const int AssumeAliasing = 0;
};
struct SparseSelfAdjoint2Sparse {};


@ -14,22 +14,21 @@
namespace Eigen {
namespace internal {
/**
* \brief Performs numeric block updates from a given supernode to a single column
*
* \param segsize Size of the segment (and blocks ) to use for updates
* \param[in,out] dense Packed values of the original matrix
* \param tempv temporary vector to use for updates
* \param lusup array containing the supernodes
* \param lda Leading dimension in the supernode
* \param nrow Number of rows in the rectangular part of the supernode
* \param lsub compressed row subscripts of supernodes
* \param lptr pointer to the first column of the current supernode in lsub
* \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
* \return 0 on success
*/
template <int SegSizeAtCompileTime> struct LU_kernel_bmod
{
/** \internal
* \brief Performs numeric block updates from a given supernode to a single column
*
* \param segsize Size of the segment (and blocks ) to use for updates
* \param[in,out] dense Packed values of the original matrix
* \param tempv temporary vector to use for updates
* \param lusup array containing the supernodes
* \param lda Leading dimension in the supernode
* \param nrow Number of rows in the rectangular part of the supernode
* \param lsub compressed row subscripts of supernodes
* \param lptr pointer to the first column of the current supernode in lsub
* \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
*/
template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);


@ -691,7 +691,6 @@ struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
typedef typename SparseQRType::MatrixType MatrixType;
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef SparseShape Shape;
static const int AssumeAliasing = 0;
};
template< typename DstXprType, typename SparseQRType>


@ -32,17 +32,19 @@ A << 1, 2, 3, // Initialize A. The elements can also be
B << A, A, A; // B is three horizontally stacked A's.
A.fill(10); // Fill A with all 10's.
// Eigen // Matlab
MatrixXd::Identity(rows,cols) // eye(rows,cols)
C.setIdentity(rows,cols) // C = eye(rows,cols)
MatrixXd::Zero(rows,cols) // zeros(rows,cols)
C.setZero(rows,cols) // C = ones(rows,cols)
MatrixXd::Ones(rows,cols) // ones(rows,cols)
C.setOnes(rows,cols) // C = ones(rows,cols)
MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1).
C.setRandom(rows,cols) // C = rand(rows,cols)*2-1
VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)'
v.setLinSpaced(size,low,high) // v = linspace(low,high,size)'
// Eigen // Matlab
MatrixXd::Identity(rows,cols) // eye(rows,cols)
C.setIdentity(rows,cols) // C = eye(rows,cols)
MatrixXd::Zero(rows,cols) // zeros(rows,cols)
C.setZero(rows,cols) // C = zeros(rows,cols)
MatrixXd::Ones(rows,cols) // ones(rows,cols)
C.setOnes(rows,cols) // C = ones(rows,cols)
MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1).
C.setRandom(rows,cols) // C = rand(rows,cols)*2-1
VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)'
v.setLinSpaced(size,low,high) // v = linspace(low,high,size)'
VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi
low,low+step*(size-1))
// Matrix slicing and blocks. All expressions listed here are read/write.
@ -85,13 +87,15 @@ P.bottomRightCorner<rows,cols>() // P(end-rows+1:end, end-cols+1:end)
R.row(i) = P.col(j); // R(i, :) = P(:, j)'
R.col(j1).swap(mat1.col(j2)); // R(:, [j1 j2]) = R(:, [j2, j1])
// Views, transpose, etc; all read-write except for .adjoint().
// Views, transpose, etc;
// Eigen // Matlab
R.adjoint() // R'
R.transpose() // R.' or conj(R')
R.diagonal() // diag(R)
R.transpose() // R.' or conj(R') // Read-write
R.diagonal() // diag(R) // Read-write
x.asDiagonal() // diag(x)
R.transpose().colwise().reverse(); // rot90(R)
R.transpose().colwise().reverse() // rot90(R) // Read-write
R.replicate(i,j) // repmat(R,i,j)
// All the same as Matlab, but matlab doesn't have *= style operators.
// Matrix-vector. Matrix-matrix. Matrix-scalar.
@ -103,37 +107,39 @@ a *= M; R = P + Q; R = P/s;
R -= Q; R /= s;
// Vectorized operations on each element independently
// Eigen // Matlab
R = P.cwiseProduct(Q); // R = P .* Q
R = P.array() * s.array();// R = P .* s
R = P.cwiseQuotient(Q); // R = P ./ Q
R = P.array() / Q.array();// R = P ./ Q
R = P.array() + s.array();// R = P + s
R = P.array() - s.array();// R = P - s
R.array() += s; // R = R + s
R.array() -= s; // R = R - s
R.array() < Q.array(); // R < Q
R.array() <= Q.array(); // R <= Q
R.cwiseInverse(); // 1 ./ P
R.array().inverse(); // 1 ./ P
R.array().sin() // sin(P)
R.array().cos() // cos(P)
R.array().pow(s) // P .^ s
R.array().square() // P .^ 2
R.array().cube() // P .^ 3
R.cwiseSqrt() // sqrt(P)
R.array().sqrt() // sqrt(P)
R.array().exp() // exp(P)
R.array().log() // log(P)
R.cwiseMax(P) // max(R, P)
R.array().max(P.array()) // max(R, P)
R.cwiseMin(P) // min(R, P)
R.array().min(P.array()) // min(R, P)
R.cwiseAbs() // abs(P)
R.array().abs() // abs(P)
R.cwiseAbs2() // abs(P.^2)
R.array().abs2() // abs(P.^2)
(R.array() < s).select(P,Q); // (R < s ? P : Q)
// Eigen // Matlab
R = P.cwiseProduct(Q); // R = P .* Q
R = P.array() * s.array(); // R = P .* s
R = P.cwiseQuotient(Q); // R = P ./ Q
R = P.array() / Q.array(); // R = P ./ Q
R = P.array() + s.array(); // R = P + s
R = P.array() - s.array(); // R = P - s
R.array() += s; // R = R + s
R.array() -= s; // R = R - s
R.array() < Q.array(); // R < Q
R.array() <= Q.array(); // R <= Q
R.cwiseInverse(); // 1 ./ P
R.array().inverse(); // 1 ./ P
R.array().sin() // sin(P)
R.array().cos() // cos(P)
R.array().pow(s) // P .^ s
R.array().square() // P .^ 2
R.array().cube() // P .^ 3
R.cwiseSqrt() // sqrt(P)
R.array().sqrt() // sqrt(P)
R.array().exp() // exp(P)
R.array().log() // log(P)
R.cwiseMax(P) // max(R, P)
R.array().max(P.array()) // max(R, P)
R.cwiseMin(P) // min(R, P)
R.array().min(P.array()) // min(R, P)
R.cwiseAbs() // abs(P)
R.array().abs() // abs(P)
R.cwiseAbs2() // abs(P.^2)
R.array().abs2() // abs(P.^2)
(R.array() < s).select(P,Q ); // (R < s ? P : Q)
R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0)
// Reductions.
int r, c;
@ -164,12 +170,12 @@ x.dot(y) // dot(x, y)
x.cross(y) // cross(x, y) Requires #include <Eigen/Geometry>
//// Type conversion
// Eigen // Matlab
A.cast<double>(); // double(A)
A.cast<float>(); // single(A)
A.cast<int>(); // int32(A)
A.real(); // real(A)
A.imag(); // imag(A)
// Eigen // Matlab
A.cast<double>(); // double(A)
A.cast<float>(); // single(A)
A.cast<int>(); // int32(A)
A.real(); // real(A)
A.imag(); // imag(A)
// if the original type equals destination type, no work is done
// Note that for most operations Eigen requires all operands to have the same type:


@ -42,6 +42,15 @@ template<> struct adjoint_specific<false> {
VERIFY_IS_APPROX(v1, v1.norm() * v3);
VERIFY_IS_APPROX(v3, v1.normalized());
VERIFY_IS_APPROX(v3.norm(), RealScalar(1));
// check null inputs
VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
VERIFY( (v1*very_small).norm() == 0 );
VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
v3 = v1*very_small;
v3.normalize();
VERIFY_IS_APPROX(v3, (v1*very_small));
// check compatibility of dot and adjoint
ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));


@ -15,16 +15,18 @@
template<typename T, typename I> void test_incomplete_cholesky_T()
{
typedef SparseMatrix<T,0,I> SparseMatrixType;
ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > > cg_illt_lower_amd;
ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I> > > cg_illt_lower_nat;
ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I> > > cg_illt_upper_amd;
ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I> > > cg_illt_upper_nat;
ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > > cg_illt_lower_amd;
ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I> > > cg_illt_lower_nat;
ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I> > > cg_illt_upper_amd;
ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I> > > cg_illt_upper_nat;
ConjugateGradient<SparseMatrixType, Upper|Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > > cg_illt_uplo_amd;
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) );
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) );
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) );
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) );
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) );
}
void test_incomplete_cholesky()


@ -27,6 +27,14 @@ template<typename T> void test_pastix_T()
check_sparse_spd_solving(pastix_llt_upper);
check_sparse_spd_solving(pastix_ldlt_upper);
check_sparse_square_solving(pastix_lu);
// Some compilation check:
pastix_llt_lower.iparm();
pastix_llt_lower.dparm();
pastix_ldlt_lower.iparm();
pastix_ldlt_lower.dparm();
pastix_lu.iparm();
pastix_lu.dparm();
}
// There is no support for selfadjoint matrices with PaStiX.


@ -145,14 +145,31 @@ template<typename MatrixType> void product(const MatrixType& m)
VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval());
// inner product
Scalar x = square2.row(c) * square2.col(c2);
VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
{
Scalar x = square2.row(c) * square2.col(c2);
VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum());
}
// outer product
VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose());
VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols));
{
VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose());
VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols));
VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols));
}
// Aliasing
{
ColVectorType x(cols); x.setRandom();
ColVectorType z(x);
ColVectorType y(cols); y.setZero();
ColSquareMatrixType A(cols,cols); A.setRandom();
// CwiseBinaryOp
VERIFY_IS_APPROX(x = y + A*x, A*z);
x = z;
// CwiseUnaryOp
VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z);
}
}


@ -9,6 +9,27 @@
#include "product.h"
template<typename T>
void test_aliasing()
{
int rows = internal::random<int>(1,12);
int cols = internal::random<int>(1,12);
typedef Matrix<T,Dynamic,Dynamic> MatrixType;
typedef Matrix<T,Dynamic,1> VectorType;
VectorType x(cols); x.setRandom();
VectorType z(x);
VectorType y(rows); y.setZero();
MatrixType A(rows,cols); A.setRandom();
// CwiseBinaryOp
VERIFY_IS_APPROX(x = y + A*x, A*z); // OK because "y + A*x" is marked as "assume-aliasing"
x = z;
// CwiseUnaryOp
VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression
x = z;
// VERIFY_IS_APPROX(x = y-A*x, -A*z); // Not OK in 3.3 because x is resized before A*x gets evaluated
x = z;
}
void test_product_large()
{
for(int i = 0; i < g_repeat; i++) {
@ -17,6 +38,8 @@ void test_product_large()
CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_1( test_aliasing<float>() );
}
#if defined EIGEN_TEST_PART_6


@ -43,10 +43,16 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
r1 = internal::random<Index>(8,rows-r0);
VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1);
VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1);
VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0);
VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1);
// VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1);
VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0);
VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1);
VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1);
VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0);
VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0);
VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0);


@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorReductionCuda.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionCuda.h"


@ -109,11 +109,9 @@ template<int n, typename x> struct get;
template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
template<int n EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, as)> struct get<n, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
template<typename T, int n EIGEN_TPL_PP_SPEC_HACK_DEFC(T, as)> struct get<n, numeric_list<T EIGEN_TPL_PP_SPEC_HACK_USEC(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
/* always get type, regardless of dummy; good for parameter pack expansion */
@ -406,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
template<typename Op, typename A, typename B, std::size_t N, int... n>
constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
{
return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
return array<decltype(Op::run(A(), B())),N>{ Op::run(array_get<n>(a), array_get<n>(b))... };
}
template<typename Op, typename A, typename B, std::size_t N>
@ -434,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
template<typename Op, typename A, std::size_t N, int... n>
constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
{
return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
return array<decltype(Op::run(A())),N>{ Op::run(array_get<n>(a))... };
}
template<typename Op, typename A, std::size_t N>


@ -132,13 +132,13 @@ template <typename T> class array<T, 0> {
return *static_cast<const T*>(NULL);
}
static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() { }
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
array(std::initializer_list<T> l) {
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) {
eigen_assert(l.size() == 0);
}
#endif


@ -78,7 +78,8 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true
CoordAccess = true,
RawAccess = true
};
static const int Options = Options_;
@ -118,7 +119,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return coeff(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#endif
@ -158,7 +159,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return coeffRef(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#endif
@ -198,7 +199,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return this->operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#else
EIGEN_DEVICE_FUNC
@ -265,7 +266,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#else
EIGEN_DEVICE_FUNC
@ -341,7 +342,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
: m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
: m_storage(internal::array_prod(array<Index, NumIndices>{firstDimension, otherDimensions...}), array<Index, NumIndices>{firstDimension, otherDimensions...})
{
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@ -426,7 +427,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
resize(array<Index, NumIndices>{firstDimension, otherDimensions...});
}
#endif


@ -89,6 +89,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -134,7 +135,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
typedef Index Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
};
@ -210,6 +211,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
BlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

View File

@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess,
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
@ -152,6 +153,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
return m_leftImpl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;

View File

@ -46,6 +46,21 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorB
typedef TensorBroadcastingOp<Broadcast, XprType> type;
};
template <typename Dims>
struct is_input_scalar {
static const bool value = false;
};
template <>
struct is_input_scalar<Sizes<> > {
static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::size_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
static const bool value = (Sizes<Indices...>::total_size == 1);
};
#endif
} // end namespace internal
@ -94,6 +109,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -103,7 +119,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// and store the result in a scalar. Instead one should reshape the scalar into an N-D
// tensor (N >= 1) with a single element first and then broadcast.
EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const InputDimensions& input_dims = m_impl.dimensions();
const Broadcast& broadcast = op.broadcast();
for (int i = 0; i < NumDims; ++i) {
eigen_assert(input_dims[i] > 0);
@ -143,6 +159,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
{
if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
return m_impl.coeff(0);
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return coeffColMajor(index);
} else {
@ -214,6 +234,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
{
if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
return internal::pset1<PacketReturnType>(m_impl.coeff(0));
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return packetColMajor<LoadMode>(index);
} else {
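The static assertion and the new is_input_scalar handling above reflect the restriction spelled out in the comment: a scalar has to be reshaped into a one-element tensor of rank >= 1 before it can be broadcast. A minimal sketch (names and sizes are illustrative):

Eigen::Tensor<float, 1> s(1);
s(0) = 3.0f;
Eigen::array<Eigen::DenseIndex, 2> shape{{1, 1}};
Eigen::array<Eigen::DenseIndex, 2> bcast{{4, 5}};
Eigen::Tensor<float, 2> t = s.reshape(shape).broadcast(bcast);   // 4x5 tensor filled with 3.0f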

View File

@ -145,6 +145,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -304,6 +305,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

View File

@ -125,6 +125,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -287,6 +288,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)

View File

@ -21,358 +21,6 @@ namespace Eigen {
*/
namespace internal {
enum {
Rhs = 0,
Lhs = 1,
};
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
SimpleTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
m_tensor(tensor),
m_nocontract_strides(nocontract_strides),
m_ij_strides(ij_strides),
m_contract_strides(contract_strides),
m_k_strides(k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
// column major assumption
return operator()(row, 0);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
return m_tensor.coeff(computeIndex(row, col));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
const bool left = (side == Lhs);
Index nocontract_val = left ? row : col;
Index linidx = 0;
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx = nocontract_val / m_ij_strides[i];
linidx += idx * m_nocontract_strides[i];
nocontract_val -= idx * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx += nocontract_val;
} else {
linidx += nocontract_val * m_nocontract_strides[0];
}
}
Index contract_val = left ? col : row;
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx = contract_val / m_k_strides[i];
linidx += idx * m_contract_strides[i];
contract_val -= idx * m_k_strides[i];
}
if(array_size<contract_t>::value > 0) {
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx += contract_val;
} else {
linidx += contract_val * m_contract_strides[0];
}
}
return linidx;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
linidx[0] += idx0 * m_nocontract_strides[i];
linidx[1] += idx1 * m_nocontract_strides[i];
nocontract_val[0] -= idx0 * m_ij_strides[i];
nocontract_val[1] -= idx1 * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
linidx[1] += nocontract_val[1];
} else {
linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
}
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx0 = contract_val[0] / m_k_strides[i];
const Index idx1 = contract_val[1] / m_k_strides[i];
linidx[0] += idx0 * m_contract_strides[i];
linidx[1] += idx1 * m_contract_strides[i];
contract_val[0] -= idx0 * m_k_strides[i];
contract_val[1] -= idx1 * m_k_strides[i];
}
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx[0] += contract_val[0];
linidx[1] += contract_val[1];
} else {
linidx[0] += contract_val[0] * m_contract_strides[0];
linidx[1] += contract_val[1] * m_contract_strides[0];
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
Index firstAligned(Index size) const {
return size;
}
Index stride() const {
return 1;
}
protected:
const Tensor m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
// current code assumes packet size must be a multiple of 2
EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
const Index index = this->computeIndex(i, j);
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
return this->m_tensor.template packet<Alignment>(index);
}
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
const Index first = indexPair.first;
const Index last = indexPair.second;
// We can always do optimized packet reads from left hand side right now, because
// the vertical matrix dimension on the left hand side is never contracting.
// On the right hand side we need to check if the contracting dimensions may have
// been shuffled first.
if (Tensor::PacketAccess &&
(side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
(last - first) == (packet_size - 1)) {
return this->m_tensor.template packet<Alignment>(first);
}
EIGEN_ALIGN_MAX Scalar data[packet_size];
data[0] = this->m_tensor.coeff(first);
for (Index k = 1; k < packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
}
data[packet_size - 1] = this->m_tensor.coeff(last);
return pload<Packet>(data);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
const Index half_packet_size = unpacket_traits<HalfPacket>::size;
if (half_packet_size == packet_size) {
return loadPacket(i, j);
}
EIGEN_ALIGN_MAX Scalar data[half_packet_size];
for (Index k = 0; k < half_packet_size; k++) {
data[k] = operator()(i + k, j);
}
return pload<HalfPacket>(data);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
EIGEN_ALIGN_MAX Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename packet_traits<Scalar>::type>(data);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper;
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_base_mapper(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE);
return loadPacket(i);
}
template <typename Packet>
EIGEN_DEVICE_FUNC bool aligned(Index) const {
return false;
}
private:
const ParentMapper& m_base_mapper;
const Index m_vert_offset;
const Index m_horiz_offset;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides)
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
};
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
@ -480,6 +128,7 @@ struct TensorContractionEvaluatorBase
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
// Most of the code is assuming that both input tensors are ColMajor. If the
@ -498,8 +147,6 @@ struct TensorContractionEvaluatorBase
static const int ContractDims = internal::array_size<Indices>::value;
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
@ -546,8 +193,8 @@ struct TensorContractionEvaluatorBase
// We need to flip all the pairs of contracting indices as well as
// reversing the dimensions.
for (int i = 0; i < ContractDims; i++) {
eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
}
}
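The flipped remapping above is internal bookkeeping for the non-ColMajor case; at the API level a contraction is still specified through IndexPair dimensions. A minimal sketch (sizes are illustrative):

Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
a.setRandom();
b.setRandom();
Eigen::array<Eigen::IndexPair<int>, 1> dims{{ Eigen::IndexPair<int>(1, 0) }};
Eigen::Tensor<float, 2> c = a.contract(b, dims);   // 2x4 result, i.e. the matrix product a * b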
@ -741,17 +388,19 @@ struct TensorContractionEvaluatorBase
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
const Index lhs_packet_size = internal::packet_traits<LhsScalar>::size;
const Index rhs_packet_size = internal::packet_traits<RhsScalar>::size;
const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
lhs_inner_dim_contiguous,
false, Unaligned> LhsMapper;
false, lhs_alignment> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
RightEvaluator, right_nocontract_t,
contract_t, rhs_packet_size,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned> RhsMapper;
rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
m_left_contracting_strides, m_k_strides);
@ -784,11 +433,11 @@ struct TensorContractionEvaluatorBase
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
return internal::ploadt<Packet, LoadMode>(m_result + index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
protected:
// Prevent assignment
@ -853,9 +502,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;

View File

@ -1261,7 +1261,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Base(op, device) {}
// We need to redefine this method to make nvcc happy
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@ -1317,6 +1317,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void evalTyped(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
EIGEN_UNUSED_VARIABLE(k)
// rows in left side
const Index m = this->m_i_size;
@ -1361,7 +1362,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const dim3 block_size(16, 16, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
} else {
const Index m_blocks = (m + 127) / 128;
const Index m_blocks = (m + 127) / 128;
const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 32, 1);

View File

@ -0,0 +1,464 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
namespace Eigen {
namespace internal {
enum {
Rhs = 0,
Lhs = 1,
};
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
enum {
DirectOffsets = false
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
eigen_assert(false && "unsupported");
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
{
return m_tensor.template packet<LoadMode>(index);
}
private:
const Tensor m_tensor;
};
template <typename Tensor> struct CoeffLoader<Tensor, true> {
enum {
DirectOffsets = true
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
m_data += offset;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
{
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
}
private:
typedef typename Tensor::Scalar Scalar;
const Scalar* m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous, int Alignment>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
SimpleTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
m_tensor(tensor),
m_nocontract_strides(nocontract_strides),
m_ij_strides(ij_strides),
m_contract_strides(contract_strides),
m_k_strides(k_strides) { }
enum {
DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
m_tensor.offsetBuffer(offset);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
// column major assumption
return operator()(row, 0);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
return m_tensor.coeff(computeIndex(row, col));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
const bool left = (side == Lhs);
Index nocontract_val = left ? row : col;
Index linidx = 0;
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx = nocontract_val / m_ij_strides[i];
linidx += idx * m_nocontract_strides[i];
nocontract_val -= idx * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx += nocontract_val;
} else {
linidx += nocontract_val * m_nocontract_strides[0];
}
}
Index contract_val = left ? col : row;
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx = contract_val / m_k_strides[i];
linidx += idx * m_contract_strides[i];
contract_val -= idx * m_k_strides[i];
}
if(array_size<contract_t>::value > 0) {
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx += contract_val;
} else {
linidx += contract_val * m_contract_strides[0];
}
}
return linidx;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
linidx[0] += idx0 * m_nocontract_strides[i];
linidx[1] += idx1 * m_nocontract_strides[i];
nocontract_val[0] -= idx0 * m_ij_strides[i];
nocontract_val[1] -= idx1 * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
linidx[1] += nocontract_val[1];
} else {
linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
}
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx0 = contract_val[0] / m_k_strides[i];
const Index idx1 = contract_val[1] / m_k_strides[i];
linidx[0] += idx0 * m_contract_strides[i];
linidx[1] += idx1 * m_contract_strides[i];
contract_val[0] -= idx0 * m_k_strides[i];
contract_val[1] -= idx1 * m_k_strides[i];
}
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx[0] += contract_val[0];
linidx[1] += contract_val[1];
} else {
linidx[0] += contract_val[0] * m_contract_strides[0];
linidx[1] += contract_val[1] * m_contract_strides[0];
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
// Only claim alignment when we can compute the actual stride (i.e. when we're
// dealing with the lhs with inner_dim_contiguous). This is because the
// matrix-vector product relies on the stride when dealing with aligned inputs.
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
}
protected:
CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
template <int AlignmentType = Alignment>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
// current code assumes packet size must be a multiple of 2
EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
const Index index = this->computeIndex(i, j);
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
return this->m_tensor.template packet<AlignmentType>(index);
}
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
const Index first = indexPair.first;
const Index last = indexPair.second;
// We can always do optimized packet reads from left hand side right now, because
// the vertical matrix dimension on the left hand side is never contracting.
// On the right hand side we need to check if the contracting dimensions may have
// been shuffled first.
if (Tensor::PacketAccess &&
(side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
(last - first) == (packet_size - 1)) {
return this->m_tensor.template packet<AlignmentType>(first);
}
EIGEN_ALIGN_MAX Scalar data[packet_size];
data[0] = this->m_tensor.coeff(first);
for (Index k = 1; k < packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
}
data[packet_size - 1] = this->m_tensor.coeff(last);
return pload<Packet>(data);
}
template <int AlignmentType = Alignment>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
const Index half_packet_size = unpacket_traits<HalfPacket>::size;
if (half_packet_size == packet_size) {
return loadPacket<AlignmentType>(i, j);
}
EIGEN_ALIGN_MAX Scalar data[half_packet_size];
for (Index k = 0; k < half_packet_size; k++) {
data[k] = operator()(i + k, j);
}
return pload<HalfPacket>(data);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename packet_traits<Scalar>::type Packet;
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
EIGEN_ALIGN_MAX Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename packet_traits<Scalar>::type>(data);
}
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
enum {
// We can use direct offsets iff the parent mapper supports them and we can compute the strides.
// TODO: we should also enable direct offsets for the Rhs case.
UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets
};
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
// Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
// this offset every time we attempt to access a coefficient.
if (UseDirectOffsets) {
Index stride = m_base_mapper.stride();
m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
}
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper(i, 0);
}
return m_base_mapper(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper(i, j);
}
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<Alignment>(i, 0);
}
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<Alignment>(i, j);
}
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
}
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
return;
}
m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
if (UseDirectOffsets) {
return LinearMapper(m_base_mapper, i, j);
}
return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
}
return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
return false;
}
private:
ParentMapper m_base_mapper;
const Index m_vert_offset;
const Index m_horiz_offset;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides)
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
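The UseDirectOffsets path above bakes the sub-mapper's row/column offsets into the raw buffer pointer once at construction instead of re-adding them on every access. A schematic of the idea (a hypothetical illustration, not the Eigen internals verbatim):

// With raw access to a contiguous buffer, a sub-view can advance the base pointer once...
struct RawView {
  const float* data;
  RawView(const float* base, long offset) : data(base + offset) {}
  float operator()(long i) const { return data[i]; }          // no per-access offset add
};
// ...whereas without raw access every read must recombine the offsets:
struct IndirectView {
  const float* data;
  long offset;
  float operator()(long i) const { return data[i + offset]; } // offset re-added on each call
};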

View File

@ -162,6 +162,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

View File

@ -306,6 +306,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -752,6 +753,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)

View File

@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
BlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
@ -250,6 +251,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
BlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

View File

@ -10,7 +10,6 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
namespace Eigen {
// This defines an interface that GPUDevice can take to use
@ -206,20 +205,45 @@ struct GpuDevice {
#endif
}
inline int getNumCudaMultiProcessors() const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
inline int maxCudaThreadsPerBlock() const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
inline int maxCudaThreadsPerMultiProcessor() const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
inline int sharedMemPerBlock() const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
inline int majorDeviceVersion() const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
// This function checks if the CUDA runtime recorded an error for the
@ -239,23 +263,29 @@ struct GpuDevice {
};
#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
#else
#define LAUNCH_CUDA_KERNEL(...) \
eigen_assert(false && "Cannot launch a kernel from another kernel");
#define LAUNCH_CUDA_KERNEL(kernel, ...) \
{ const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
#endif
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
#else
EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
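The device-query methods above become callable from device code, and the device-side expansion of the kernel-launch macro now forces instantiation of the kernel. At the user level a GPU evaluation is driven through a GpuDevice, roughly as follows (a hedged sketch; buffer allocation and copies are elided, and the names are illustrative):

Eigen::CudaStreamDevice stream;                 // wraps a cudaStream_t
Eigen::GpuDevice gpu_device(&stream);

float* d_in;                                    // device buffers allocated and filled elsewhere (elided)
float* d_out;
Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 64, 64);
Eigen::TensorMap<Eigen::Tensor<float, 2> > out(d_out, 64, 64);

out.device(gpu_device) = in * in.constant(2.0f);  // evaluated by the CUDA executor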

View File

@ -110,14 +110,14 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
return internal::arg_prod(Indices...);
}
Sizes() { }
EIGEN_DEVICE_FUNC Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex...) { }
explicit Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
}
#endif
@ -289,7 +289,7 @@ struct DSizes : array<DenseIndex, NumDims> {
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
(*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}};
(*this) = array<DenseIndex, NumDims>{firstDimension, otherDimensions...};
}
#else
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
@ -405,20 +405,20 @@ template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::si
template <typename Dims1, typename Dims2, size_t n, size_t m>
struct sizes_match_below_dim {
static inline bool run(Dims1&, Dims2&) {
static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return false;
}
};
template <typename Dims1, typename Dims2, size_t n>
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
static inline bool run(Dims1& dims1, Dims2& dims2) {
static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
}
};
template <typename Dims1, typename Dims2>
struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
static inline bool run(Dims1&, Dims2&) {
static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return true;
}
};
@ -427,7 +427,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
template <typename Dims1, typename Dims2>
bool dimensions_match(Dims1& dims1, Dims2& dims2) {
EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
}
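dimensions_match compares two dimension lists extent by extent (and is now usable from device code). A small sketch:

Eigen::Tensor<float, 3> a(2, 3, 4);
Eigen::Tensor<float, 3> b(2, 3, 4);
Eigen::Tensor<float, 3> c(2, 3, 5);
bool same = Eigen::dimensions_match(a.dimensions(), b.dimensions());   // true
bool diff = Eigen::dimensions_match(a.dimensions(), c.dimensions());   // false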

View File

@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
PacketAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -140,7 +141,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }
private:
TensorEvaluator<ArgType, Device> m_impl;

View File

@ -43,6 +43,7 @@ struct TensorEvaluator
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@ -148,6 +149,7 @@ struct TensorEvaluator<const Derived, Device>
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@ -207,6 +209,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC
@ -257,6 +260,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@ -312,6 +316,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
internal::functor_traits<BinaryOp>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@ -378,6 +383,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
internal::packet_traits<Scalar>::HasBlend,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
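The recurring change in this commit adds a RawAccess entry to each evaluator's flags enum: evaluators backed by a plain contiguous buffer (materialized tensors, eval-to and forced-eval buffers, the contraction result) report RawAccess = true and expose the buffer through data(), while expression evaluators report false. A schematic consumer of the flag (a hypothetical helper, not Eigen API):

// Use the raw buffer when the evaluator advertises it, otherwise fall back to coeff().
template <typename Evaluator, typename Index>
typename Evaluator::CoeffReturnType load(const Evaluator& eval, Index i) {
  return Evaluator::RawAccess ? eval.data()[i] : eval.coeff(i);
}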

View File

@ -156,14 +156,14 @@ template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
@ -215,7 +215,6 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
template <typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@ -228,9 +227,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
#else
eigen_assert(false && "Cannot launch a kernel from another kernel");
#endif
}
@ -238,7 +234,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
template<typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
#else
eigen_assert(false && "Cannot launch a kernel from another kernel");
#endif
}
#endif // __CUDACC__
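With the __CUDA_ARCH__ guards removed, the run() bodies above are compiled for both host and device, and the guard against nested kernel launches now lives in LAUNCH_CUDA_KERNEL itself. The launch geometry (elided in the hunks above) is derived from the device-query methods made EIGEN_DEVICE_FUNC earlier, along these lines (an approximate sketch, not the exact executor code):

const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
                       device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Enough blocks to cover the output, but no more than the device can keep resident.
const int num_blocks = numext::mini<int>(max_blocks, (size + block_size - 1) / block_size);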

View File

@ -135,6 +135,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {

View File

@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
};
RawAccess = true
};
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;
@ -53,7 +54,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
TensorStorage<Scalar, Dimensions, Options> m_storage;
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
@ -72,7 +73,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return coeff(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@ -104,7 +105,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return coeffRef(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@ -136,7 +137,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return this->operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@ -175,7 +176,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
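TensorFixedSize carries its dimensions in the type via Sizes<>, so the variadic accessors above resolve against compile-time extents; with RawAccess = true its storage is also exposed directly. A minimal usage sketch:

Eigen::TensorFixedSize<float, Eigen::Sizes<2, 3> > t;
t.setZero();
t(1, 2) = 4.0f;                 // variadic operator(), checked against the static rank
float* buf = t.data();          // contiguous storage, hence RawAccess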

View File

@ -92,6 +92,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
IsAligned = true,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
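TensorForcedEvalOp materializes its argument into a temporary buffer, which is why its evaluator can report RawAccess = true. At the API level the node is created by calling .eval() on an expression (a minimal sketch):

Eigen::Tensor<float, 2> a(3, 3), b(3, 3);
a.setRandom();
b.setRandom();
// .eval() inserts a TensorForcedEvalOp: the sum is computed once into a buffer
// that downstream expressions can then read through raw access.
Eigen::Tensor<float, 2> c = (a + b).eval() * a;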

View File

@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

Some files were not shown because too many files have changed in this diff