From 8b0d1eb0f7f1ab017a8e603f3887143df15662d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 1 Jan 2016 21:45:06 +0100 Subject: [PATCH 01/52] Fix numerous doxygen shortcomings, and workaround some clang -Wdocumentation warnings --- Eigen/src/Cholesky/LDLT.h | 4 +- Eigen/src/Cholesky/LLT.h | 9 +- Eigen/src/Core/Array.h | 24 +-- Eigen/src/Core/BandMatrix.h | 24 +-- Eigen/src/Core/Block.h | 69 +++++---- Eigen/src/Core/CwiseBinaryOp.h | 39 +++-- Eigen/src/Core/CwiseNullaryOp.h | 35 +++-- Eigen/src/Core/CwiseUnaryOp.h | 39 +++-- Eigen/src/Core/CwiseUnaryView.h | 27 ++-- Eigen/src/Core/IO.h | 2 +- Eigen/src/Core/Map.h | 45 +++--- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/Matrix.h | 86 +++++------ Eigen/src/Core/NestByValue.h | 25 ++-- Eigen/src/Core/NoAlias.h | 2 +- Eigen/src/Core/NumTraits.h | 2 +- Eigen/src/Core/PermutationMatrix.h | 65 ++++---- Eigen/src/Core/Product.h | 31 ++-- Eigen/src/Core/Ref.h | 140 +++++++++--------- Eigen/src/Core/Replicate.h | 31 ++-- Eigen/src/Core/ReturnByValue.h | 9 +- Eigen/src/Core/Reverse.h | 28 ++-- Eigen/src/Core/Stride.h | 4 +- Eigen/src/Core/Transpose.h | 27 ++-- Eigen/src/Core/Transpositions.h | 58 ++++---- Eigen/src/Core/VectorBlock.h | 25 ++-- Eigen/src/Core/VectorwiseOp.h | 8 +- Eigen/src/Core/util/XprHelper.h | 8 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 15 +- Eigen/src/Geometry/Hyperplane.h | 4 +- Eigen/src/Geometry/ParametrizedLine.h | 4 +- Eigen/src/Geometry/Rotation2D.h | 2 +- Eigen/src/Geometry/RotationBase.h | 8 +- Eigen/src/Geometry/Scaling.h | 2 +- Eigen/src/Geometry/Translation.h | 4 +- .../IncompleteCholesky.h | 2 +- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- Eigen/src/OrderingMethods/Amd.h | 7 +- Eigen/src/OrderingMethods/Ordering.h | 9 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- Eigen/src/QR/FullPivHouseholderQR.h | 2 +- Eigen/src/QR/HouseholderQR.h | 2 +- Eigen/src/SVD/BDCSVD.h | 5 +- Eigen/src/SVD/JacobiSVD.h | 4 +- Eigen/src/SparseCholesky/SimplicialCholesky.h | 10 +- Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 27 ++-- .../Eigen/src/SparseExtra/RandomSetter.h | 6 +- 48 files changed, 486 insertions(+), 501 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 6fcae01f7..c3cc3746c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -28,8 +28,8 @@ namespace internal { * * \brief Robust Cholesky decomposition of a matrix with pivoting * - * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. + * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 1f0091f3c..74cf5bfe1 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -22,8 +22,8 @@ template struct LLT_Traits; * * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features * - * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. 
+ * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite @@ -436,10 +436,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * \param bAndX represents both the right-hand side matrix b and result x. * - * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD. - * - * This version avoids a copy when the right hand side matrix b is not - * needed anymore. + * This version avoids a copy when the right hand side matrix b is not needed anymore. * * \sa LLT::solve(), MatrixBase::llt() */ diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index e38eda72c..7480d1e24 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -12,7 +12,16 @@ namespace Eigen { -/** \class Array +namespace internal { +template +struct traits > : traits > +{ + typedef ArrayXpr XprKind; + typedef ArrayBase > XprBase; +}; +} + +/** \class Array * \ingroup Core_Module * * \brief General-purpose arrays with easy API for coefficient-wise operations @@ -26,21 +35,12 @@ namespace Eigen { * * See documentation of class Matrix for detailed information on the template parameters * storage layout. - * + * * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN. * - * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy + * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy */ -namespace internal { -template -struct traits > : traits > -{ - typedef ArrayXpr XprKind; - typedef ArrayBase > XprBase; -}; -} - template class Array : public PlainObjectBase > diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 87c124fdf..4978c9140 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase * * \brief Represents a rectangular matrix with a banded storage * - * \param _Scalar Numeric type, i.e. float, double, int - * \param Rows Number of rows, or \b Dynamic - * \param Cols Number of columns, or \b Dynamic - * \param Supers Number of super diagonal - * \param Subs Number of sub diagonal - * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint - * The former controls \ref TopicStorageOrders "storage order", and defaults to - * column-major. The latter controls whether the matrix represents a selfadjoint - * matrix in which case either Supers of Subs have to be null. + * \tparam _Scalar Numeric type, i.e. float, double, int + * \tparam _Rows Number of rows, or \b Dynamic + * \tparam _Cols Number of columns, or \b Dynamic + * \tparam _Supers Number of super diagonal + * \tparam _Subs Number of sub diagonal + * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint + * The former controls \ref TopicStorageOrders "storage order", and defaults to + * column-major. The latter controls whether the matrix represents a selfadjoint + * matrix in which case either Supers of Subs have to be null. 
* * \sa class TridiagonalMatrix */ @@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase(Index,Index) and - * most of the time this is the only way it is used. - * - * However, if you want to directly maniputate block expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. - * - * Here is an example illustrating the dynamic case: - * \include class_Block.cpp - * Output: \verbinclude class_Block.out - * - * \note Even though this expression has dynamic size, in the case where \a XprType - * has fixed size, this expression inherits a fixed maximal size which means that evaluating - * it does not cause a dynamic memory allocation. - * - * Here is an example illustrating the fixed-size case: - * \include class_FixedBlock.cpp - * Output: \verbinclude class_FixedBlock.out - * - * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock - */ - namespace internal { template struct traits > : traits @@ -101,6 +66,40 @@ template class BlockImpl; +/** \class Block + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size block + * + * \tparam XprType the type of the expression in which we are taking a block + * \tparam BlockRows the number of rows of the block we are taking at compile time (optional) + * \tparam BlockCols the number of columns of the block we are taking at compile time (optional) + * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or + * to set of columns of a column major matrix (optional). The parameter allows to determine + * at compile time whether aligned access is possible on the block expression. + * + * This class represents an expression of either a fixed-size or dynamic-size block. It is the return + * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block(Index,Index) and + * most of the time this is the only way it is used. + * + * However, if you want to directly maniputate block expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. + * + * Here is an example illustrating the dynamic case: + * \include class_Block.cpp + * Output: \verbinclude class_Block.out + * + * \note Even though this expression has dynamic size, in the case where \a XprType + * has fixed size, this expression inherits a fixed maximal size which means that evaluating + * it does not cause a dynamic memory allocation. + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedBlock.cpp + * Output: \verbinclude class_FixedBlock.out + * + * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock + */ template class Block : public BlockImpl::StorageKind> { diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index e42c3031b..f94629e6d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseBinaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions - * - * \param BinaryOp template functor implementing the operator - * \param Lhs the type of the left-hand side - * \param Rhs the type of the right-hand side - * - * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. 
- * It is the return type of binary operators, by which we mean only those binary operators where - * both the left-hand side and the right-hand side are Eigen expressions. - * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseBinaryOp types explicitly. - * - * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -74,6 +54,25 @@ struct traits > template class CwiseBinaryOpImpl; +/** \class CwiseBinaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions + * + * \tparam BinaryOp template functor implementing the operator + * \tparam LhsType the type of the left-hand side + * \tparam RhsType the type of the right-hand side + * + * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. + * It is the return type of binary operators, by which we mean only those binary operators where + * both the left-hand side and the right-hand side are Eigen expressions. + * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseBinaryOp types explicitly. + * + * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp + */ template class CwiseBinaryOp : public CwiseBinaryOpImpl< diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 1dd109d3d..67b99653f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -12,24 +12,6 @@ namespace Eigen { -/** \class CwiseNullaryOp - * \ingroup Core_Module - * - * \brief Generic expression of a matrix where all coefficients are defined by a functor - * - * \param NullaryOp template functor implementing the operator - * \param PlainObjectType the underlying plain matrix/array type - * - * This class represents an expression of a generic nullary operator. - * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods, - * and most of the time this is the only way it is used. - * - * However, if you want to write a function returning such an expression, you - * will need to use this class. - * - * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr() - */ - namespace internal { template struct traits > : traits @@ -40,6 +22,23 @@ struct traits > : traits class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index da1d1992d..5a809cf21 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseUnaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise unary operator is applied to an expression - * - * \param UnaryOp template functor implementing the operator - * \param XprType the type of the expression to which we are applying the unary operator - * - * This class represents an expression where a unary operator is applied to an expression. - * It is the return type of all operations taking exactly 1 input expression, regardless of the - * presence of other inputs such as scalars. 
For example, the operator* in the expression 3*matrix - * is considered unary, because only the right-hand side is an expression, and its - * return type is a specialization of CwiseUnaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseUnaryOp types explicitly. - * - * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -52,6 +32,25 @@ struct traits > template class CwiseUnaryOpImpl; +/** \class CwiseUnaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise unary operator is applied to an expression + * + * \tparam UnaryOp template functor implementing the operator + * \tparam XprType the type of the expression to which we are applying the unary operator + * + * This class represents an expression where a unary operator is applied to an expression. + * It is the return type of all operations taking exactly 1 input expression, regardless of the + * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix + * is considered unary, because only the right-hand side is an expression, and its + * return type is a specialization of CwiseUnaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseUnaryOp types explicitly. + * + * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp + */ template class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 72244751e..5a7db2b19 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -12,20 +12,6 @@ namespace Eigen { -/** \class CwiseUnaryView - * \ingroup Core_Module - * - * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector - * - * \param ViewOp template functor implementing the view - * \param MatrixType the type of the matrix we are applying the unary operator - * - * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. - * It is the return type of real() and imag(), and most of the time this is the only way it is used. - * - * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp - */ - namespace internal { template struct traits > @@ -55,6 +41,19 @@ struct traits > template class CwiseUnaryViewImpl; +/** \class CwiseUnaryView + * \ingroup Core_Module + * + * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector + * + * \tparam ViewOp template functor implementing the view + * \tparam MatrixType the type of the matrix we are applying the unary operator + * + * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. + * It is the return type of real() and imag(), and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp + */ template class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind> { diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 9ae37bb5a..dfd9097cc 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -80,7 +80,7 @@ struct IOFormat * * \brief Pseudo expression providing matrix output with given format * - * \param ExpressionType the type of the object on which IO stream operations are performed + * \tparam ExpressionType the type of the object on which IO stream operations are performed * * This class represents an expression with stream operators controlled by a given IOFormat. * It is the return type of DenseBase::format() diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 3a8375da9..06d196702 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -13,6 +13,28 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits +{ + typedef traits TraitsBase; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + Alignment = int(MapOptions)&int(AlignedMask), + Flags0 = TraitsBase::Flags & (~NestByRefBit), + Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) + }; +private: + enum { Options }; // Expressions don't have Options +}; +} + /** \class Map * \ingroup Core_Module * @@ -63,29 +85,6 @@ namespace Eigen { * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ - -namespace internal { -template -struct traits > - : public traits -{ - typedef traits TraitsBase; - enum { - InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 - ? int(PlainObjectType::InnerStrideAtCompileTime) - : int(StrideType::InnerStrideAtCompileTime), - OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) - : int(StrideType::OuterStrideAtCompileTime), - Alignment = int(MapOptions)&int(AlignedMask), - Flags0 = TraitsBase::Flags & (~NestByRefBit), - Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) - }; -private: - enum { Options }; // Expressions don't have Options -}; -} - template class Map : public MapBase > { diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 48cf565fb..4d5e1acb8 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -26,7 +26,7 @@ long double abs(long double x) { return (fabsl(x)); } namespace internal { -/** \internal \struct global_math_functions_filtering_base +/** \internal \class global_math_functions_filtering_base * * What it does: * Defines a typedef 'type' as follows: diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index ce1b70d23..bcbbbf9ae 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -13,6 +13,45 @@ namespace Eigen { +namespace internal { +template +struct traits > +{ +private: + enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; + typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + enum { + row_major_bit = _Options&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, + max_size = is_dynamic_size_storage ? 
Dynamic : _MaxRows*_MaxCols, + default_alignment = compute_default_alignment<_Scalar,max_size>::value, + actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + required_alignment = unpacket_traits::alignment, + packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 + }; + +public: + typedef _Scalar Scalar; + typedef Dense StorageKind; + typedef Eigen::Index StorageIndex; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = _Rows, + ColsAtCompileTime = _Cols, + MaxRowsAtCompileTime = _MaxRows, + MaxColsAtCompileTime = _MaxCols, + Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, + Options = _Options, + InnerStrideAtCompileTime = 1, + OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, + + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase + EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, + Alignment = actual_alignment + }; +}; +} + /** \class Matrix * \ingroup Core_Module * @@ -98,7 +137,7 @@ namespace Eigen { * * * ABI and storage layout - * + * * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3. * * @@ -130,50 +169,11 @@ namespace Eigen { *
* <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
* Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES. - * - * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, - * \ref TopicStorageOrders + * + * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, + * \ref TopicStorageOrders */ -namespace internal { -template -struct traits > -{ -private: - enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; - typedef typename find_best_packet<_Scalar,size>::type PacketScalar; - enum { - row_major_bit = _Options&RowMajor ? RowMajorBit : 0, - is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, - max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, - default_alignment = compute_default_alignment<_Scalar,max_size>::value, - actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, - required_alignment = unpacket_traits::alignment, - packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 - }; - -public: - typedef _Scalar Scalar; - typedef Dense StorageKind; - typedef Eigen::Index StorageIndex; - typedef MatrixXpr XprKind; - enum { - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _MaxRows, - MaxColsAtCompileTime = _MaxCols, - Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - Options = _Options, - InnerStrideAtCompileTime = 1, - OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - - // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase - EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, - Alignment = actual_alignment - }; -}; -} - template class Matrix : public PlainObjectBase > diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 9aeaf8d18..13adf070e 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -13,25 +13,24 @@ namespace Eigen { -/** \class NestByValue - * \ingroup Core_Module - * - * \brief Expression which must be nested by value - * - * \param ExpressionType the type of the object of which we are requiring nesting-by-value - * - * This class is the return type of MatrixBase::nestByValue() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::nestByValue() - */ - namespace internal { template struct traits > : public traits {}; } +/** \class NestByValue + * \ingroup Core_Module + * + * \brief Expression which must be nested by value + * + * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value + * + * This class is the return type of MatrixBase::nestByValue() + * and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::nestByValue() + */ template class NestByValue : public internal::dense_xpr_base< NestByValue >::type { diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 0ade75255..ffb673cee 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Pseudo expression providing an operator = assuming no aliasing * - * \param ExpressionType the type of the object on which to do the lazy assignment + * \tparam ExpressionType the type of the object on which to do the lazy assignment * * This class represents an expression with special assignment operators * assuming no aliasing between the target expression and the source expression. diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 1d85dec72..d1aabd995 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen. * - * \param T the numeric type at hand + * \tparam T the numeric type at hand * * This class stores enums, typedefs and static methods giving information about a numeric type. * diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index aad4ccecd..b1fb455b9 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -13,12 +13,18 @@ namespace Eigen { +namespace internal { + +enum PermPermProduct_t {PermPermProduct}; + +} // end namespace internal + /** \class PermutationBase * \ingroup Core_Module * * \brief Base class for permutations * - * \param Derived the derived class + * \tparam Derived the derived class * * This class is the base class for all expressions representing a permutation matrix, * internally stored as a vector of integers. @@ -36,13 +42,6 @@ namespace Eigen { * * \sa class PermutationMatrix, class PermutationWrapper */ - -namespace internal { - -enum PermPermProduct_t {PermPermProduct}; - -} // end namespace internal - template class PermutationBase : public EigenBase { @@ -280,20 +279,6 @@ class PermutationBase : public EigenBase }; -/** \class PermutationMatrix - * \ingroup Core_Module - * - * \brief Permutation matrix - * - * \param SizeAtCompileTime the number of rows/cols, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * \param StorageIndex the integer type of the indices - * - * This class represents a permutation matrix, internally stored as a vector of integers. - * - * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix - */ - namespace internal { template struct traits > @@ -306,6 +291,19 @@ struct traits class PermutationMatrix : public PermutationBase > { @@ -482,18 +480,6 @@ class Map class TranspositionsWrapper; namespace internal { template @@ -513,6 +499,17 @@ struct traits > }; } +/** \class PermutationWrapper + * \ingroup Core_Module + * + * \brief Class to view a vector of integers as a permutation matrix + * + * \tparam _IndicesType the type of the vector of integer (can be any compatible expression) + * + * This class allows to view any vector expression of integers as a permutation matrix. 
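+ *
+ * A minimal usage sketch (the index vector and the 3x3 matrix are purely illustrative):
+ * \code
+ * VectorXi ind(3);
+ * ind << 2, 0, 1;
+ * PermutationWrapper<VectorXi> P(ind);
+ * MatrixXf A = MatrixXf::Random(3,3);
+ * MatrixXf B = P * A;   // applies to the rows of A the permutation described by ind
+ * \endcode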
+ * + * \sa class PermutationBase, class PermutationMatrix + */ template class PermutationWrapper : public PermutationBase > { diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index fdd2fed3f..8aa1de081 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -14,22 +14,6 @@ namespace Eigen { template class ProductImpl; -/** \class Product - * \ingroup Core_Module - * - * \brief Expression of the product of two arbitrary matrices or vectors - * - * \param Lhs the type of the left-hand side expression - * \param Rhs the type of the right-hand side expression - * - * This class represents an expression of the product of two arbitrary matrices. - * - * The other template parameters are: - * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct - * - */ - - namespace internal { // Determine the scalar of Product. This is normally the same as Lhs::Scalar times @@ -102,7 +86,20 @@ struct traits > } // end namespace internal - +/** \class Product + * \ingroup Core_Module + * + * \brief Expression of the product of two arbitrary matrices or vectors + * + * \tparam _Lhs the type of the left-hand side expression + * \tparam _Rhs the type of the right-hand side expression + * + * This class represents an expression of the product of two arbitrary matrices. + * + * The other template parameters are: + * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct + * + */ template class Product : public ProductImpl<_Lhs,_Rhs,Option, typename internal::product_promote_storage_type::StorageKind, diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 61de5ed17..ae414204e 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -12,76 +12,6 @@ namespace Eigen { -/** \class Ref - * \ingroup Core_Module - * - * \brief A matrix or vector expression mapping an existing expression - * - * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. - * The default is \c #Unaligned. - * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), - * but accepts a variable outer stride (leading dimension). - * This can be overridden by specifying strides. - * The type passed here must be a specialization of the Stride template, see examples below. - * - * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. - * A Ref<> object can represent either a const expression or a l-value: - * \code - * // in-out argument: - * void foo1(Ref x); - * - * // read-only const argument: - * void foo2(const Ref& x); - * \endcode - * - * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. - * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. - * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with - * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) - * can be greater than the number of rows. 
- * - * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. - * Here are some examples: - * \code - * MatrixXf A; - * VectorXf a; - * foo1(a.head()); // OK - * foo1(A.col()); // OK - * foo1(A.row()); // Compilation error because here innerstride!=1 - * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object - * foo2(A.row().transpose()); // The row is copied into a contiguous temporary - * foo2(2*a); // The expression is evaluated into a temporary - * foo2(A.col().segment(2,4)); // No temporary - * \endcode - * - * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. - * Here is an example accepting an innerstride!=1: - * \code - * // in-out argument: - * void foo3(Ref > x); - * foo3(A.row()); // OK - * \endcode - * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more - * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a - * template function, e.g.: - * \code - * // in the .h: - * void foo(const Ref& A); - * void foo(const Ref >& A); - * - * // in the .cpp: - * template void foo_impl(const TypeOfA& A) { - * ... // crazy code goes here - * } - * void foo(const Ref& A) { foo_impl(A); } - * void foo(const Ref >& A) { foo_impl(A); } - * \endcode - * - * - * \sa PlainObjectBase::Map(), \ref TopicStorageOrders - */ - namespace internal { template @@ -182,7 +112,75 @@ protected: StrideBase m_stride; }; - +/** \class Ref + * \ingroup Core_Module + * + * \brief A matrix or vector expression mapping an existing expression + * + * \tparam PlainObjectType the equivalent matrix type of the mapped data + * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * The default is \c #Unaligned. + * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), + * but accepts a variable outer stride (leading dimension). + * This can be overridden by specifying strides. + * The type passed here must be a specialization of the Stride template, see examples below. + * + * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. + * A Ref<> object can represent either a const expression or a l-value: + * \code + * // in-out argument: + * void foo1(Ref x); + * + * // read-only const argument: + * void foo2(const Ref& x); + * \endcode + * + * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. + * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. + * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with + * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) + * can be greater than the number of rows. 
+ * + * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. + * Here are some examples: + * \code + * MatrixXf A; + * VectorXf a; + * foo1(a.head()); // OK + * foo1(A.col()); // OK + * foo1(A.row()); // Compilation error because here innerstride!=1 + * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object + * foo2(A.row().transpose()); // The row is copied into a contiguous temporary + * foo2(2*a); // The expression is evaluated into a temporary + * foo2(A.col().segment(2,4)); // No temporary + * \endcode + * + * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. + * Here is an example accepting an innerstride!=1: + * \code + * // in-out argument: + * void foo3(Ref > x); + * foo3(A.row()); // OK + * \endcode + * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more + * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a + * template function, e.g.: + * \code + * // in the .h: + * void foo(const Ref& A); + * void foo(const Ref >& A); + * + * // in the .cpp: + * template void foo_impl(const TypeOfA& A) { + * ... // crazy code goes here + * } + * void foo(const Ref& A) { foo_impl(A); } + * void foo(const Ref >& A) { foo_impl(A); } + * \endcode + * + * + * \sa PlainObjectBase::Map(), \ref TopicStorageOrders + */ template class Ref : public RefBase > { diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index bec598310..9960ef884 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -12,21 +12,6 @@ namespace Eigen { -/** - * \class Replicate - * \ingroup Core_Module - * - * \brief Expression of the multiple replication of a matrix or vector - * - * \param MatrixType the type of the object we are replicating - * - * This class represents an expression of the multiple replication of a matrix or vector. - * It is the return type of DenseBase::replicate() and most of the time - * this is the only way it is used. - * - * \sa DenseBase::replicate() - */ - namespace internal { template struct traits > @@ -57,6 +42,22 @@ struct traits > }; } +/** + * \class Replicate + * \ingroup Core_Module + * + * \brief Expression of the multiple replication of a matrix or vector + * + * \tparam MatrixType the type of the object we are replicating + * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic. + * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic. + * + * This class represents an expression of the multiple replication of a matrix or vector. + * It is the return type of DenseBase::replicate() and most of the time + * this is the only way it is used. 
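+ *
+ * A minimal usage sketch (the replication factors 1 and 4 are arbitrary):
+ * \code
+ * Vector3f v(1,2,3);
+ * MatrixXf m = v.replicate(1,4);   // 3x4 matrix whose four columns all equal v
+ * \endcode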
+ * + * \sa DenseBase::replicate() + */ template class Replicate : public internal::dense_xpr_base< Replicate >::type { diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 7feb6e01c..c44b7673b 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -13,11 +13,6 @@ namespace Eigen { -/** \class ReturnByValue - * \ingroup Core_Module - * - */ - namespace internal { template @@ -48,6 +43,10 @@ struct nested_eval, n, PlainObject> } // end namespace internal +/** \class ReturnByValue + * \ingroup Core_Module + * + */ template class ReturnByValue : public internal::dense_xpr_base< ReturnByValue >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index d7c380c78..0640cda2a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -14,20 +14,6 @@ namespace Eigen { -/** \class Reverse - * \ingroup Core_Module - * - * \brief Expression of the reverse of a vector or matrix - * - * \param MatrixType the type of the object of which we are taking the reverse - * - * This class represents an expression of the reverse of a vector. - * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::reverse(), VectorwiseOp::reverse() - */ - namespace internal { template @@ -60,6 +46,20 @@ template struct reverse_packet_cond } // end namespace internal +/** \class Reverse + * \ingroup Core_Module + * + * \brief Expression of the reverse of a vector or matrix + * + * \tparam MatrixType the type of the object of which we are taking the reverse + * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections + * + * This class represents an expression of the reverse of a vector. + * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::reverse(), VectorwiseOp::reverse() + */ template class Reverse : public internal::dense_xpr_base< Reverse >::type { diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 9a2f4f1eb..513742f34 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -31,8 +31,8 @@ namespace Eigen { * arguments to the constructor. * * Indeed, this class takes two template parameters: - * \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. - * \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. + * \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. + * \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. * * Here is an example: * \include Map_general_stride.cpp diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 5b66eb5e1..f199d1086 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -13,20 +13,6 @@ namespace Eigen { -/** \class Transpose - * \ingroup Core_Module - * - * \brief Expression of the transpose of a matrix - * - * \param MatrixType the type of the object of which we are taking the transpose - * - * This class represents an expression of the transpose of a matrix. - * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() - * and most of the time this is the only way it is used. 
- * - * \sa MatrixBase::transpose(), MatrixBase::adjoint() - */ - namespace internal { template struct traits > : public traits @@ -50,6 +36,19 @@ struct traits > : public traits template class TransposeImpl; +/** \class Transpose + * \ingroup Core_Module + * + * \brief Expression of the transpose of a matrix + * + * \tparam MatrixType the type of the object of which we are taking the transpose + * + * This class represents an expression of the transpose of a matrix. + * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::transpose(), MatrixBase::adjoint() + */ template class Transpose : public TransposeImpl::StorageKind> { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 3b1c1815d..678ba3288 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -12,35 +12,6 @@ namespace Eigen { -/** \class Transpositions - * \ingroup Core_Module - * - * \brief Represents a sequence of transpositions (row/column interchange) - * - * \param SizeAtCompileTime the number of transpositions, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * - * This class represents a permutation transformation as a sequence of \em n transpositions - * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices. - * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges - * the rows \c i and \c indices[i] of the matrix \c M. - * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange. - * - * Compared to the class PermutationMatrix, such a sequence of transpositions is what is - * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place. - * - * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example: - * \code - * Transpositions tr; - * MatrixXf mat; - * mat = tr * mat; - * \endcode - * In this example, we detect that the matrix appears on both side, and so the transpositions - * are applied in-place without any temporary or extra copy. - * - * \sa class PermutationMatrix - */ - template class TranspositionsBase { @@ -154,6 +125,35 @@ struct traits class Transpositions : public TranspositionsBase > { diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index 216c568c4..d72fbf7e9 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -13,13 +13,23 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits::Flags & RowMajorBit ? 1 : Size, + traits::Flags & RowMajorBit ? Size : 1> > +{ +}; +} + /** \class VectorBlock * \ingroup Core_Module * * \brief Expression of a fixed-size or dynamic-size sub-vector * - * \param VectorType the type of the object in which we are taking a sub-vector - * \param Size size of the sub-vector we are taking at compile time (optional) + * \tparam VectorType the type of the object in which we are taking a sub-vector + * \tparam Size size of the sub-vector we are taking at compile time (optional) * * This class represents an expression of either a fixed-size or dynamic-size sub-vector. 
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and @@ -43,17 +53,6 @@ namespace Eigen { * * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index) */ - -namespace internal { -template -struct traits > - : public traits::Flags & RowMajorBit ? 1 : Size, - traits::Flags & RowMajorBit ? Size : 1> > -{ -}; -} - template class VectorBlock : public Block::Flags & RowMajorBit ? 1 : Size, diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 483f71909..95bcaa86f 100755 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -141,8 +141,8 @@ struct member_redux { * * \brief Pseudo expression providing partial reduction operations * - * \param ExpressionType the type of the object on which to do partial reductions - * \param Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam ExpressionType the type of the object on which to do partial reductions + * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) * * This class represents a pseudo expression with partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() @@ -187,11 +187,11 @@ template class VectorwiseOp protected: - /** \internal - * \returns the i-th subvector according to the \c Direction */ typedef typename internal::conditional::type SubVector; + /** \internal + * \returns the i-th subvector according to the \c Direction */ EIGEN_DEVICE_FUNC SubVector subVector(Index i) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 6b93d5221..8b71e2c62 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -390,9 +390,9 @@ struct transfer_constness * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes * many coefficient accesses in the nested expressions -- as is the case with matrix product for example. * - * \param T the type of the expression being nested. - * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. - * \param PlainObject the type of the temporary if needed. + * \tparam T the type of the expression being nested. + * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. + * \tparam PlainObject the type of the temporary if needed. */ template::type> struct nested_eval { @@ -575,7 +575,7 @@ template struct product_promote_storage_type struct product_promote_storage_type { typedef Dense ret; }; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. - * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. + * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. */ template struct plain_row_type diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index dc9cd0a1a..469ea5e4e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -375,8 +375,12 @@ namespace internal { * Performs a QR step on a tridiagonal symmetric matrix represented as a * pair of two vectors \a diag and \a subdiag. 
* - * \param matA the input selfadjoint matrix - * \param hCoeffs returned Householder coefficients + * \param diag the diagonal part of the input selfadjoint tridiagonal matrix + * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix + * \param start starting index of the submatrix to work on + * \param end last+1 index of the submatrix to work on + * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0 + * \param n size of the input matrix * * For compilation efficiency reasons, this procedure does not use eigen expression * for its arguments. @@ -467,9 +471,10 @@ namespace internal { * \brief Compute the eigendecomposition from a tridiagonal matrix * * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues - * \param[in] subdiag : The subdiagonal part of the matrix. - * \param[in,out] : On input, the maximum number of iterations, on output, the effective number of iterations. - * \param[out] eivec : The matrix to store the eigenvectors... if needed. allocated on input + * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition) + * \param[in] maxIterations : the maximum number of iterations + * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not + * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input. * \returns \c Success or \c NoConvergence */ template diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h index 2d076d7f8..cc89639b6 100644 --- a/Eigen/src/Geometry/Hyperplane.h +++ b/Eigen/src/Geometry/Hyperplane.h @@ -22,8 +22,8 @@ namespace Eigen { * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n. * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. * Notice that the dimension of the hyperplane is _AmbientDim-1. * * This class represents an hyperplane as the zero set of the implicit equation diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h index 93edd9148..fdcd69760 100644 --- a/Eigen/src/Geometry/ParametrizedLine.h +++ b/Eigen/src/Geometry/ParametrizedLine.h @@ -23,8 +23,8 @@ namespace Eigen { * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. */ template class ParametrizedLine diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 8b0ddcfb0..5ab0d5920 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a rotation/orientation in a 2 dimensional space. 
* - * \param _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _Scalar the scalar type, i.e., the type of the coefficients * * This class is equivalent to a single scalar representing a counter clock wise rotation * as a single angle in radian. It provides some additional features such as the automatic diff --git a/Eigen/src/Geometry/RotationBase.h b/Eigen/src/Geometry/RotationBase.h index b88661de6..fadfd9151 100644 --- a/Eigen/src/Geometry/RotationBase.h +++ b/Eigen/src/Geometry/RotationBase.h @@ -22,8 +22,8 @@ struct rotation_base_generic_product_selector; * * \brief Common base class for compact rotation representations * - * \param Derived is the derived type, i.e., a rotation type - * \param _Dim the dimension of the space + * \tparam Derived is the derived type, i.e., a rotation type + * \tparam _Dim the dimension of the space */ template class RotationBase @@ -164,8 +164,8 @@ namespace internal { * * Helper function to return an arbitrary rotation object to a rotation matrix. * - * \param Scalar the numeric type of the matrix coefficients - * \param Dim the dimension of the current space + * \tparam Scalar the numeric type of the matrix coefficients + * \tparam Dim the dimension of the current space * * It returns a Dim x Dim fixed size matrix. * diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index e94aa189f..643138199 100644 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a generic uniform scaling transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. * * This class represent a uniform scaling transformation. It is the return * type of Scaling(Scalar), and most of the time this is the only way it diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 7fda179cc..87ea445e9 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -18,8 +18,8 @@ namespace Eigen { * * \brief Represents a translation transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. - * \param _Dim the dimension of the space, can be a compile time value or Dynamic + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Dim the dimension of the space, can be a compile time value or Dynamic * * \note This class is not aimed to be used to store a translation transformation, * but rather to make easier the constructions and updates of Transform objects. diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 284e37f13..18e9d1466 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -21,7 +21,7 @@ namespace Eigen { * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 * - * \tparam _MatrixType The type of the sparse matrix. It is advised to give a row-oriented sparse matrix + * \tparam Scalar the scalar type of the input matrices * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. 
Default is AMDOrdering, diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 0c4d63923..1721213d6 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -29,7 +29,7 @@ template struct traits > * * \brief LU decomposition of a matrix with complete pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 50e920609..ab7797d2a 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -34,7 +34,7 @@ template struct traits > * * \brief LU decomposition of a matrix with partial pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index 323255e0a..d1d08ca57 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -84,8 +84,11 @@ StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const S /** \internal * \ingroup OrderingMethods_Module * Approximate minimum degree ordering algorithm. - * \returns the permutation P reducing the fill-in of the input matrix \a C - * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional. + * + * \param[in] C the input selfadjoint matrix stored in compressed column major format. + * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C + * + * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries. * On exit the values of C are destroyed */ template void minimum_degree_ordering(SparseMatrix& C, PermutationMatrix& perm) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 25792a828..7ea9b14d7 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -19,20 +19,21 @@ namespace internal { /** \internal * \ingroup OrderingMethods_Module - * \returns the symmetric pattern A^T+A from the input matrix A. + * \param[in] A the input non-symmetric matrix + * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A. 
* FIXME: The values should not be considered here */ template -void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat) +void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) { MatrixType C; - C = mat.transpose(); // NOTE: Could be costly + C = A.transpose(); // NOTE: Could be costly for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) it.valueRef() = 0.0; } - symmat = C + mat; + symmat = C + A; } } diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 172e4a89f..d8bd4b950 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -28,7 +28,7 @@ template struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R * such that diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 64fe6b7b8..32a10f3fe 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -37,7 +37,7 @@ struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R * such that diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 1eb861025..03bc8e6cd 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -21,7 +21,7 @@ namespace Eigen { * * \brief Householder QR decomposition of a matrix * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R * such that diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 896246e46..3552c87bf 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -47,9 +47,8 @@ struct traits > * * \brief class Bidiagonal Divide and Conquer SVD * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * We plan to have a very similar interface to JacobiSVD on this class. - * It should be used to speed up the calcul of SVD for big matrices. 
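// The three Householder QR variants whose documentation is fixed above expose
// the same compute()/solve() interface; a small least-squares sketch with the
// column-pivoting variant (matrix sizes are made up for illustration):
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 3);     // over-determined system
  Eigen::VectorXd b = Eigen::VectorXd::Random(6);

  Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr(A);
  Eigen::VectorXd x = qr.solve(b);                        // minimizes |A*x - b|

  std::cout << "rank(A) = " << qr.rank()
            << ", residual = " << (A * x - b).norm() << std::endl;
}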
+ * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * */ template class BDCSVD : public SVDBase > diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 59c965e15..bf5ff48c3 100755 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -449,8 +449,8 @@ struct traits > * * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally + * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * \tparam QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally * for the R-SVD step for non-square matrices. See discussion of possible values below. * * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1343eb15c..2907f6529 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -39,18 +39,16 @@ namespace internal { } // end namespace internal /** \ingroup SparseCholesky_Module - * \brief A direct sparse Cholesky factorizations + * \brief A base class for direct sparse Cholesky factorizations * - * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are - * selfadjoint and positive definite. The factorization allows for solving A.X = B where + * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are + * selfadjoint and positive definite. These factorizations allow for solving A.X = B where * X and B can be either dense or sparse. * * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization * such that the factorized matrix is P A P^-1. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam Derived the type of the derived class, that is the actual factorization type. 
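// A short sketch of how the derived factorizations (SimplicialLLT/SimplicialLDLT)
// built on this base class are typically used; the 1-D Laplacian below is an
// assumed toy SPD matrix:
#include <Eigen/Sparse>
#include <iostream>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 8;
  SpMat A(n, n);
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0)     A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  Eigen::SimplicialLDLT<SpMat> ldlt(A);        // analyzePattern() + factorize()
  if (ldlt.info() != Eigen::Success) return 1;
  Eigen::VectorXd x = ldlt.solve(b);
  std::cout << "residual = " << (A * x - b).norm() << std::endl;
}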
* */ template diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index e71a13b89..8c1b3e8bc 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -14,22 +14,21 @@ namespace Eigen { namespace internal { -/** - * \brief Performs numeric block updates from a given supernode to a single column - * - * \param segsize Size of the segment (and blocks ) to use for updates - * \param[in,out] dense Packed values of the original matrix - * \param tempv temporary vector to use for updates - * \param lusup array containing the supernodes - * \param lda Leading dimension in the supernode - * \param nrow Number of rows in the rectangular part of the supernode - * \param lsub compressed row subscripts of supernodes - * \param lptr pointer to the first column of the current supernode in lsub - * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode - * \return 0 on success - */ template struct LU_kernel_bmod { + /** \internal + * \brief Performs numeric block updates from a given supernode to a single column + * + * \param segsize Size of the segment (and blocks ) to use for updates + * \param[in,out] dense Packed values of the original matrix + * \param tempv temporary vector to use for updates + * \param lusup array containing the supernodes + * \param lda Leading dimension in the supernode + * \param nrow Number of rows in the rectangular part of the supernode + * \param lsub compressed row subscripts of supernodes + * \param lptr pointer to the first column of the current supernode in lsub + * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode + */ template static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros); diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h index eb3e17330..ee97299af 100644 --- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h +++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h @@ -95,10 +95,10 @@ template struct GoogleSparseHashMapTraits * * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access * - * \param SparseMatrixType the type of the sparse matrix we are updating - * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage. + * \tparam SparseMatrixType the type of the sparse matrix we are updating + * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage. * Its default value depends on the system. - * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object + * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object * as a power of two exponent. 
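// A usage sketch for the RandomSetter documented here, assuming the default
// map-based temporary storage: writes may arrive in any order, and they are
// committed to the wrapped sparse matrix when the setter goes out of scope.
#include <Eigen/SparseCore>
#include <unsupported/Eigen/SparseExtra>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> m(100, 100);
  {
    Eigen::RandomSetter<Eigen::SparseMatrix<double> > setter(m);
    setter(13, 7)  = 1.0;     // random-access insertion
    setter(7, 13)  = 2.0;
    setter(13, 7) += 3.0;     // in-place update of an existing entry
  }                           // <- temporary storage flushed into 'm' here
  std::cout << "nonzeros: " << m.nonZeros() << std::endl;   // 2
}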
* * This class temporarily represents a sparse matrix object using a generic map implementation allowing for From 715f6f049fb449df57484a36711d9fde8a5d1be0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 3 Jan 2016 21:56:30 +0100 Subject: [PATCH 02/52] Improve inline documentation of SparseCompressedBase and its derived classes --- Eigen/src/Core/Ref.h | 1 + Eigen/src/SparseCore/SparseCompressedBase.h | 10 +++++ Eigen/src/SparseCore/SparseMap.h | 41 ++++++++++++++++++++- Eigen/src/SparseCore/SparseRef.h | 25 +++++++++---- 4 files changed, 69 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index ae414204e..6e94181f3 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -207,6 +207,7 @@ template class Ref EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else + /** Implicit constructor from any dense expression */ template inline Ref(DenseBase& expr) #endif diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index c223e4f42..f78d7c24d 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -22,6 +22,16 @@ struct traits > : traits } // end namespace internal +/** \ingroup SparseCore_Module + * \class SparseCompressedBase + * \brief Common base class for sparse [compressed]-{row|column}-storage format. + * + * This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as: + * - SparseMatrix + * - Ref + * - Map + * + */ template class SparseCompressedBase : public SparseMatrixBase diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 36c09ab0c..eb241c3e2 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -37,11 +37,15 @@ struct traits, Options, St }; } // end namespace internal - + template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class SparseMapBase; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseCompressedBase @@ -71,22 +75,33 @@ class SparseMapBase public: + /** \copydoc SparseMatrixBase::rows() */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + /** \copydoc SparseMatrixBase::cols() */ inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + /** \copydoc SparseMatrixBase::innerSize() */ inline Index innerSize() const { return m_innerSize; } + /** \copydoc SparseMatrixBase::outerSize() */ inline Index outerSize() const { return m_outerSize; } + /** \copydoc SparseCompressedBase::nonZeros */ inline Index nonZeros() const { return m_zero_nnz[1]; } + /** \copydoc SparseCompressedBase::isCompressed */ bool isCompressed() const { return m_innerNonZeros==0; } //---------------------------------------- // direct access interface + /** \copydoc SparseMatrix::valuePtr */ inline const Scalar* valuePtr() const { return m_values; } + /** \copydoc SparseMatrix::innerIndexPtr */ inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + /** \copydoc SparseMatrix::outerIndexPtr */ inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + /** \copydoc SparseMatrix::innerNonZeroPtr */ inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } //---------------------------------------- + /** \copydoc SparseMatrix::coeff */ inline Scalar coeff(Index row, Index col) const { const Index outer = IsRowMajor ? row : col; @@ -125,6 +140,10 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for writable Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseMapBase @@ -185,9 +204,23 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * + * \brief Specialization of class Map for SparseMatrix-like storage. + * + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. + * + * \sa class Map, class SparseMatrix, class Ref + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > +#else +template +class Map + : public SparseMapBase +#endif { public: typedef SparseMapBase Base; @@ -196,6 +229,12 @@ class Map, Options, StrideType> public: + /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, + * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. + * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * + * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages". + */ inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index 605ca42ba..a558230e7 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -108,20 +108,25 @@ protected: /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse matrix expression referencing an existing sparse expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. * \tparam Options specifies whether the a standard compressed format is required \c Options is \c #StandardCompressedFormat, or \c 0. * The default is \c 0. 
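// A concrete sketch of the Map<SparseMatrix> constructor documented above: it
// wraps user-owned compressed-column buffers without copying them. The 3x3
// matrix [[1,0,2],[0,3,0],[0,0,4]] and its arrays are made up for the example.
#include <Eigen/SparseCore>
#include <iostream>

int main()
{
  int    outer[4]  = {0, 1, 2, 4};          // column start offsets
  int    inner[4]  = {0, 1, 0, 2};          // row index of each stored value
  double values[4] = {1.0, 3.0, 2.0, 4.0};  // stored values, column by column

  Eigen::Map<Eigen::SparseMatrix<double> > m(3, 3, 4, outer, inner, values);
  std::cout << m.coeff(0, 2) << " " << m.nonZeros() << std::endl;   // 2 4
}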
- * \tparam StrideType Only used for dense Ref * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase // yes, that's weird to use Derived here, but that works! +#endif { typedef SparseMatrix PlainObjectType; typedef internal::traits Traits; @@ -155,6 +160,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */ template inline Ref(SparseCompressedBase& expr) #endif @@ -225,19 +231,23 @@ class Ref, Options, StrideType /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse vector expression referencing an existing sparse vector expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data - * \tparam Options Not used for SparseVector. - * \tparam StrideType Only used for dense Ref + * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector. * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase +#endif { typedef SparseVector PlainObjectType; typedef internal::traits Traits; @@ -259,6 +269,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any 1D sparse vector expression */ template inline Ref(SparseCompressedBase& expr) #endif From 515dee0bafc69dd79cfbf9c2831e707d2214390f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 16:29:26 -0800 Subject: [PATCH 03/52] Added a 'divup' util to compute the floor of the quotient of two integers --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index f28a9699d..d6ad65070 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,6 +24,11 @@ const T2& choose(Cond, const T1&, const T2& second) { return second; } +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return (x + y - 1) / y; +} + template struct max_n_1 { static const size_t size = n; }; From cfff40b1d48999d354be34c984c83f7d0f1ca5cb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 17:25:00 -0800 Subject: [PATCH 04/52] Improved the performance of reductions on CUDA devices --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 31 ++++++ .../CXX11/src/Tensor/TensorReductionCuda.h | 101 +++++++++++++++++- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c30980a49..2ecdccdfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -337,6 +337,16 @@ struct FullReducer { #endif +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + assert(false && "Not implemented"); + } +}; + // 
Default outer reducer template struct OuterReducer { @@ -352,6 +362,9 @@ struct OuterReducer { template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif @@ -516,6 +529,23 @@ struct TensorEvaluator, Device> // Attempt to use an optimized reduction. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reducedDims[i]; + } else { + reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return false; + } + bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -615,6 +645,7 @@ struct TensorEvaluator, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 8e250867c..b046607b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -131,7 +131,102 @@ struct FullReducer { } }; -#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +extern __shared__ float temp[]; + +template +__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + float reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index 
col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { + const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + + temp[threadIdx.x] = reduced_val; + + __syncthreads(); + const int warp_id = threadIdx.x & 31; + if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); + if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); + if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); + if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); + if (warp_id < 1) { + reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); + atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + } + } + + __syncthreads(); + } +} + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same::value; + + template + static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce floats on a gpu device"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = 32; + + LAUNCH_CUDA_KERNEL((InnerReductionKernel), + num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + } +}; + template @@ -145,7 +240,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. 
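// For reference, what the InnerReducer specialization above computes, written
// as a plain CPU loop (a sketch of the semantics only, not of the CUDA code):
// for a row-major [P x R] input, each output coefficient reduces one
// contiguous run of R inner values. The block/grid geometry of the kernel is
// derived with the new divup() helper, which rounds the integer quotient up.
#include <vector>
#include <cstdio>

int main()
{
  const int P = 3, R = 4;                        // preserved x reduced extents
  std::vector<float> in(P * R, 1.0f);            // dummy input data

  std::vector<float> out(P, 0.0f);               // reducer.initialize()
  for (int p = 0; p < P; ++p)
    for (int r = 0; r < R; ++r)
      out[p] += in[p * R + r];                   // reducer.reduce(val, &acc)

  std::printf("%g %g %g\n", out[0], out[1], out[2]);   // 4 4 4
}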
- const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; @@ -189,8 +284,6 @@ struct OuterReducer { } }; -#undef DIVUP - #endif From 54bf582303407387f9a34d99c8993aa3255274ec Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 6 Jan 2016 11:59:24 +0100 Subject: [PATCH 05/52] bug #1143: Work-around gcc bug --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 6238676e5..70c987afa 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -516,7 +516,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ Col [col].start = p [col] ; Col [col].length = p [col+1] - p [col] ; - if (Col [col].length < 0) + if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; From ee738321aa6c13f327821f4a4b1aaa4ead635687 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 6 Jan 2016 14:49:40 +0100 Subject: [PATCH 06/52] rm remaining debug code --- Eigen/src/Core/util/Constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9e6816021..5f71ba3df 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum Foo { +enum { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */ From 213459d81850f98f3822624ae84c1f420f12092c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:47:45 -0800 Subject: [PATCH 07/52] Optimized the performance of broadcasting of scalars. --- .../CXX11/src/Tensor/TensorBroadcasting.h | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc64959e1..0c95e5c0b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -46,6 +46,21 @@ struct nested, 1, typename eval type; }; +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + } // end namespace internal @@ -103,7 +118,7 @@ struct TensorEvaluator, Device> // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. 
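// A hedged sketch of the work-around mentioned in the comment above, using the
// unsupported CXX11 Tensor API: a rank-0 reduction result is first reshaped to
// a one-element tensor and then broadcast (sizes are made up for the example).
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(1.5f);

  Eigen::array<Eigen::Index, 2> one_by_one = {{1, 1}};
  Eigen::array<Eigen::Index, 2> bcast      = {{2, 3}};

  // t.sum() is rank-0; reshape it to 1x1 before broadcasting it back to 2x3.
  Eigen::Tensor<float, 2> total = t.sum().reshape(one_by_one).broadcast(bcast);

  std::cout << total(0, 0) << " " << total(1, 2) << std::endl;   // 9 9
}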
EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -143,6 +158,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { @@ -214,6 +233,10 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { From 0cb2ca5de216f258cd3106cf18d4aada9064ce13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:50:28 -0800 Subject: [PATCH 08/52] Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0c95e5c0b..9ec20a99e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -234,7 +234,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { if (internal::is_input_scalar::type>::value) { - return m_impl.coeff(0); + return internal::pset1(m_impl.coeff(0)); } if (static_cast(Layout) == static_cast(ColMajor)) { From 6639b7d6e86ef36f6f78cf51e36efa5a004154eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Jan 2016 18:45:19 -0800 Subject: [PATCH 09/52] Removed a couple of partial specialization that confuse nvcc and result in errors such as this: error: more than one partial specialization matches the template argument list of class "Eigen::internal::get<3, Eigen::internal::numeric_list>" "Eigen::internal::get>" "Eigen::internal::get>" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 3f149c6a3..4d99f786c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -109,11 +109,9 @@ template struct get; template struct get> : get> {}; template struct get<0, type_list> { typedef a type; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; template struct get<0, numeric_list> { constexpr static T value = a; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ From f9d71a172992cfda5e2733f9f4a6e12a14b9ed73 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 8 Jan 2016 22:24:45 +0100 Subject: [PATCH 10/52] extend matlab conversion table --- doc/AsciiQuickReference.txt | 110 +++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git 
a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index b5bdfa1f4..c604e575c 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -32,17 +32,19 @@ A << 1, 2, 3, // Initialize A. The elements can also be B << A, A, A; // B is three horizontally stacked A's. A.fill(10); // Fill A with all 10's. -// Eigen // Matlab -MatrixXd::Identity(rows,cols) // eye(rows,cols) -C.setIdentity(rows,cols) // C = eye(rows,cols) -MatrixXd::Zero(rows,cols) // zeros(rows,cols) -C.setZero(rows,cols) // C = ones(rows,cols) -MatrixXd::Ones(rows,cols) // ones(rows,cols) -C.setOnes(rows,cols) // C = ones(rows,cols) -MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). -C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 -VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' -v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +// Eigen // Matlab +MatrixXd::Identity(rows,cols) // eye(rows,cols) +C.setIdentity(rows,cols) // C = eye(rows,cols) +MatrixXd::Zero(rows,cols) // zeros(rows,cols) +C.setZero(rows,cols) // C = ones(rows,cols) +MatrixXd::Ones(rows,cols) // ones(rows,cols) +C.setOnes(rows,cols) // C = ones(rows,cols) +MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). +C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 +VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' +v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi + low,low+step*(size-1)) // Matrix slicing and blocks. All expressions listed here are read/write. @@ -85,13 +87,15 @@ P.bottomRightCorner() // P(end-rows+1:end, end-cols+1:end) R.row(i) = P.col(j); // R(i, :) = P(:, i) R.col(j1).swap(mat1.col(j2)); // R(:, [j1 j2]) = R(:, [j2, j1]) -// Views, transpose, etc; all read-write except for .adjoint(). +// Views, transpose, etc; // Eigen // Matlab R.adjoint() // R' -R.transpose() // R.' or conj(R') -R.diagonal() // diag(R) +R.transpose() // R.' or conj(R') // Read-write +R.diagonal() // diag(R) // Read-write x.asDiagonal() // diag(x) -R.transpose().colwise().reverse(); // rot90(R) +R.transpose().colwise().reverse() // rot90(R) // Read-write +R.replicate(i,j) // repmat(P,i,j) + // All the same as Matlab, but matlab doesn't have *= style operators. // Matrix-vector. Matrix-matrix. Matrix-scalar. @@ -103,37 +107,39 @@ a *= M; R = P + Q; R = P/s; R -= Q; R /= s; // Vectorized operations on each element independently -// Eigen // Matlab -R = P.cwiseProduct(Q); // R = P .* Q -R = P.array() * s.array();// R = P .* s -R = P.cwiseQuotient(Q); // R = P ./ Q -R = P.array() / Q.array();// R = P ./ Q -R = P.array() + s.array();// R = P + s -R = P.array() - s.array();// R = P - s -R.array() += s; // R = R + s -R.array() -= s; // R = R - s -R.array() < Q.array(); // R < Q -R.array() <= Q.array(); // R <= Q -R.cwiseInverse(); // 1 ./ P -R.array().inverse(); // 1 ./ P -R.array().sin() // sin(P) -R.array().cos() // cos(P) -R.array().pow(s) // P .^ s -R.array().square() // P .^ 2 -R.array().cube() // P .^ 3 -R.cwiseSqrt() // sqrt(P) -R.array().sqrt() // sqrt(P) -R.array().exp() // exp(P) -R.array().log() // log(P) -R.cwiseMax(P) // max(R, P) -R.array().max(P.array()) // max(R, P) -R.cwiseMin(P) // min(R, P) -R.array().min(P.array()) // min(R, P) -R.cwiseAbs() // abs(P) -R.array().abs() // abs(P) -R.cwiseAbs2() // abs(P.^2) -R.array().abs2() // abs(P.^2) -(R.array() < s).select(P,Q); // (R < s ? 
P : Q) +// Eigen // Matlab +R = P.cwiseProduct(Q); // R = P .* Q +R = P.array() * s.array(); // R = P .* s +R = P.cwiseQuotient(Q); // R = P ./ Q +R = P.array() / Q.array(); // R = P ./ Q +R = P.array() + s.array(); // R = P + s +R = P.array() - s.array(); // R = P - s +R.array() += s; // R = R + s +R.array() -= s; // R = R - s +R.array() < Q.array(); // R < Q +R.array() <= Q.array(); // R <= Q +R.cwiseInverse(); // 1 ./ P +R.array().inverse(); // 1 ./ P +R.array().sin() // sin(P) +R.array().cos() // cos(P) +R.array().pow(s) // P .^ s +R.array().square() // P .^ 2 +R.array().cube() // P .^ 3 +R.cwiseSqrt() // sqrt(P) +R.array().sqrt() // sqrt(P) +R.array().exp() // exp(P) +R.array().log() // log(P) +R.cwiseMax(P) // max(R, P) +R.array().max(P.array()) // max(R, P) +R.cwiseMin(P) // min(R, P) +R.array().min(P.array()) // min(R, P) +R.cwiseAbs() // abs(P) +R.array().abs() // abs(P) +R.cwiseAbs2() // abs(P.^2) +R.array().abs2() // abs(P.^2) +(R.array() < s).select(P,Q ); // (R < s ? P : Q) +R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) + // Reductions. int r, c; @@ -164,12 +170,12 @@ x.dot(y) // dot(x, y) x.cross(y) // cross(x, y) Requires #include //// Type conversion -// Eigen // Matlab -A.cast(); // double(A) -A.cast(); // single(A) -A.cast(); // int32(A) -A.real(); // real(A) -A.imag(); // imag(A) +// Eigen // Matlab +A.cast(); // double(A) +A.cast(); // single(A) +A.cast(); // int32(A) +A.real(); // real(A) +A.imag(); // imag(A) // if the original type equals destination type, no work is done // Note that for most operations Eigen requires all operands to have the same type: From 53749ff4152191d2f1bd56090a14f6474fe059c2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 13:53:40 -0800 Subject: [PATCH 11/52] Prevent nvcc from miscompiling the cuda metakernel. Unfortunately this reintroduces some compulation warnings but it's much better than having to deal with random assertion failures. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +----- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 ++++------------ .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4d7570077..af140a68b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,14 +238,10 @@ struct GpuDevice { }; -#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else -#define LAUNCH_CUDA_KERNEL(...) \ - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif + // FIXME: Should be device and kernel specific. 
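// Two of the Matlab equivalences added to the quick reference above, checked
// on concrete values (the sizes and data are made up for the example):
#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Matlab: v = 2:3:11   ->   [2 5 8 11]
  int low = 2, step = 3, hi = 11;
  int size = (hi - low) / step + 1;
  Eigen::VectorXi v = Eigen::VectorXi::LinSpaced(size, low, low + step * (size - 1));

  // Matlab: R(Q==0) = P(Q==0), i.e. keep R where Q is non-zero
  Eigen::MatrixXd Q = Eigen::MatrixXd::Identity(3, 3);
  Eigen::MatrixXd P = Eigen::MatrixXd::Constant(3, 3, 7.0);
  Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3, 3);
  R = (Q.array() == 0).select(P, R);

  std::cout << v.transpose() << "\n" << R << std::endl;
}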
#ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c28078882..d93e1de1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,9 +213,8 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -228,17 +227,13 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run( LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } #endif // __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index b046607b9..558d0c83d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,11 +115,11 @@ struct FullReducer { internal::is_same::value; template - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { assert(false && "Should only be called on floats"); } - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); From 3358dfd5dd6c98fda9be133d1c4958ac00221006 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:28:53 -0800 Subject: [PATCH 12/52] Reworked the dispatch of optimized cuda reduction kernels to workaround a nvcc bug that prevented the code from compiling in optimized mode in some cases --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 12 +++++------- 1 file changed, 5 
insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2ecdccdfd..1c721de6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -537,12 +536,12 @@ struct TensorEvaluator, Device> reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; } } - if (internal::InnerReducer::HasOptimizedImplementation && + if (internal::InnerReducer::HasOptimizedImplementation && (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } @@ -554,16 +553,15 @@ struct TensorEvaluator, Device> preserving_inner_dims &= m_reducedDims[i]; } } - if (internal::OuterReducer::HasOptimizedImplementation && + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } } -#endif return true; } From d726e864ac91a1b5d5128972ee9e169966f2fa51 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:38:14 -0800 Subject: [PATCH 13/52] Made it possible to use array of size 0 on CUDA devices --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index ab9c2ec3e..456b34d0b 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -132,13 +132,13 @@ template class array { return *static_cast(NULL); } - static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - array(std::initializer_list l) { + EIGEN_DEVICE_FUNC array(std::initializer_list l) { eigen_assert(l.size() == 0); } #endif From e76904af1b0f6aef80b68492534404f534ce0240 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:50:57 -0800 Subject: [PATCH 14/52] 
Simplified the dispatch code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c721de6d..fd7064459 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -342,7 +342,7 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -352,7 +352,7 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -527,6 +527,7 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. +#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -562,6 +563,7 @@ struct TensorEvaluator, Device> return false; } } +#endif return true; } From 8b9dc9f0dfb44c2c4ad6d02fb88ecce0986cd154 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 9 Jan 2016 08:30:38 +0100 Subject: [PATCH 15/52] bug #1144: fix regression in x=y+A*x (aliasing), and move evaluator_traits::AssumeAliasing to evaluator_assume_aliasing. --- Eigen/src/Core/AssignEvaluator.h | 14 ++++---- Eigen/src/Core/CoreEvaluators.h | 8 ++--- Eigen/src/Core/ProductEvaluators.h | 25 ++++++-------- Eigen/src/Core/SelfAdjointView.h | 2 -- Eigen/src/Core/TriangularMatrix.h | 4 --- Eigen/src/Geometry/Homogeneous.h | 1 - Eigen/src/SparseCore/SparseSelfAdjointView.h | 2 -- Eigen/src/SparseQR/SparseQR.h | 1 - test/product.h | 35 +++++++++++++++----- test/product_large.cpp | 23 +++++++++++++ test/product_notemporary.cpp | 6 ++++ 11 files changed, 77 insertions(+), 44 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 9dfffbcc4..f6632de69 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -682,9 +682,9 @@ template< typename DstXprType, typename SrcXprType, typename Functor, struct Assignment; -// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition. -// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated. -// So this intermediate function removes everything related to AssumeAliasing such that Assignment +// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition. +// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated. +// So this intermediate function removes everything related to "assume-aliasing" such that Assignment // does not has to bother about these annoying details. 
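// The user-visible consequence of the "assume-aliasing" dispatch described in
// the comment above, as a small sketch (sizes are made up): a product appearing
// on the right-hand side is evaluated into a temporary by default, so the
// destination may legally appear on both sides, exactly the x = y + A*x case of
// bug #1144 exercised by the new tests further below.
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::VectorXd y = Eigen::VectorXd::Random(4);
  Eigen::VectorXd x = Eigen::VectorXd::Random(4);
  Eigen::VectorXd z = x;                        // keep a copy of the old x

  x = y + A * x;                                // safe: A*x goes through a temporary
  std::cout << (x - (y + A * z)).norm() << std::endl;   // ~0

  Eigen::VectorXd w(4);
  w.noalias() = y + A * z;                      // no temporary when aliasing is ruled out
}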
template @@ -698,21 +698,21 @@ EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) call_assignment(dst, src, internal::assign_op()); } -// Deal with AssumeAliasing +// Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==1, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==0, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } -// by-pass AssumeAliasing +// by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index f97dc33de..8bd73b814 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -63,10 +63,6 @@ struct evaluator_traits_base // by default, get evaluator kind and shape from storage typedef typename storage_kind_to_evaluator_kind::StorageKind>::Kind Kind; typedef typename storage_kind_to_shape::StorageKind>::Shape Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; // Default evaluator traits @@ -75,6 +71,10 @@ struct evaluator_traits : public evaluator_traits_base { }; +template::Shape > +struct evaluator_assume_aliasing { + static const bool value = false; +}; // By default, we assume a unary expression: template diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 794038a2a..b2a0a4b4f 100755 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -38,10 +38,9 @@ struct evaluator > // Catch scalar * ( A * B ) and transform it to (A*scalar) * B // TODO we should apply that rule only if that's really helpful template -struct evaluator_traits, const Product > > - : evaluator_traits_base, const Product > > +struct evaluator_assume_aliasing, const Product > > { - enum { AssumeAliasing = 1 }; + static const bool value = true; }; template struct evaluator, const Product > > @@ -81,17 +80,8 @@ template< typename Lhs, typename Rhs, struct generic_product_impl; template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 1 }; -}; - -template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 0 }; +struct evaluator_assume_aliasing > { + static const bool value = true; }; // This is the default evaluator implementation for products: @@ -189,6 +179,13 @@ struct Assignment" expression to save one temporary // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct +// TODO enable it for "Dense ?= xpr - Product<>" as well. 
+ +template +struct evaluator_assume_aliasing, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; template struct assignment_from_xpr_plus_product diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 87e87ab3a..e709eb213 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -203,8 +203,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 7d6a97848..27845e89c 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -704,10 +704,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef typename glue_shapes::Shape, TriangularShape>::type Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h index 367fd3930..cd52b5470 100644 --- a/Eigen/src/Geometry/Homogeneous.h +++ b/Eigen/src/Geometry/Homogeneous.h @@ -304,7 +304,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef HomogeneousShape Shape; - static const int AssumeAliasing = 0; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 46c6ce1d3..975cefd28 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -211,8 +211,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseSelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; struct SparseSelfAdjoint2Sparse {}; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 4f26c19ca..0d448d02e 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -691,7 +691,6 @@ struct evaluator_traits > typedef typename SparseQRType::MatrixType MatrixType; typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseShape Shape; - static const int AssumeAliasing = 0; }; template< typename DstXprType, typename SparseQRType> diff --git a/test/product.h b/test/product.h index 9dfff9303..bd92309d2 100644 --- a/test/product.h +++ b/test/product.h @@ -145,14 +145,31 @@ template void product(const MatrixType& m) VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval()); // inner product - Scalar x = square2.row(c) * square2.col(c2); - VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); - + { + Scalar x = square2.row(c) * square2.col(c2); + VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); + } + // outer product - VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); - VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.leftCols(1) * 
m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + { + VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); + VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + } + + // Aliasing + { + ColVectorType x(cols); x.setRandom(); + ColVectorType z(x); + ColVectorType y(cols); y.setZero(); + ColSquareMatrixType A(cols,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z); + } } diff --git a/test/product_large.cpp b/test/product_large.cpp index 7207973c2..98f84c53b 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -9,6 +9,27 @@ #include "product.h" +template +void test_aliasing() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + typedef Matrix MatrixType; + typedef Matrix VectorType; + VectorType x(cols); x.setRandom(); + VectorType z(x); + VectorType y(rows); y.setZero(); + MatrixType A(rows,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); // OK because "y + A*x" is marked as "assume-aliasing" + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression + x = z; + // VERIFY_IS_APPROX(x = y-A*x, -A*z); // Not OK in 3.3 because x is resized before A*x gets evaluated + x = z; +} + void test_product_large() { for(int i = 0; i < g_repeat; i++) { @@ -17,6 +38,8 @@ void test_product_large() CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + CALL_SUBTEST_1( test_aliasing() ); } #if defined EIGEN_TEST_PART_6 diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index ff93cb881..5a3f3a01a 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -43,10 +43,16 @@ template void product_notemporary(const MatrixType& m) r1 = internal::random(8,rows-r0); VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1); + VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0); + VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1); +// VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0); + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1); + + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0); From 
403a7cb6c34d163e4f120387b5dc5487d30bb1d5 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:39:13 -0500 Subject: [PATCH 16/52] Alternative way of forcing instantiation of device kernels without causing warnings or requiring device to device kernel invocations. This allows Tensorflow to work on SM 3.0 (ie, Amazon EC2) machines. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 10 ++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index af140a68b..359a01b8f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -242,6 +242,16 @@ struct GpuDevice { (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); +#ifndef __CUDA_ARCH__ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); +#else +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#endif + // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..9a66e81f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..374edb605 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -116,7 +116,7 @@ struct FullReducer { template static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(false && "Should only be called on floats"); + eigen_assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; From 91678f489a89b1f4a393cd3b703e67922b5c4257 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:44:45 -0500 Subject: [PATCH 17/52] Cleaned up double-defined macro from last commit --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 359a01b8f..c74613873 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,10 +238,6 @@ struct GpuDevice { }; -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - #ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ From 780623261eedd996404795dfb7928e680408adb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 09:07:14 -0800 Subject: [PATCH 18/52] Re-enabled the optimized reduction CUDA code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..cea32d05f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. 
-#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -563,7 +562,6 @@ struct TensorEvaluator, Device> return false; } } -#endif return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..198b3604c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; @@ -222,7 +222,7 @@ struct InnerReducer { const int num_per_thread = 128; const int num_blocks = 32; - LAUNCH_CUDA_KERNEL((InnerReductionKernel), + LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; @@ -279,7 +279,7 @@ struct OuterReducer { device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((OuterReductionKernel), + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; From 2ccb1c86342203cbc25a587590149d2cf5175900 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 10:36:37 -0800 Subject: [PATCH 19/52] Fixed a bug in the dispatch of optimized reduction kernels. --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index cea32d05f..7bd2326b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -438,19 +438,18 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Bitmap indicating if an input dimension is reduced or not. - array reduced; + // Build the bitmap indicating if an input dimension is reduced or not. for (int i = 0; i < NumInputDims; ++i) { - reduced[i] = false; + m_reduced[i] = false; } for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op.dims()[i] >= 0); eigen_assert(op.dims()[i] < NumInputDims); - reduced[op.dims()[i]] = true; + m_reduced[op.dims()[i]] = true; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer::run(input_dims, reduced, &m_dimensions, &m_reducedDims); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. 
if (NumOutputDims > 0) { @@ -485,7 +484,7 @@ struct TensorEvaluator, Device> int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { + if (m_reduced[i]) { m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { @@ -531,9 +530,9 @@ struct TensorEvaluator, Device> bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - reducing_inner_dims &= m_reducedDims[i]; + reducing_inner_dims &= m_reduced[i]; } else { - reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; } } if (internal::InnerReducer::HasOptimizedImplementation && @@ -548,9 +547,9 @@ struct TensorEvaluator, Device> bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; } else { - preserving_inner_dims &= m_reducedDims[i]; + preserving_inner_dims &= m_reduced[i]; } } if (internal::OuterReducer::HasOptimizedImplementation && @@ -689,6 +688,8 @@ struct TensorEvaluator, Device> return startInput; } + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. From b523771a24320014abfec537b0f4b568c19882eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 14:25:43 -0800 Subject: [PATCH 20/52] Silenced several compilation warnings triggered by nvcc. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 48 ++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 ++-- .../CXX11/src/Tensor/TensorReductionCuda.h | 12 ++--- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c74613873..0f67f0f57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -10,7 +10,6 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - namespace Eigen { // This defines an interface that GPUDevice can take to use @@ -206,20 +205,45 @@ struct GpuDevice { #endif } - inline int getNumCudaMultiProcessors() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().multiProcessorCount; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerMultiProcessor() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerMultiProcessor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int sharedMemPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int 
sharedMemPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().sharedMemPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int majorDeviceVersion() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().major; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } // This function checks if the CUDA runtime recorded an error for the @@ -239,13 +263,13 @@ struct GpuDevice { }; #ifndef __CUDA_ARCH__ -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); #else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); #endif @@ -260,4 +284,4 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93e1de1b..d2ab70f2b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,7 +213,7 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -232,7 +232,7 @@ inline void TensorExecutor::run(const Expression& /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 3fa3d5c3c..867654aff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,8 +115,8 @@ struct FullReducer { internal::is_same::value; template - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - eigen_assert(false && "Should only be called on floats"); + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -210,11 +210,11 @@ struct InnerReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -264,11 +264,11 @@ struct OuterReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; From 0504c56ea7b1dc4d804580cbc498b189ffd06b6e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:49:21 -0800 Subject: [PATCH 21/52] Silenced a nvcc compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 867654aff..91b6e68dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -119,7 +119,7 @@ struct FullReducer { assert(false && "Should only be called on floats"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); From 01c55d37e69ca3c45f4390b7e15c310028e8b8ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:53:19 -0800 Subject: [PATCH 22/52] Deleted unused variable. 
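Note: the variable removed below was computed but never read, which triggers set-but-unused warnings under nvcc and clang. A rough sketch of the two usual remedies, assuming the diagnostic fires on a local such as num_coeffs (option 2 uses the EIGEN_UNUSED_VARIABLE helper already employed elsewhere in this series):

    // Option 1 (what this patch does): drop the dead computation entirely.
    // Option 2: keep the value for readability but mark it as intentionally unused.
    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    EIGEN_UNUSED_VARIABLE(num_coeffs)
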
--- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 91b6e68dd..59404dba5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -217,7 +217,6 @@ struct InnerReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; From 4f7714d72cb8a34dd42081a6ece310c984392354 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:01:00 -0800 Subject: [PATCH 23/52] Enabled the use of fixed dimensions from within a cuda kernel. --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index f3c9a3148..2692563a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -110,14 +110,14 @@ struct Sizes : internal::numeric_list { return internal::arg_prod(Indices...); } - Sizes() { } + EIGEN_DEVICE_FUNC Sizes() { } template - explicit Sizes(const array& /*indices*/) { + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex...) { } - explicit Sizes(std::initializer_list /*l*/) { + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { // todo: add assertion } #endif From f894736d61198eef9d6463d3cdde8dbe1171a56b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:42:18 -0800 Subject: [PATCH 24/52] Updated the tensor traits: the alignment is not part of the Flags enum anymore --- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 7a9568b36..2f06f8442 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -20,7 +20,7 @@ class compute_tensor_flags enum { is_dynamic_size_storage = 1, - aligned_bit = + is_aligned = ( ((Options&DontAlign)==0) && ( #if EIGEN_MAX_STATIC_ALIGN_BYTES>0 @@ -35,12 +35,12 @@ class compute_tensor_flags 0 #endif ) - ) ? AlignedBit : 0, - packet_access_bit = packet_traits::Vectorizable && aligned_bit ? PacketAccessBit : 0 + ), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? PacketAccessBit : 0 }; public: - enum { ret = packet_access_bit | aligned_bit}; + enum { ret = packet_access_bit}; }; @@ -86,7 +86,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; @@ -102,7 +102,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; From c5e6900400663901013dc48a3492e756b21c131e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:06:39 -0800 Subject: [PATCH 25/52] Silenced a few compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +++++- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 90ee50678..126a9e4ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1261,7 +1261,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,10 +1313,11 @@ struct TensorEvaluator + template EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) // rows in left side const Index m = this->m_i_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 0f67f0f57..5abdc489b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -275,10 +275,14 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ -static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 59404dba5..89f055134 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -220,6 +220,8 @@ struct InnerReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; + EIGEN_UNUSED_VARIABLE(block_size) + EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); From bbdabbb37976a37ab585d266691100bdd035f30b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:26:56 -0800 Subject: [PATCH 26/52] Made the blas utils usable from within a cuda kernel --- Eigen/src/Core/util/BlasUtil.h | 42 +++++++++++++++++----------------- Eigen/src/Core/util/Memory.h | 7 +++--- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index d00fa9707..498db3a70 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -123,18 +123,18 @@ template struct get_factor::R template class BlasVectorMapper { public: - EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) 
: m_data(data) {} - EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; } template - EIGEN_ALWAYS_INLINE Packet load(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const { return ploadt(m_data + i); } template - bool aligned(Index i) const { + EIGEN_DEVICE_FUNC bool aligned(Index i) const { return (size_t(m_data+i)%sizeof(Packet))==0; } @@ -148,25 +148,25 @@ class BlasLinearMapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } - EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { pstoret(m_data + i, p); } @@ -184,18 +184,18 @@ class blas_data_mapper { typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_ALWAYS_INLINE blas_data_mapper + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { return blas_data_mapper(&operator()(i, j), m_stride); } - EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(&operator()(i, j)); } @@ -205,28 +205,28 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } template - EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); } template - EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { return pgather(&operator()(i, j), m_stride); } - const Index stride() const { return m_stride; } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } - Index firstAligned(Index size) const { + EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { if (size_t(m_data)%sizeof(Scalar)) { return -1; } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1a899ea6c..823e077af 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -524,7 +524,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_align * \sa first_default_aligned() */ template -inline Index first_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { static const Index ScalarSize = sizeof(Scalar); static const Index AlignmentSize = Alignment / ScalarSize; @@ -544,14 +544,15 @@ inline Index first_aligned(const Scalar* array, Index size) } else { - return std::min( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); + Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask; + return (first < size) ? first : size; } } /** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement. * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase) */ template -inline Index first_default_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) { typedef typename packet_traits::type DefaultPacketType; return first_aligned::alignment>(array, size); From bd7d901da9bd824ee2a7a94d3b0c8668d77f5ff2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:49:44 -0800 Subject: [PATCH 27/52] Reverted a previous change that tripped nvcc when compiling in debug mode. 
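Background (an interpretation, not spelled out in the original message): EIGEN_DEVICE_FUNC expands to __host__ __device__ under nvcc, while the evaluation entry points touched below launch kernels with the host-only <<<...>>> syntax, which debug builds appear to reject once the annotation is present. Where a member genuinely needs the annotation, the pattern used elsewhere in this series is to guard the host-only body, roughly:

    // Sketch of the host/device guard idiom (see the GpuDevice accessors patched earlier):
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    #ifndef __CUDA_ARCH__
      return stream_->deviceProperties().maxThreadsPerBlock;  // host-side query
    #else
      eigen_assert(false && "The default device should be used instead to generate kernel code");
      return 0;  // unreachable device stub
    #endif
    }
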
--- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 126a9e4ad..a5f3debc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,7 +1313,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC + template void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; @@ -1362,7 +1362,7 @@ struct TensorEvaluator), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); } else { - const Index m_blocks = (m + 127) / 128; + const Index m_blocks = (m + 127) / 128; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 block_size(8, 32, 1); From d920d57f38e07739403a6c1e224c74fec5a36e6f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 11:32:27 -0800 Subject: [PATCH 28/52] Improved the performance of the contraction of a 2d tensor with a 1d tensor by a factor of 3 or more. This helps speedup LSTM neural networks. --- .../CXX11/src/Tensor/TensorContraction.h | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index eda93a1de..63d0c6f68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -32,7 +32,7 @@ enum { template + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -144,11 +144,11 @@ class SimpleTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } - Index firstAligned(Index size) const { - return size; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + return (Alignment == Aligned) ? 0 : size; } - Index stride() const { - return 1; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous) ? 
m_contract_strides[0] : 1; } protected: @@ -165,10 +165,10 @@ template - class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -181,6 +181,7 @@ template::type Packet; typedef typename packet_traits::half HalfPacket; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption @@ -192,7 +193,7 @@ templatecomputeIndex(i, j); eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); + return this->m_tensor.template packet(index); } const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); @@ -207,7 +208,7 @@ template::value <= 1 || !inner_dim_reordered) && (last - first) == (packet_size - 1)) { - return this->m_tensor.template packet(first); + return this->m_tensor.template packet(first); } EIGEN_ALIGN_MAX Scalar data[packet_size]; @@ -223,6 +224,7 @@ template(data); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { // whole method makes column major assumption @@ -230,7 +232,7 @@ template::size; if (half_packet_size == packet_size) { - return loadPacket(i, j); + return loadPacket(i, j); } EIGEN_ALIGN_MAX Scalar data[half_packet_size]; for (Index k = 0; k < half_packet_size; k++) { @@ -246,10 +248,10 @@ template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -260,13 +262,13 @@ class BaseTensorContractionMapper::type Packet; - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); return pload::type>(data); } - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { return loadPacket(i, j); } @@ -304,14 +306,14 @@ class TensorContractionSubMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { @@ -325,12 +327,12 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), 
YOU_MADE_A_PROGRAMMING_MISTAKE); - return loadPacket(i); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } template - EIGEN_DEVICE_FUNC bool aligned(Index) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } @@ -741,17 +743,19 @@ struct TensorContractionEvaluatorBase typedef TensorEvaluator RightEvaluator; const Index lhs_packet_size = internal::packet_traits::size; const Index rhs_packet_size = internal::packet_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned; typedef internal::TensorContractionInputMapper LhsMapper; + false, lhs_alignment> LhsMapper; typedef internal::TensorContractionInputMapper RhsMapper; + rhs_inner_dim_reordered, rhs_alignment> RhsMapper; LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); From 79b69b7444cfae2f7631e873e822cdca6f4e355f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 15:21:09 -0800 Subject: [PATCH 29/52] Trigger the optimized matrix vector path more conservatively. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 63d0c6f68..72a378dfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -145,7 +145,10 @@ class SimpleTensorContractionMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - return (Alignment == Aligned) ? 0 : size; + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; From 9f013a9d86ad5cf82939bfeab2223652a821c448 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Jan 2016 14:24:37 -0800 Subject: [PATCH 30/52] Properly record the rank of reduced tensors in the tensor traits. 
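In other words, a reduction expression now reports its true output rank, NumDimensions of the input minus the number of reduced dimensions, instead of inheriting the input rank. A minimal usage sketch (index and scalar types chosen for illustration only):

    // Reducing a rank-4 tensor over two dimensions yields a rank-2 expression:
    //   NumDimensions = 4 - array_size<Dims>::value = 4 - 2 = 2
    Eigen::Tensor<float, 4> t(2, 3, 5, 7);
    Eigen::array<int, 2> dims({1, 3});           // reduce dimensions 1 and 3
    Eigen::Tensor<float, 2> sums = t.sum(dims);  // result has shape (2, 5)
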
--- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index c783aab97..781a37e34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -134,7 +134,7 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 8028e71c0..2dc8815b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -24,11 +24,14 @@ template struct traits > : traits { - typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; }; template From 8fe2532e70a8e0261717003d96d4df41ab978756 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 09:29:48 -0800 Subject: [PATCH 31/52] Fixed a boundary condition bug in the outer reduction kernel --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 89f055134..54ab34ba1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -241,7 +241,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. - const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = num_preserved_coeffs * numext::maxi(1, (num_coeffs_to_reduce - NumPerThread + 1)); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; From aed4cb1269d52d0ff0e69c8aa6d89c804185b18f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 21:45:14 -0800 Subject: [PATCH 32/52] Use warp shuffles instead of shared memory access to speedup the inner reduction kernel. 
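For reference, each step of the shuffle loop lets a lane read the partial result held by the lane `offset` positions above it in the same warp, halving the number of live partials, so the per-warp reduction no longer needs shared memory or __syncthreads(). A self-contained sketch of the same idiom for a plain sum (helper name illustrative; uses the pre-CUDA-9 __shfl_down, as the kernel below does):

    __inline__ __device__ float warpReduceSum(float val) {
      #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);  // fetch the value held by lane (laneId + offset)
      return val;                         // lane 0 ends up holding the warp-wide sum
    }
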
--- .../CXX11/src/Tensor/TensorReductionCuda.h | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 54ab34ba1..82ea09f07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -132,8 +132,6 @@ struct FullReducer { }; -extern __shared__ float temp[]; - template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, @@ -183,17 +181,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } - temp[threadIdx.x] = reduced_val; +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + } - __syncthreads(); - const int warp_id = threadIdx.x & 31; - if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); - if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); - if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); - if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); - if (warp_id < 1) { - reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); - atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); } } @@ -224,7 +218,7 @@ struct InnerReducer { EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), - num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; From 0461f0153ef66fb95d16b620675c97107ef7a342 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Jan 2016 11:22:16 -0800 Subject: [PATCH 33/52] Made it possible to compare tensor dimensions inside a CUDA kernel. 
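Concretely, dimensions_match() and the sizes_match_below_dim helpers it relies on are now annotated with EIGEN_DEVICE_FUNC (i.e. __host__ __device__ under nvcc), so a guard like the following, sketched with hypothetical operand names, can sit on a code path that is also compiled for the device:

    // Previously nvcc rejected this inside device code as a call to a host-only function.
    eigen_assert(dimensions_match(lhs.dimensions(), rhs.dimensions()));
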
--- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 2692563a9..52569a359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -405,20 +405,20 @@ template struct sizes_match_below_dim { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return false; } }; template struct sizes_match_below_dim { - static inline bool run(Dims1& dims1, Dims2& dims2) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { return (array_get(dims1) == array_get(dims2)) & sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return true; } }; @@ -427,7 +427,7 @@ struct sizes_match_below_dim { template -bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } From 34057cff23bc3b9d59117af9bea0b3e098e3eaea Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Jan 2016 15:11:56 -0800 Subject: [PATCH 34/52] Fixed a race condition that could affect some reductions on CUDA devices. --- .../CXX11/src/Tensor/TensorReductionCuda.h | 72 ++++++++++++++++--- 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 82ea09f07..2da18b147 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -76,13 +76,24 @@ __device__ inline void atomicReduce(T* output, T accum, SumReducer&) { #endif } + +template +__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + template __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output) { const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - if (first_index == 0) { + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { *output = reducer.initialize(); } @@ -126,6 +137,14 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. 
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel), + 1, 32, 0, device, reducer.initialize(), 1, output); + } + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } @@ -150,8 +169,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } } for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { @@ -211,11 +233,25 @@ struct InnerReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = 32; - EIGEN_UNUSED_VARIABLE(block_size) - EIGEN_UNUSED_VARIABLE(num_blocks) + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); @@ -229,9 +265,11 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu typename Self::CoeffReturnType* output) { const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - // Initialize the output values - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } } // Do the reduction. 
@@ -266,14 +304,26 @@ struct OuterReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int dyn_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } From 5b7713dd33f5d16282f69f10c33a739554762782 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:05:10 -0800 Subject: [PATCH 35/52] Record whether the underlying tensor storage can be accessed directly during the evaluation of an expression. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 ++++ unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + 28 files changed, 53 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h 
b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index ad525bac8..dc6ca4909 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -78,7 +78,8 @@ class Tensor : public TensorBase0) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; static const int Options = Options_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 781a37e34..f1ec04c49 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -89,6 +89,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -210,6 +211,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index a41d4d265..10fac0cc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : @@ -152,6 +153,8 @@ struct TensorEvaluator, Device> return m_leftImpl.template packet(index); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + private: TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ec20a99e..efca7cd79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index abc3c92ca..a209e885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -145,6 +145,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -304,6 +305,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 3d153bb94..f57d2bb7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -125,6 +125,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -287,6 +288,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 3ca7daf32..877bcd0df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -162,6 +162,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess && internal::type_casting_traits::VectorizedCast, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a82bfc0aa..367a152a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -306,6 +306,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -752,6 +753,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0157f6fab..0f8a98caf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) @@ -250,6 +251,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ff4373f59..e7daf7304 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -98,6 +98,7 @@ struct TensorEvaluator, Device> PacketAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -140,7 +141,7 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 902f25247..f726585b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -148,6 +149,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -207,6 +209,7 @@ struct TensorEvaluator, Device> PacketAccess = internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC @@ -257,6 +260,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -312,6 +316,7 @@ struct TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -378,6 +383,7 @@ struct TensorEvaluator internal::packet_traits::HasBlend, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 215a4ebad..3bfaf6d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -135,6 +135,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a4d6ce6b3..28cb9f02d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, - }; + RawAccess = true + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c16bf7e67..c9b0b2f28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -92,6 +92,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, + RawAccess = true }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9316c9831..96f74b992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 11e510414..2ab332add 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -168,6 +168,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 5, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ae9e9f751..52d89ad01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,7 +91,8 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented - }; + RawAccess = false + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f612bbd45..a37516974 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -123,6 +123,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 5c759af09..6f69da34a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,7 +49,8 @@ template class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d8c923d74..0524cf0d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -110,6 +110,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -170,6 +171,7 @@ template PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -317,6 +319,7 @@ struct TensorEvaluator, Devi PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -545,6 +548,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 91e32d200..39a305a93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 8fb53f4f2..2cbb820b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2dc8815b8..09ee0c2c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -428,6 +428,7 @@ struct 
TensorEvaluator, Device> PacketAccess = Self::InputPacketAccess && Op::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6b25b2ba0..57197d060 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -139,6 +139,7 @@ template class TensorRef : public TensorBase, Device> PacketAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) @@ -412,6 +414,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 10328c61f..846f81e0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -239,6 +240,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 15a22aa1b..c4adb7d4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -225,6 +226,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 97b6168a9..2c2eb6515 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -258,6 +259,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 6625c66d5..52b78b261 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -181,6 +181,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 6, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) From b3b722905f3df26a34cdda4f2cee74aa62403040 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:09:47 -0800 Subject: [PATCH 36/52] Improved code indentation --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 28cb9f02d..70282dd83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -54,7 +54,7 @@ class TensorFixedSize : public TensorBase m_storage; public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } From 6d472d83754e0f16db1deb69218e10c2b21268b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:22:05 -0800 Subject: [PATCH 37/52] Moved the contraction mapping code to its own file to make the code more manageable. --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorContraction.h | 357 ----------------- .../src/Tensor/TensorContractionMapper.h | 377 ++++++++++++++++++ 3 files changed, 378 insertions(+), 357 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7481a9ddb..1c5734383 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionCuda.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 72a378dfd..506696ae9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,363 +21,6 @@ namespace Eigen { */ namespace internal { -enum { - Rhs = 0, - Lhs = 1, -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. 
- */ -template -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - Index nocontract_val = left ? row : col; - Index linidx = 0; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? col : row; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if(array_size::value > 0) { - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; - Index linidx[2] = {0, 0}; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; - } - return IndexPair(linidx[0], linidx[1]); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - // Only claim alignment when we can compute the actual stride (ie when we're - // dealing with the lhs with inner_dim_contiguous. This is because the - // matrix-vector product relies on the stride when dealing with aligned inputs. - return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; - } - - protected: - const Tensor m_tensor; - const nocontract_t m_nocontract_strides; - const nocontract_t m_ij_strides; - const contract_t m_contract_strides; - const contract_t m_k_strides; -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - // current code assumes packet size must be a multiple of 2 - EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - - if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { - const Index index = this->computeIndex(i, j); - eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); - } - - const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); - const Index first = indexPair.first; - const Index last = indexPair.second; - - // We can always do optimized packet reads from left hand side right now, because - // the vertical matrix dimension on the left hand side is never contracting. - // On the right hand side we need to check if the contracting dimensions may have - // been shuffled first. 
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { - - return this->m_tensor.template packet(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(last); - - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); - } -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename packet_traits::type Packet; - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload::type>(data); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - } -}; - -template -class TensorContractionInputMapper; - -template -class TensorContractionSubMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - typedef TensorContractionInputMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - private: - const ParentMapper& m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } -}; - - - template struct traits > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 000000000..b25b34d61 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,377 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? 
row : col; + Index linidx = 0; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if(array_size::value > 0) { + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous) ? 
m_contract_strides[0] : 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H From df79c00901e2c976f79a0b1518ed9a58a48f0501 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:24:08 -0800 Subject: [PATCH 38/52] Improved the formatting of the code --- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 52d89ad01..2798956ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,7 @@ struct TensorEvaluator, Device> Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false - }; + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) From 234a1094b7839017e6cf8e8f376995ee13775e00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 09:18:44 +0100 Subject: [PATCH 39/52] Add static assertion to y(), z(), w() accessors --- Eigen/src/Core/DenseCoeffsBase.h | 36 +++++++++++++++++++++++++----- Eigen/src/Core/util/StaticAssert.h | 1 + 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 28cc1432c..423ab167d 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -191,19 +191,31 @@ class DenseCoeffsBase : public EigenBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - y() const { return (*this)[1]; } + y() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). 
*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - z() const { return (*this)[2]; } + z() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - w() const { return (*this)[3]; } + w() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } /** \internal * \returns the packet of coefficients starting at the given row and column. It is your responsibility @@ -424,19 +436,31 @@ class DenseCoeffsBase : public DenseCoeffsBase=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - z() { return (*this)[2]; } + z() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - w() { return (*this)[3]; } + w() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } }; /** \brief Base class providing direct read-only coefficient access to matrices and arrays. diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 1fe365aa7..e174509e0 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -50,6 +50,7 @@ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, + OUT_OF_RANGE_ACCESS, YOU_MADE_A_PROGRAMMING_MISTAKE, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, From 0b7169d1f70b0c18ae14d2a2bb5f165b7b675d51 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:15:59 +0100 Subject: [PATCH 40/52] bug #1147: fix compilation of PastixSupport --- Eigen/PaStiXSupport | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport index 3411dface..de3a63b4d 100644 --- a/Eigen/PaStiXSupport +++ b/Eigen/PaStiXSupport @@ -12,7 +12,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include extern "C" { #include #include From db237d0c75abe70cf8ed0f21e7eaa66474f8a9bd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:49:01 +0100 Subject: [PATCH 41/52] bug #1145: fix PastixSupport LLT/LDLT wrappers (missing resize prior to calls to selfAdjointView) --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index c8cb2c0cc..d21495e81 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -581,6 +581,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { + out.resize(matrix.rows(), matrix.cols()); // Pastix supports only lower, column-major matrices out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); @@ -666,6 +667,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { // Pastix supports only lower, column-major matrices + out.resize(matrix.rows(), 
matrix.cols()); out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); } From 4c5e96aab6baf32b077d8824b4ecbe592caaa8a0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:56:17 +0100 Subject: [PATCH 42/52] bug #1148: silent Pastix by default --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index d21495e81..7c8622c97 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -268,7 +268,7 @@ void PastixBase::init() 0, 0, 0, 1, m_iparm.data(), m_dparm.data()); m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO; - m_iparm[IPARM_VERBOSE] = 2; + m_iparm[IPARM_VERBOSE] = API_VERBOSE_NOT; m_iparm[IPARM_ORDERING] = API_ORDER_SCOTCH; m_iparm[IPARM_INCOMPLETE] = API_NO; m_iparm[IPARM_OOC_LIMIT] = 2000; From ed8ade9c65bc25f2226946e4275d62f7ceb28213 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 19:01:24 +0100 Subject: [PATCH 43/52] bug #1149: fix Pastix*::*parm() --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 6 +++--- test/pastix_support.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 7c8622c97..d2ebfd7bb 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -184,7 +184,7 @@ class PastixBase : public SparseSolverBase * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array& dparm() + Array& dparm() { return m_dparm; } @@ -244,8 +244,8 @@ class PastixBase : public SparseSolverBase mutable ComputationInfo m_info; mutable pastix_data_t *m_pastixdata; // Data structure for pastix mutable int m_comm; // The MPI communicator identifier - mutable Matrix m_iparm; // integer vector for the input parameters - mutable Matrix m_dparm; // Scalar vector for the input parameters + mutable Array m_iparm; // integer vector for the input parameters + mutable Array m_dparm; // Scalar vector for the input parameters mutable Matrix m_perm; // Permutation vector mutable Matrix m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index 49239e3a5..b62f85739 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -27,6 +27,14 @@ template void test_pastix_T() check_sparse_spd_solving(pastix_llt_upper); check_sparse_spd_solving(pastix_ldlt_upper); check_sparse_square_solving(pastix_lu); + + // Some compilation check: + pastix_llt_lower.iparm(); + pastix_llt_lower.dparm(); + pastix_ldlt_lower.iparm(); + pastix_ldlt_lower.dparm(); + pastix_lu.iparm(); + pastix_lu.dparm(); } // There is no support for selfadjoint matrices with PaStiX. From 47076bf00ec32c41c0a7f2ba438361ea5f0256e4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 14:51:48 -0800 Subject: [PATCH 44/52] Reduce the register pressure exerted by the tensor mappers whenever possible. This improves the performance of the contraction of a matrix with a vector by about 35%. 
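
For reference, a minimal sketch of the kind of contraction the measurement refers to
(illustrative only, not part of the patch; sizes are arbitrary). The sub-mapper now folds
its vertical/horizontal offsets into the underlying data pointer once (the new
UseDirectOffsets/offsetBuffer path), instead of re-adding them and redoing the index
arithmetic on every coefficient access:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Rank-2 x rank-1 contraction, i.e. a matrix-vector product expressed
      // with the Tensor module; this is the workload the ~35% figure refers to.
      Eigen::Tensor<float, 2> m(1024, 1024);
      Eigen::Tensor<float, 1> v(1024);
      m.setRandom();
      v.setRandom();
      Eigen::array<Eigen::IndexPair<int>, 1> dims;
      dims[0] = Eigen::IndexPair<int>(1, 0);   // contract m's columns against v
      Eigen::Tensor<float, 1> r = m.contract(v, dims);
      return r.dimension(0) == 1024 ? 0 : 1;
    }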
--- .../CXX11/src/Tensor/TensorContraction.h | 5 +- .../src/Tensor/TensorContractionMapper.h | 109 ++++++++++++++++-- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 506696ae9..575ae7b54 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -128,6 +128,7 @@ struct TensorContractionEvaluatorBase PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; // Most of the code is assuming that both input tensors are ColMajor. If the @@ -434,11 +435,11 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } protected: // Prevent assignment diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index b25b34d61..9b6d18090 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,6 +22,54 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ + +template struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + + private: + const Tensor m_tensor; +}; + +template struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return internal::ploadt_ro(m_data + index); + } + private: + typedef typename Tensor::Scalar Scalar; + const Scalar* m_data; +}; + template::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } @@ -148,7 +204,7 @@ class SimpleTensorContractionMapper { } protected: - const Tensor m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; @@ -270,12 +326,6 @@ class BaseTensorContractionMapper -class TensorContractionInputMapper; template::type Packet; typedef typename 
packet_traits::half HalfPacket; - typedef TensorContractionInputMapper ParentMapper; + typedef BaseTensorContractionMapper ParentMapper; typedef TensorContractionSubMapper Self; typedef Self LinearMapper; + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets + }; + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } return m_base_mapper(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadHalfPacket(i, 0); + } return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } @@ -324,6 +408,9 @@ class TensorContractionSubMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } @@ -333,7 +420,7 @@ class TensorContractionSubMapper { } private: - const ParentMapper& m_base_mapper; + ParentMapper m_base_mapper; const Index m_vert_offset; const Index m_horiz_offset; }; From 62f7e777117d35ee4d45e77af429246b1e4e1c12 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 00:02:59 +0100 Subject: [PATCH 45/52] add upper|lower case in incomplete_cholesky unit test --- test/incomplete_cholesky.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 435e2839a..7acad9872 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -15,16 +15,18 @@ template void test_incomplete_cholesky_T() { typedef SparseMatrix SparseMatrixType; - ConjugateGradient > > cg_illt_lower_amd; - ConjugateGradient > > cg_illt_lower_nat; - ConjugateGradient > > cg_illt_upper_amd; - ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_lower_amd; + ConjugateGradient > > cg_illt_lower_nat; + ConjugateGradient > > cg_illt_upper_amd; + ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_uplo_amd; CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } void test_incomplete_cholesky() From 7ce932edd33c56baab0a5c5a8e0608c6345efd53 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 18:12:08 -0800 Subject: [PATCH 46/52] Small cleanup and small fix to the contraction of row major tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 575ae7b54..624e814e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -147,8 +147,6 @@ struct TensorContractionEvaluatorBase static const int ContractDims = internal::array_size::value; static const int NumDims = max_n_1::size; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; @@ -195,8 +193,8 @@ struct TensorContractionEvaluatorBase // We need to flip all the pairs of contracting indices as well as // reversing the dimensions. 
for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; } } @@ -504,9 +502,6 @@ struct TensorEvaluator::Dimensions>::value; static const int ContractDims = internal::array_size::value; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; From 690bc950f70c61075d396671e63480bbd64bb297 Mon Sep 17 00:00:00 2001 From: Jan Prach Date: Wed, 20 Jan 2016 19:35:59 -0800 Subject: [PATCH 47/52] fix clang warnings "braces around scalar initializer" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 12 ++++++------ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8 ++++---- .../Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h | 4 ++-- .../Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h | 2 +- unsupported/test/cxx11_tensor_broadcasting.cpp | 6 +++--- unsupported/test/cxx11_tensor_contraction.cpp | 2 +- unsupported/test/cxx11_tensor_map.cpp | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 4d99f786c..c1c57041f 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -404,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { template constexpr inline array h_array_zip(array a, array b, numeric_list) { - return array{{ Op::run(array_get(a), array_get(b))... }}; + return array{ Op::run(array_get(a), array_get(b))... }; } template @@ -432,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array a, array b) -> decl template constexpr inline array h_array_apply(array a, numeric_list) { - return array{{ Op::run(array_get(a))... }}; + return array{ Op::run(array_get(a))... }; } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index dc6ca4909..092e30c1f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -119,7 +119,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeff(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -159,7 +159,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeffRef(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -199,7 +199,7 @@ class Tensor : public TensorBaseoperator()(array{{firstIndex, secondIndex, otherIndices...}}); + return this->operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -266,7 +266,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -342,7 +342,7 @@ class Tensor : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... 
otherDimensions) - : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) + : m_storage(internal::array_prod(array{firstDimension, otherDimensions...}), array{firstDimension, otherDimensions...}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -427,7 +427,7 @@ class Tensor : public TensorBase{{firstDimension, otherDimensions...}}); + resize(array{firstDimension, otherDimensions...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 52569a359..06cac3570 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -289,7 +289,7 @@ struct DSizes : array { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - (*this) = array{{firstDimension, otherDimensions...}}; + (*this) = array{firstDimension, otherDimensions...}; } #else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 70282dd83..7d0858d02 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -73,7 +73,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeff(array{firstIndex, otherIndices...}); } #endif @@ -105,7 +105,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeffRef(array{firstIndex, otherIndices...}); } #endif @@ -137,7 +137,7 @@ class TensorFixedSize : public TensorBaseoperator()(array{{firstIndex, otherIndices...}}); + return this->operator()(array{firstIndex, otherIndices...}); } #endif @@ -176,7 +176,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return operator()(array{firstIndex, otherIndices...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 6f69da34a..7233f4c89 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -141,10 +141,10 @@ template class TensorMap : public Tensor { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } @@ -228,10 +228,10 @@ template class TensorMap : public Tensor static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, 
otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index bc4f2025f..1b9fe2779 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -55,7 +55,7 @@ class DynamicSGroup inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template @@ -90,7 +90,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; + return std::array{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }; } template diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 942293bd7..255c344b4 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -217,7 +217,7 @@ class StaticSGroup inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 2ddf47234..6fdefd66c 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting() TensorFixedSize, DataLayout> t2; t2 = t2.constant(20.0f); - Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); + Tensor t3 = t1 + t2.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); } - TensorMap, DataLayout> > t4(t2.data(), {{1}}); - Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); + TensorMap, DataLayout> > t4(t2.data(), {1}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..c5f3af73e 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -456,7 +456,7 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor result = mat1.contract(mat2, Eigen::array{{}}); + Tensor result = mat1.contract(mat2, Eigen::array{}); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index a8a095e38..dc0f8a5a2 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -130,7 +130,7 @@ static void test_3d() } TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); + TensorMap> mat4(mat2.data(), array{2, 3, 7}); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); From 34340458cbe33976559bf8fd73a9d4b2f747d611 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 14:29:45 +0100 Subject: [PATCH 48/52] bug #1151: remove useless critical section --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d830dfb96..d77fc2630 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -145,12 +145,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - #pragma omp critical - { for(Index i=0; i Date: Thu, 21 Jan 2016 20:18:51 +0100 Subject: [PATCH 49/52] Add numext::sqrt function to enable custom optimized implementation. This changeset adds two specializations for float/double on SSE. Those are mostly useful with GCC, for which std::sqrt adds an extra and costly check on the result of _mm_sqrt_*. Clang does not add this burden. In this changeset, only DenseBase::norm() makes use of it.
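For illustration only (this snippet is not part of the patch; the numext_like namespace, the main() harness, and the direct use of _mm_cvtss_f32 are assumptions made for this sketch), the idea reduces to a generic template that defers to std::sqrt plus an explicit SSE specialization:

#include <cmath>
#include <xmmintrin.h>

namespace numext_like {
  // Generic fallback: defer to std::sqrt (or an ADL-found overload), mirroring
  // the EIGEN_USING_STD_MATH(sqrt) idiom used by the generic numext::sqrt below.
  template<typename T> inline T sqrt(const T& x) { using std::sqrt; return sqrt(x); }

  // SSE specialization: a bare sqrtss, without the extra check that some
  // GCC/libstdc++ combinations wrap around std::sqrt(float).
  template<> inline float sqrt<float>(const float& x)
  { return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x))); }
}

int main()
{
  float squared_norm = 9.f;                                // e.g. the result of squaredNorm()
  return numext_like::sqrt(squared_norm) == 3.f ? 0 : 1;   // takes the sqrtss path
}

In the patch itself the specializations live in Eigen/src/Core/arch/SSE/MathFunctions.h and go through internal::pfirst, as shown below.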
--- Eigen/src/Core/Dot.h | 3 +-- Eigen/src/Core/MathFunctions.h | 20 ++++++++++++++++++-- Eigen/src/Core/arch/SSE/MathFunctions.h | 22 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index c5040c67b..ce42854cd 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -99,8 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala template inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { - EIGEN_USING_STD_MATH(sqrt) - return sqrt(squaredNorm()); + return numext::sqrt(squaredNorm()); } /** \returns an expression of the quotient of *this by its own norm. diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4d5e1acb8..1c7b28a4b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -954,8 +954,8 @@ T (ceil)(const T& x) return ceil(x); } -// Log base 2 for 32 bits positive integers. -// Conveniently returns 0 for x==0. +/** Log base 2 for 32 bits positive integers. + * Conveniently returns 0 for x==0. */ inline int log2(int x) { eigen_assert(x>=0); @@ -969,6 +969,22 @@ inline int log2(int x) return table[(v * 0x07C4ACDDU) >> 27]; } +/** \returns the square root of \a x. + * + * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * but slightly faster for float/double and some compilers (e.g., gcc), thanks to + * specializations when SSE is enabled. + * + * Its usage is justified in performance critical functions, like norm/normalize. + */ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sqrt(const T &x) +{ + EIGEN_USING_STD_MATH(sqrt); + return sqrt(x); +} + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 3b8b7303f..0dd52f96e 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -518,6 +518,28 @@ Packet2d prsqrt(const Packet2d& x) { } // end namespace internal +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC + return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); +#else + return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); +#endif +} + +} // end namespace numext + } // end namespace Eigen #endif // EIGEN_MATH_FUNCTIONS_SSE_H From 7cae8918c019feabf6c143c430d0cd82c74aeec3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:30:32 +0100 Subject: [PATCH 50/52] Fix compilation on old gcc+AVX --- Eigen/src/Core/arch/SSE/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 0dd52f96e..74f6abc37 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -524,7 +524,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float &x) { - return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); } template<> @@ -532,9 +532,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double &x) { #if EIGEN_COMP_GNUC - return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else - return
internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); #endif } From ee37eb4eed09fe35be2acc3699e80f49a44ea99a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:43:42 +0100 Subject: [PATCH 51/52] bug #977: avoid division by 0 in normalize() and normalized(). --- Eigen/src/Core/Dot.h | 19 ++++++++++++++++--- test/adjoint.cpp | 9 +++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index ce42854cd..221fc3224 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -102,7 +102,10 @@ inline typename NumTraits::Scalar>::Real Matr return numext::sqrt(squaredNorm()); } -/** \returns an expression of the quotient of *this by its own norm. +/** \returns an expression of the quotient of \c *this by its own norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. * * \only_for_vectors * @@ -114,19 +117,29 @@ MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; _Nested n(derived()); - return n / n.norm(); + RealScalar z = n.squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + return n / numext::sqrt(z); + else + return n; } /** Normalizes the vector, i.e. divides it by its own norm. * * \only_for_vectors * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. + * * \sa norm(), normalized() */ template inline void MatrixBase::normalize() { - *this /= norm(); + RealScalar z = squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + derived() /= numext::sqrt(z); } //---------- implementation of other norms ---------- diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 3b2a53c91..b1e69c2e5 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -42,6 +42,15 @@ template<> struct adjoint_specific { VERIFY_IS_APPROX(v1, v1.norm() * v3); VERIFY_IS_APPROX(v3, v1.normalized()); VERIFY_IS_APPROX(v3.norm(), RealScalar(1)); + + // check null inputs + VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); + RealScalar very_small = (std::numeric_limits::min)(); + VERIFY( (v1*very_small).norm() == 0 ); + VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); + v3 = v1*very_small; + v3.normalize(); + VERIFY_IS_APPROX(v3, (v1*very_small)); // check compatibility of dot and adjoint ref = NumTraits::IsInteger ? 
0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm())); From c33479324c2a24094acf78c776655a6474c3bcca Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Jan 2016 17:08:11 -0800 Subject: [PATCH 52/52] Fixed a constness bug --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 0524cf0d8..11284315c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -146,7 +146,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; }
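Read together with the RawAccess/data() change to the contraction evaluator at the top of this series, the one-line fix above keeps data() returning a plain Scalar* (CoeffReturnType may carry a const qualifier and thus mismatch the evaluator interface). A loose standalone sketch of that forwarding pattern, with hypothetical OwningEval/ReshapeEval types rather than the real evaluator interfaces:

#include <vector>

// Hypothetical evaluator that materializes its result into a buffer and
// exposes it through data(); a null pointer means no raw buffer is available.
struct OwningEval {
  std::vector<float> m_result;
  float* data() { return m_result.empty() ? nullptr : m_result.data(); }
};

// Hypothetical wrapping evaluator (think of a reshape, which leaves the memory
// layout untouched): it forwards data() with the same pointer type, so callers
// that branch on data() != nullptr can read the underlying buffer directly.
struct ReshapeEval {
  OwningEval& m_impl;
  float* data() { return m_impl.data(); }
};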