From 8b0d1eb0f7f1ab017a8e603f3887143df15662d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 1 Jan 2016 21:45:06 +0100 Subject: [PATCH 01/52] Fix numerous doxygen shortcomings, and workaround some clang -Wdocumentation warnings --- Eigen/src/Cholesky/LDLT.h | 4 +- Eigen/src/Cholesky/LLT.h | 9 +- Eigen/src/Core/Array.h | 24 +-- Eigen/src/Core/BandMatrix.h | 24 +-- Eigen/src/Core/Block.h | 69 +++++---- Eigen/src/Core/CwiseBinaryOp.h | 39 +++-- Eigen/src/Core/CwiseNullaryOp.h | 35 +++-- Eigen/src/Core/CwiseUnaryOp.h | 39 +++-- Eigen/src/Core/CwiseUnaryView.h | 27 ++-- Eigen/src/Core/IO.h | 2 +- Eigen/src/Core/Map.h | 45 +++--- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/Matrix.h | 86 +++++------ Eigen/src/Core/NestByValue.h | 25 ++-- Eigen/src/Core/NoAlias.h | 2 +- Eigen/src/Core/NumTraits.h | 2 +- Eigen/src/Core/PermutationMatrix.h | 65 ++++---- Eigen/src/Core/Product.h | 31 ++-- Eigen/src/Core/Ref.h | 140 +++++++++--------- Eigen/src/Core/Replicate.h | 31 ++-- Eigen/src/Core/ReturnByValue.h | 9 +- Eigen/src/Core/Reverse.h | 28 ++-- Eigen/src/Core/Stride.h | 4 +- Eigen/src/Core/Transpose.h | 27 ++-- Eigen/src/Core/Transpositions.h | 58 ++++---- Eigen/src/Core/VectorBlock.h | 25 ++-- Eigen/src/Core/VectorwiseOp.h | 8 +- Eigen/src/Core/util/XprHelper.h | 8 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 15 +- Eigen/src/Geometry/Hyperplane.h | 4 +- Eigen/src/Geometry/ParametrizedLine.h | 4 +- Eigen/src/Geometry/Rotation2D.h | 2 +- Eigen/src/Geometry/RotationBase.h | 8 +- Eigen/src/Geometry/Scaling.h | 2 +- Eigen/src/Geometry/Translation.h | 4 +- .../IncompleteCholesky.h | 2 +- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- Eigen/src/OrderingMethods/Amd.h | 7 +- Eigen/src/OrderingMethods/Ordering.h | 9 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- Eigen/src/QR/FullPivHouseholderQR.h | 2 +- Eigen/src/QR/HouseholderQR.h | 2 +- Eigen/src/SVD/BDCSVD.h | 5 +- Eigen/src/SVD/JacobiSVD.h | 4 +- Eigen/src/SparseCholesky/SimplicialCholesky.h | 10 +- Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 27 ++-- .../Eigen/src/SparseExtra/RandomSetter.h | 6 +- 48 files changed, 486 insertions(+), 501 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 6fcae01f7..c3cc3746c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -28,8 +28,8 @@ namespace internal { * * \brief Robust Cholesky decomposition of a matrix with pivoting * - * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. + * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 1f0091f3c..74cf5bfe1 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -22,8 +22,8 @@ template struct LLT_Traits; * * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features * - * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition - * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. 
+ * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition + * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite @@ -436,10 +436,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * \param bAndX represents both the right-hand side matrix b and result x. * - * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD. - * - * This version avoids a copy when the right hand side matrix b is not - * needed anymore. + * This version avoids a copy when the right hand side matrix b is not needed anymore. * * \sa LLT::solve(), MatrixBase::llt() */ diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index e38eda72c..7480d1e24 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -12,7 +12,16 @@ namespace Eigen { -/** \class Array +namespace internal { +template +struct traits > : traits > +{ + typedef ArrayXpr XprKind; + typedef ArrayBase > XprBase; +}; +} + +/** \class Array * \ingroup Core_Module * * \brief General-purpose arrays with easy API for coefficient-wise operations @@ -26,21 +35,12 @@ namespace Eigen { * * See documentation of class Matrix for detailed information on the template parameters * storage layout. - * + * * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN. * - * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy + * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy */ -namespace internal { -template -struct traits > : traits > -{ - typedef ArrayXpr XprKind; - typedef ArrayBase > XprBase; -}; -} - template class Array : public PlainObjectBase > diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 87c124fdf..4978c9140 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase * * \brief Represents a rectangular matrix with a banded storage * - * \param _Scalar Numeric type, i.e. float, double, int - * \param Rows Number of rows, or \b Dynamic - * \param Cols Number of columns, or \b Dynamic - * \param Supers Number of super diagonal - * \param Subs Number of sub diagonal - * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint - * The former controls \ref TopicStorageOrders "storage order", and defaults to - * column-major. The latter controls whether the matrix represents a selfadjoint - * matrix in which case either Supers of Subs have to be null. + * \tparam _Scalar Numeric type, i.e. float, double, int + * \tparam _Rows Number of rows, or \b Dynamic + * \tparam _Cols Number of columns, or \b Dynamic + * \tparam _Supers Number of super diagonal + * \tparam _Subs Number of sub diagonal + * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint + * The former controls \ref TopicStorageOrders "storage order", and defaults to + * column-major. The latter controls whether the matrix represents a selfadjoint + * matrix in which case either Supers of Subs have to be null. 
* * \sa class TridiagonalMatrix */ @@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase(Index,Index) and - * most of the time this is the only way it is used. - * - * However, if you want to directly maniputate block expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. - * - * Here is an example illustrating the dynamic case: - * \include class_Block.cpp - * Output: \verbinclude class_Block.out - * - * \note Even though this expression has dynamic size, in the case where \a XprType - * has fixed size, this expression inherits a fixed maximal size which means that evaluating - * it does not cause a dynamic memory allocation. - * - * Here is an example illustrating the fixed-size case: - * \include class_FixedBlock.cpp - * Output: \verbinclude class_FixedBlock.out - * - * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock - */ - namespace internal { template struct traits > : traits @@ -101,6 +66,40 @@ template class BlockImpl; +/** \class Block + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size block + * + * \tparam XprType the type of the expression in which we are taking a block + * \tparam BlockRows the number of rows of the block we are taking at compile time (optional) + * \tparam BlockCols the number of columns of the block we are taking at compile time (optional) + * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or + * to set of columns of a column major matrix (optional). The parameter allows to determine + * at compile time whether aligned access is possible on the block expression. + * + * This class represents an expression of either a fixed-size or dynamic-size block. It is the return + * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block(Index,Index) and + * most of the time this is the only way it is used. + * + * However, if you want to directly maniputate block expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. + * + * Here is an example illustrating the dynamic case: + * \include class_Block.cpp + * Output: \verbinclude class_Block.out + * + * \note Even though this expression has dynamic size, in the case where \a XprType + * has fixed size, this expression inherits a fixed maximal size which means that evaluating + * it does not cause a dynamic memory allocation. + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedBlock.cpp + * Output: \verbinclude class_FixedBlock.out + * + * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock + */ template class Block : public BlockImpl::StorageKind> { diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index e42c3031b..f94629e6d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseBinaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions - * - * \param BinaryOp template functor implementing the operator - * \param Lhs the type of the left-hand side - * \param Rhs the type of the right-hand side - * - * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. 
- * It is the return type of binary operators, by which we mean only those binary operators where - * both the left-hand side and the right-hand side are Eigen expressions. - * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseBinaryOp types explicitly. - * - * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -74,6 +54,25 @@ struct traits > template class CwiseBinaryOpImpl; +/** \class CwiseBinaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions + * + * \tparam BinaryOp template functor implementing the operator + * \tparam LhsType the type of the left-hand side + * \tparam RhsType the type of the right-hand side + * + * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. + * It is the return type of binary operators, by which we mean only those binary operators where + * both the left-hand side and the right-hand side are Eigen expressions. + * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseBinaryOp types explicitly. + * + * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp + */ template class CwiseBinaryOp : public CwiseBinaryOpImpl< diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 1dd109d3d..67b99653f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -12,24 +12,6 @@ namespace Eigen { -/** \class CwiseNullaryOp - * \ingroup Core_Module - * - * \brief Generic expression of a matrix where all coefficients are defined by a functor - * - * \param NullaryOp template functor implementing the operator - * \param PlainObjectType the underlying plain matrix/array type - * - * This class represents an expression of a generic nullary operator. - * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods, - * and most of the time this is the only way it is used. - * - * However, if you want to write a function returning such an expression, you - * will need to use this class. - * - * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr() - */ - namespace internal { template struct traits > : traits @@ -40,6 +22,23 @@ struct traits > : traits class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index da1d1992d..5a809cf21 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseUnaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise unary operator is applied to an expression - * - * \param UnaryOp template functor implementing the operator - * \param XprType the type of the expression to which we are applying the unary operator - * - * This class represents an expression where a unary operator is applied to an expression. - * It is the return type of all operations taking exactly 1 input expression, regardless of the - * presence of other inputs such as scalars. 
For example, the operator* in the expression 3*matrix - * is considered unary, because only the right-hand side is an expression, and its - * return type is a specialization of CwiseUnaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseUnaryOp types explicitly. - * - * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp - */ - namespace internal { template struct traits > @@ -52,6 +32,25 @@ struct traits > template class CwiseUnaryOpImpl; +/** \class CwiseUnaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise unary operator is applied to an expression + * + * \tparam UnaryOp template functor implementing the operator + * \tparam XprType the type of the expression to which we are applying the unary operator + * + * This class represents an expression where a unary operator is applied to an expression. + * It is the return type of all operations taking exactly 1 input expression, regardless of the + * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix + * is considered unary, because only the right-hand side is an expression, and its + * return type is a specialization of CwiseUnaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseUnaryOp types explicitly. + * + * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp + */ template class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>, internal::no_assignment_operator { diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 72244751e..5a7db2b19 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -12,20 +12,6 @@ namespace Eigen { -/** \class CwiseUnaryView - * \ingroup Core_Module - * - * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector - * - * \param ViewOp template functor implementing the view - * \param MatrixType the type of the matrix we are applying the unary operator - * - * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. - * It is the return type of real() and imag(), and most of the time this is the only way it is used. - * - * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp - */ - namespace internal { template struct traits > @@ -55,6 +41,19 @@ struct traits > template class CwiseUnaryViewImpl; +/** \class CwiseUnaryView + * \ingroup Core_Module + * + * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector + * + * \tparam ViewOp template functor implementing the view + * \tparam MatrixType the type of the matrix we are applying the unary operator + * + * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. + * It is the return type of real() and imag(), and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp + */ template class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind> { diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 9ae37bb5a..dfd9097cc 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -80,7 +80,7 @@ struct IOFormat * * \brief Pseudo expression providing matrix output with given format * - * \param ExpressionType the type of the object on which IO stream operations are performed + * \tparam ExpressionType the type of the object on which IO stream operations are performed * * This class represents an expression with stream operators controlled by a given IOFormat. * It is the return type of DenseBase::format() diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 3a8375da9..06d196702 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -13,6 +13,28 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits +{ + typedef traits TraitsBase; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + Alignment = int(MapOptions)&int(AlignedMask), + Flags0 = TraitsBase::Flags & (~NestByRefBit), + Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) + }; +private: + enum { Options }; // Expressions don't have Options +}; +} + /** \class Map * \ingroup Core_Module * @@ -63,29 +85,6 @@ namespace Eigen { * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ - -namespace internal { -template -struct traits > - : public traits -{ - typedef traits TraitsBase; - enum { - InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 - ? int(PlainObjectType::InnerStrideAtCompileTime) - : int(StrideType::InnerStrideAtCompileTime), - OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) - : int(StrideType::OuterStrideAtCompileTime), - Alignment = int(MapOptions)&int(AlignedMask), - Flags0 = TraitsBase::Flags & (~NestByRefBit), - Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) - }; -private: - enum { Options }; // Expressions don't have Options -}; -} - template class Map : public MapBase > { diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 48cf565fb..4d5e1acb8 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -26,7 +26,7 @@ long double abs(long double x) { return (fabsl(x)); } namespace internal { -/** \internal \struct global_math_functions_filtering_base +/** \internal \class global_math_functions_filtering_base * * What it does: * Defines a typedef 'type' as follows: diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index ce1b70d23..bcbbbf9ae 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -13,6 +13,45 @@ namespace Eigen { +namespace internal { +template +struct traits > +{ +private: + enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; + typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + enum { + row_major_bit = _Options&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, + max_size = is_dynamic_size_storage ? 
Dynamic : _MaxRows*_MaxCols, + default_alignment = compute_default_alignment<_Scalar,max_size>::value, + actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + required_alignment = unpacket_traits::alignment, + packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 + }; + +public: + typedef _Scalar Scalar; + typedef Dense StorageKind; + typedef Eigen::Index StorageIndex; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = _Rows, + ColsAtCompileTime = _Cols, + MaxRowsAtCompileTime = _MaxRows, + MaxColsAtCompileTime = _MaxCols, + Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, + Options = _Options, + InnerStrideAtCompileTime = 1, + OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, + + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase + EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, + Alignment = actual_alignment + }; +}; +} + /** \class Matrix * \ingroup Core_Module * @@ -98,7 +137,7 @@ namespace Eigen { * * * ABI and storage layout - * + * * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3. * * @@ -130,50 +169,11 @@ namespace Eigen { *
* <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
* Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES. - * - * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, - * \ref TopicStorageOrders + * + * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, + * \ref TopicStorageOrders */ -namespace internal { -template -struct traits > -{ -private: - enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; - typedef typename find_best_packet<_Scalar,size>::type PacketScalar; - enum { - row_major_bit = _Options&RowMajor ? RowMajorBit : 0, - is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, - max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, - default_alignment = compute_default_alignment<_Scalar,max_size>::value, - actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, - required_alignment = unpacket_traits::alignment, - packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 - }; - -public: - typedef _Scalar Scalar; - typedef Dense StorageKind; - typedef Eigen::Index StorageIndex; - typedef MatrixXpr XprKind; - enum { - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _MaxRows, - MaxColsAtCompileTime = _MaxCols, - Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - Options = _Options, - InnerStrideAtCompileTime = 1, - OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - - // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase - EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, - Alignment = actual_alignment - }; -}; -} - template class Matrix : public PlainObjectBase > diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 9aeaf8d18..13adf070e 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -13,25 +13,24 @@ namespace Eigen { -/** \class NestByValue - * \ingroup Core_Module - * - * \brief Expression which must be nested by value - * - * \param ExpressionType the type of the object of which we are requiring nesting-by-value - * - * This class is the return type of MatrixBase::nestByValue() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::nestByValue() - */ - namespace internal { template struct traits > : public traits {}; } +/** \class NestByValue + * \ingroup Core_Module + * + * \brief Expression which must be nested by value + * + * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value + * + * This class is the return type of MatrixBase::nestByValue() + * and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::nestByValue() + */ template class NestByValue : public internal::dense_xpr_base< NestByValue >::type { diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 0ade75255..ffb673cee 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Pseudo expression providing an operator = assuming no aliasing * - * \param ExpressionType the type of the object on which to do the lazy assignment + * \tparam ExpressionType the type of the object on which to do the lazy assignment * * This class represents an expression with special assignment operators * assuming no aliasing between the target expression and the source expression. diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 1d85dec72..d1aabd995 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen. * - * \param T the numeric type at hand + * \tparam T the numeric type at hand * * This class stores enums, typedefs and static methods giving information about a numeric type. * diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index aad4ccecd..b1fb455b9 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -13,12 +13,18 @@ namespace Eigen { +namespace internal { + +enum PermPermProduct_t {PermPermProduct}; + +} // end namespace internal + /** \class PermutationBase * \ingroup Core_Module * * \brief Base class for permutations * - * \param Derived the derived class + * \tparam Derived the derived class * * This class is the base class for all expressions representing a permutation matrix, * internally stored as a vector of integers. @@ -36,13 +42,6 @@ namespace Eigen { * * \sa class PermutationMatrix, class PermutationWrapper */ - -namespace internal { - -enum PermPermProduct_t {PermPermProduct}; - -} // end namespace internal - template class PermutationBase : public EigenBase { @@ -280,20 +279,6 @@ class PermutationBase : public EigenBase }; -/** \class PermutationMatrix - * \ingroup Core_Module - * - * \brief Permutation matrix - * - * \param SizeAtCompileTime the number of rows/cols, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * \param StorageIndex the integer type of the indices - * - * This class represents a permutation matrix, internally stored as a vector of integers. - * - * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix - */ - namespace internal { template struct traits > @@ -306,6 +291,19 @@ struct traits class PermutationMatrix : public PermutationBase > { @@ -482,18 +480,6 @@ class Map class TranspositionsWrapper; namespace internal { template @@ -513,6 +499,17 @@ struct traits > }; } +/** \class PermutationWrapper + * \ingroup Core_Module + * + * \brief Class to view a vector of integers as a permutation matrix + * + * \tparam _IndicesType the type of the vector of integer (can be any compatible expression) + * + * This class allows to view any vector expression of integers as a permutation matrix. 
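+ *
+ * A minimal usage sketch (the index vector and the 3x3 matrix are purely illustrative):
+ * \code
+ * VectorXi ind(3);
+ * ind << 2, 0, 1;
+ * PermutationWrapper<VectorXi> P(ind);
+ * MatrixXf A = MatrixXf::Random(3,3);
+ * MatrixXf B = P * A;   // applies to the rows of A the permutation described by ind
+ * \endcode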
+ * + * \sa class PermutationBase, class PermutationMatrix + */ template class PermutationWrapper : public PermutationBase > { diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index fdd2fed3f..8aa1de081 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -14,22 +14,6 @@ namespace Eigen { template class ProductImpl; -/** \class Product - * \ingroup Core_Module - * - * \brief Expression of the product of two arbitrary matrices or vectors - * - * \param Lhs the type of the left-hand side expression - * \param Rhs the type of the right-hand side expression - * - * This class represents an expression of the product of two arbitrary matrices. - * - * The other template parameters are: - * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct - * - */ - - namespace internal { // Determine the scalar of Product. This is normally the same as Lhs::Scalar times @@ -102,7 +86,20 @@ struct traits > } // end namespace internal - +/** \class Product + * \ingroup Core_Module + * + * \brief Expression of the product of two arbitrary matrices or vectors + * + * \tparam _Lhs the type of the left-hand side expression + * \tparam _Rhs the type of the right-hand side expression + * + * This class represents an expression of the product of two arbitrary matrices. + * + * The other template parameters are: + * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct + * + */ template class Product : public ProductImpl<_Lhs,_Rhs,Option, typename internal::product_promote_storage_type::StorageKind, diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 61de5ed17..ae414204e 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -12,76 +12,6 @@ namespace Eigen { -/** \class Ref - * \ingroup Core_Module - * - * \brief A matrix or vector expression mapping an existing expression - * - * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. - * The default is \c #Unaligned. - * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), - * but accepts a variable outer stride (leading dimension). - * This can be overridden by specifying strides. - * The type passed here must be a specialization of the Stride template, see examples below. - * - * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. - * A Ref<> object can represent either a const expression or a l-value: - * \code - * // in-out argument: - * void foo1(Ref x); - * - * // read-only const argument: - * void foo2(const Ref& x); - * \endcode - * - * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. - * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. - * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with - * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) - * can be greater than the number of rows. 
- * - * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. - * Here are some examples: - * \code - * MatrixXf A; - * VectorXf a; - * foo1(a.head()); // OK - * foo1(A.col()); // OK - * foo1(A.row()); // Compilation error because here innerstride!=1 - * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object - * foo2(A.row().transpose()); // The row is copied into a contiguous temporary - * foo2(2*a); // The expression is evaluated into a temporary - * foo2(A.col().segment(2,4)); // No temporary - * \endcode - * - * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. - * Here is an example accepting an innerstride!=1: - * \code - * // in-out argument: - * void foo3(Ref > x); - * foo3(A.row()); // OK - * \endcode - * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more - * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a - * template function, e.g.: - * \code - * // in the .h: - * void foo(const Ref& A); - * void foo(const Ref >& A); - * - * // in the .cpp: - * template void foo_impl(const TypeOfA& A) { - * ... // crazy code goes here - * } - * void foo(const Ref& A) { foo_impl(A); } - * void foo(const Ref >& A) { foo_impl(A); } - * \endcode - * - * - * \sa PlainObjectBase::Map(), \ref TopicStorageOrders - */ - namespace internal { template @@ -182,7 +112,75 @@ protected: StrideBase m_stride; }; - +/** \class Ref + * \ingroup Core_Module + * + * \brief A matrix or vector expression mapping an existing expression + * + * \tparam PlainObjectType the equivalent matrix type of the mapped data + * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * The default is \c #Unaligned. + * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), + * but accepts a variable outer stride (leading dimension). + * This can be overridden by specifying strides. + * The type passed here must be a specialization of the Stride template, see examples below. + * + * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. + * A Ref<> object can represent either a const expression or a l-value: + * \code + * // in-out argument: + * void foo1(Ref x); + * + * // read-only const argument: + * void foo2(const Ref& x); + * \endcode + * + * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. + * By default, a Ref can reference any dense vector expression of float having a contiguous memory layout. + * Likewise, a Ref can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with + * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) + * can be greater than the number of rows. 
+ * + * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. + * Here are some examples: + * \code + * MatrixXf A; + * VectorXf a; + * foo1(a.head()); // OK + * foo1(A.col()); // OK + * foo1(A.row()); // Compilation error because here innerstride!=1 + * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object + * foo2(A.row().transpose()); // The row is copied into a contiguous temporary + * foo2(2*a); // The expression is evaluated into a temporary + * foo2(A.col().segment(2,4)); // No temporary + * \endcode + * + * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. + * Here is an example accepting an innerstride!=1: + * \code + * // in-out argument: + * void foo3(Ref > x); + * foo3(A.row()); // OK + * \endcode + * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more + * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a + * template function, e.g.: + * \code + * // in the .h: + * void foo(const Ref& A); + * void foo(const Ref >& A); + * + * // in the .cpp: + * template void foo_impl(const TypeOfA& A) { + * ... // crazy code goes here + * } + * void foo(const Ref& A) { foo_impl(A); } + * void foo(const Ref >& A) { foo_impl(A); } + * \endcode + * + * + * \sa PlainObjectBase::Map(), \ref TopicStorageOrders + */ template class Ref : public RefBase > { diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index bec598310..9960ef884 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -12,21 +12,6 @@ namespace Eigen { -/** - * \class Replicate - * \ingroup Core_Module - * - * \brief Expression of the multiple replication of a matrix or vector - * - * \param MatrixType the type of the object we are replicating - * - * This class represents an expression of the multiple replication of a matrix or vector. - * It is the return type of DenseBase::replicate() and most of the time - * this is the only way it is used. - * - * \sa DenseBase::replicate() - */ - namespace internal { template struct traits > @@ -57,6 +42,22 @@ struct traits > }; } +/** + * \class Replicate + * \ingroup Core_Module + * + * \brief Expression of the multiple replication of a matrix or vector + * + * \tparam MatrixType the type of the object we are replicating + * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic. + * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic. + * + * This class represents an expression of the multiple replication of a matrix or vector. + * It is the return type of DenseBase::replicate() and most of the time + * this is the only way it is used. 
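+ *
+ * A minimal usage sketch (the replication factors 1 and 4 are arbitrary):
+ * \code
+ * Vector3f v(1,2,3);
+ * MatrixXf m = v.replicate(1,4);   // 3x4 matrix whose four columns all equal v
+ * \endcode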
+ * + * \sa DenseBase::replicate() + */ template class Replicate : public internal::dense_xpr_base< Replicate >::type { diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 7feb6e01c..c44b7673b 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -13,11 +13,6 @@ namespace Eigen { -/** \class ReturnByValue - * \ingroup Core_Module - * - */ - namespace internal { template @@ -48,6 +43,10 @@ struct nested_eval, n, PlainObject> } // end namespace internal +/** \class ReturnByValue + * \ingroup Core_Module + * + */ template class ReturnByValue : public internal::dense_xpr_base< ReturnByValue >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index d7c380c78..0640cda2a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -14,20 +14,6 @@ namespace Eigen { -/** \class Reverse - * \ingroup Core_Module - * - * \brief Expression of the reverse of a vector or matrix - * - * \param MatrixType the type of the object of which we are taking the reverse - * - * This class represents an expression of the reverse of a vector. - * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::reverse(), VectorwiseOp::reverse() - */ - namespace internal { template @@ -60,6 +46,20 @@ template struct reverse_packet_cond } // end namespace internal +/** \class Reverse + * \ingroup Core_Module + * + * \brief Expression of the reverse of a vector or matrix + * + * \tparam MatrixType the type of the object of which we are taking the reverse + * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections + * + * This class represents an expression of the reverse of a vector. + * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::reverse(), VectorwiseOp::reverse() + */ template class Reverse : public internal::dense_xpr_base< Reverse >::type { diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 9a2f4f1eb..513742f34 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -31,8 +31,8 @@ namespace Eigen { * arguments to the constructor. * * Indeed, this class takes two template parameters: - * \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. - * \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. + * \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. + * \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. * * Here is an example: * \include Map_general_stride.cpp diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 5b66eb5e1..f199d1086 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -13,20 +13,6 @@ namespace Eigen { -/** \class Transpose - * \ingroup Core_Module - * - * \brief Expression of the transpose of a matrix - * - * \param MatrixType the type of the object of which we are taking the transpose - * - * This class represents an expression of the transpose of a matrix. - * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() - * and most of the time this is the only way it is used. 
- * - * \sa MatrixBase::transpose(), MatrixBase::adjoint() - */ - namespace internal { template struct traits > : public traits @@ -50,6 +36,19 @@ struct traits > : public traits template class TransposeImpl; +/** \class Transpose + * \ingroup Core_Module + * + * \brief Expression of the transpose of a matrix + * + * \tparam MatrixType the type of the object of which we are taking the transpose + * + * This class represents an expression of the transpose of a matrix. + * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::transpose(), MatrixBase::adjoint() + */ template class Transpose : public TransposeImpl::StorageKind> { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 3b1c1815d..678ba3288 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -12,35 +12,6 @@ namespace Eigen { -/** \class Transpositions - * \ingroup Core_Module - * - * \brief Represents a sequence of transpositions (row/column interchange) - * - * \param SizeAtCompileTime the number of transpositions, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * - * This class represents a permutation transformation as a sequence of \em n transpositions - * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices. - * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges - * the rows \c i and \c indices[i] of the matrix \c M. - * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange. - * - * Compared to the class PermutationMatrix, such a sequence of transpositions is what is - * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place. - * - * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example: - * \code - * Transpositions tr; - * MatrixXf mat; - * mat = tr * mat; - * \endcode - * In this example, we detect that the matrix appears on both side, and so the transpositions - * are applied in-place without any temporary or extra copy. - * - * \sa class PermutationMatrix - */ - template class TranspositionsBase { @@ -154,6 +125,35 @@ struct traits class Transpositions : public TranspositionsBase > { diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index 216c568c4..d72fbf7e9 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -13,13 +13,23 @@ namespace Eigen { +namespace internal { +template +struct traits > + : public traits::Flags & RowMajorBit ? 1 : Size, + traits::Flags & RowMajorBit ? Size : 1> > +{ +}; +} + /** \class VectorBlock * \ingroup Core_Module * * \brief Expression of a fixed-size or dynamic-size sub-vector * - * \param VectorType the type of the object in which we are taking a sub-vector - * \param Size size of the sub-vector we are taking at compile time (optional) + * \tparam VectorType the type of the object in which we are taking a sub-vector + * \tparam Size size of the sub-vector we are taking at compile time (optional) * * This class represents an expression of either a fixed-size or dynamic-size sub-vector. 
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and @@ -43,17 +53,6 @@ namespace Eigen { * * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index) */ - -namespace internal { -template -struct traits > - : public traits::Flags & RowMajorBit ? 1 : Size, - traits::Flags & RowMajorBit ? Size : 1> > -{ -}; -} - template class VectorBlock : public Block::Flags & RowMajorBit ? 1 : Size, diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 483f71909..95bcaa86f 100755 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -141,8 +141,8 @@ struct member_redux { * * \brief Pseudo expression providing partial reduction operations * - * \param ExpressionType the type of the object on which to do partial reductions - * \param Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam ExpressionType the type of the object on which to do partial reductions + * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) * * This class represents a pseudo expression with partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() @@ -187,11 +187,11 @@ template class VectorwiseOp protected: - /** \internal - * \returns the i-th subvector according to the \c Direction */ typedef typename internal::conditional::type SubVector; + /** \internal + * \returns the i-th subvector according to the \c Direction */ EIGEN_DEVICE_FUNC SubVector subVector(Index i) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 6b93d5221..8b71e2c62 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -390,9 +390,9 @@ struct transfer_constness * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes * many coefficient accesses in the nested expressions -- as is the case with matrix product for example. * - * \param T the type of the expression being nested. - * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. - * \param PlainObject the type of the temporary if needed. + * \tparam T the type of the expression being nested. + * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. + * \tparam PlainObject the type of the temporary if needed. */ template::type> struct nested_eval { @@ -575,7 +575,7 @@ template struct product_promote_storage_type struct product_promote_storage_type { typedef Dense ret; }; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. - * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. + * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. */ template struct plain_row_type diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index dc9cd0a1a..469ea5e4e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -375,8 +375,12 @@ namespace internal { * Performs a QR step on a tridiagonal symmetric matrix represented as a * pair of two vectors \a diag and \a subdiag. 
* - * \param matA the input selfadjoint matrix - * \param hCoeffs returned Householder coefficients + * \param diag the diagonal part of the input selfadjoint tridiagonal matrix + * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix + * \param start starting index of the submatrix to work on + * \param end last+1 index of the submatrix to work on + * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0 + * \param n size of the input matrix * * For compilation efficiency reasons, this procedure does not use eigen expression * for its arguments. @@ -467,9 +471,10 @@ namespace internal { * \brief Compute the eigendecomposition from a tridiagonal matrix * * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues - * \param[in] subdiag : The subdiagonal part of the matrix. - * \param[in,out] : On input, the maximum number of iterations, on output, the effective number of iterations. - * \param[out] eivec : The matrix to store the eigenvectors... if needed. allocated on input + * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition) + * \param[in] maxIterations : the maximum number of iterations + * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not + * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input. * \returns \c Success or \c NoConvergence */ template diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h index 2d076d7f8..cc89639b6 100644 --- a/Eigen/src/Geometry/Hyperplane.h +++ b/Eigen/src/Geometry/Hyperplane.h @@ -22,8 +22,8 @@ namespace Eigen { * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n. * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. * Notice that the dimension of the hyperplane is _AmbientDim-1. * * This class represents an hyperplane as the zero set of the implicit equation diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h index 93edd9148..fdcd69760 100644 --- a/Eigen/src/Geometry/ParametrizedLine.h +++ b/Eigen/src/Geometry/ParametrizedLine.h @@ -23,8 +23,8 @@ namespace Eigen { * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$. * - * \param _Scalar the scalar type, i.e., the type of the coefficients - * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. */ template class ParametrizedLine diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 8b0ddcfb0..5ab0d5920 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a rotation/orientation in a 2 dimensional space. 
* - * \param _Scalar the scalar type, i.e., the type of the coefficients + * \tparam _Scalar the scalar type, i.e., the type of the coefficients * * This class is equivalent to a single scalar representing a counter clock wise rotation * as a single angle in radian. It provides some additional features such as the automatic diff --git a/Eigen/src/Geometry/RotationBase.h b/Eigen/src/Geometry/RotationBase.h index b88661de6..fadfd9151 100644 --- a/Eigen/src/Geometry/RotationBase.h +++ b/Eigen/src/Geometry/RotationBase.h @@ -22,8 +22,8 @@ struct rotation_base_generic_product_selector; * * \brief Common base class for compact rotation representations * - * \param Derived is the derived type, i.e., a rotation type - * \param _Dim the dimension of the space + * \tparam Derived is the derived type, i.e., a rotation type + * \tparam _Dim the dimension of the space */ template class RotationBase @@ -164,8 +164,8 @@ namespace internal { * * Helper function to return an arbitrary rotation object to a rotation matrix. * - * \param Scalar the numeric type of the matrix coefficients - * \param Dim the dimension of the current space + * \tparam Scalar the numeric type of the matrix coefficients + * \tparam Dim the dimension of the current space * * It returns a Dim x Dim fixed size matrix. * diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index e94aa189f..643138199 100644 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -18,7 +18,7 @@ namespace Eigen { * * \brief Represents a generic uniform scaling transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. * * This class represent a uniform scaling transformation. It is the return * type of Scaling(Scalar), and most of the time this is the only way it diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 7fda179cc..87ea445e9 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -18,8 +18,8 @@ namespace Eigen { * * \brief Represents a translation transformation * - * \param _Scalar the scalar type, i.e., the type of the coefficients. - * \param _Dim the dimension of the space, can be a compile time value or Dynamic + * \tparam _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam _Dim the dimension of the space, can be a compile time value or Dynamic * * \note This class is not aimed to be used to store a translation transformation, * but rather to make easier the constructions and updates of Transform objects. diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 284e37f13..18e9d1466 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -21,7 +21,7 @@ namespace Eigen { * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 * - * \tparam _MatrixType The type of the sparse matrix. It is advised to give a row-oriented sparse matrix + * \tparam Scalar the scalar type of the input matrices * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. 
Default is AMDOrdering, diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 0c4d63923..1721213d6 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -29,7 +29,7 @@ template struct traits > * * \brief LU decomposition of a matrix with complete pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 50e920609..ab7797d2a 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -34,7 +34,7 @@ template struct traits > * * \brief LU decomposition of a matrix with partial pivoting, and related features * - * \param MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index 323255e0a..d1d08ca57 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -84,8 +84,11 @@ StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const S /** \internal * \ingroup OrderingMethods_Module * Approximate minimum degree ordering algorithm. - * \returns the permutation P reducing the fill-in of the input matrix \a C - * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional. + * + * \param[in] C the input selfadjoint matrix stored in compressed column major format. + * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C + * + * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well as the diagonal entries. * On exit the values of C are destroyed */ template void minimum_degree_ordering(SparseMatrix& C, PermutationMatrix& perm) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 25792a828..7ea9b14d7 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -19,20 +19,21 @@ namespace internal { /** \internal * \ingroup OrderingMethods_Module - * \returns the symmetric pattern A^T+A from the input matrix A. + * \param[in] A the input non-symmetric matrix + * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A. 
* FIXME: The values should not be considered here */ template -void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat) +void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) { MatrixType C; - C = mat.transpose(); // NOTE: Could be costly + C = A.transpose(); // NOTE: Could be costly for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) it.valueRef() = 0.0; } - symmat = C + mat; + symmat = C + A; } } diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 172e4a89f..d8bd4b950 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -28,7 +28,7 @@ template struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R * such that diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 64fe6b7b8..32a10f3fe 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -37,7 +37,7 @@ struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R * such that diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 1eb861025..03bc8e6cd 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -21,7 +21,7 @@ namespace Eigen { * * \brief Householder QR decomposition of a matrix * - * \param MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition * * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R * such that diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 896246e46..3552c87bf 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -47,9 +47,8 @@ struct traits > * * \brief class Bidiagonal Divide and Conquer SVD * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * We plan to have a very similar interface to JacobiSVD on this class. - * It should be used to speed up the calcul of SVD for big matrices. 
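// The three Householder QR variants whose documentation is fixed above expose
// the same compute()/solve() interface; a small least-squares sketch with the
// column-pivoting variant (matrix sizes are made up for illustration):
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 3);     // over-determined system
  Eigen::VectorXd b = Eigen::VectorXd::Random(6);

  Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr(A);
  Eigen::VectorXd x = qr.solve(b);                        // minimizes |A*x - b|

  std::cout << "rank(A) = " << qr.rank()
            << ", residual = " << (A * x - b).norm() << std::endl;
}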
+ * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * */ template class BDCSVD : public SVDBase > diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 59c965e15..bf5ff48c3 100755 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -449,8 +449,8 @@ struct traits > * * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix * - * \param MatrixType the type of the matrix of which we are computing the SVD decomposition - * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally + * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * \tparam QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally * for the R-SVD step for non-square matrices. See discussion of possible values below. * * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1343eb15c..2907f6529 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -39,18 +39,16 @@ namespace internal { } // end namespace internal /** \ingroup SparseCholesky_Module - * \brief A direct sparse Cholesky factorizations + * \brief A base class for direct sparse Cholesky factorizations * - * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are - * selfadjoint and positive definite. The factorization allows for solving A.X = B where + * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are + * selfadjoint and positive definite. These factorizations allow for solving A.X = B where * X and B can be either dense or sparse. * * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization * such that the factorized matrix is P A P^-1. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam Derived the type of the derived class, that is the actual factorization type. 
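// A short sketch of how the derived factorizations (SimplicialLLT/SimplicialLDLT)
// built on this base class are typically used; the 1-D Laplacian below is an
// assumed toy SPD matrix:
#include <Eigen/Sparse>
#include <iostream>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 8;
  SpMat A(n, n);
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0)     A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  Eigen::SimplicialLDLT<SpMat> ldlt(A);        // analyzePattern() + factorize()
  if (ldlt.info() != Eigen::Success) return 1;
  Eigen::VectorXd x = ldlt.solve(b);
  std::cout << "residual = " << (A * x - b).norm() << std::endl;
}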
* */ template diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index e71a13b89..8c1b3e8bc 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -14,22 +14,21 @@ namespace Eigen { namespace internal { -/** - * \brief Performs numeric block updates from a given supernode to a single column - * - * \param segsize Size of the segment (and blocks ) to use for updates - * \param[in,out] dense Packed values of the original matrix - * \param tempv temporary vector to use for updates - * \param lusup array containing the supernodes - * \param lda Leading dimension in the supernode - * \param nrow Number of rows in the rectangular part of the supernode - * \param lsub compressed row subscripts of supernodes - * \param lptr pointer to the first column of the current supernode in lsub - * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode - * \return 0 on success - */ template struct LU_kernel_bmod { + /** \internal + * \brief Performs numeric block updates from a given supernode to a single column + * + * \param segsize Size of the segment (and blocks ) to use for updates + * \param[in,out] dense Packed values of the original matrix + * \param tempv temporary vector to use for updates + * \param lusup array containing the supernodes + * \param lda Leading dimension in the supernode + * \param nrow Number of rows in the rectangular part of the supernode + * \param lsub compressed row subscripts of supernodes + * \param lptr pointer to the first column of the current supernode in lsub + * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode + */ template static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros); diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h index eb3e17330..ee97299af 100644 --- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h +++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h @@ -95,10 +95,10 @@ template struct GoogleSparseHashMapTraits * * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access * - * \param SparseMatrixType the type of the sparse matrix we are updating - * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage. + * \tparam SparseMatrixType the type of the sparse matrix we are updating + * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage. * Its default value depends on the system. - * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object + * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object * as a power of two exponent. 
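// A usage sketch for the RandomSetter documented here, assuming the default
// map-based temporary storage: writes may arrive in any order, and they are
// committed to the wrapped sparse matrix when the setter goes out of scope.
#include <Eigen/SparseCore>
#include <unsupported/Eigen/SparseExtra>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> m(100, 100);
  {
    Eigen::RandomSetter<Eigen::SparseMatrix<double> > setter(m);
    setter(13, 7)  = 1.0;     // random-access insertion
    setter(7, 13)  = 2.0;
    setter(13, 7) += 3.0;     // in-place update of an existing entry
  }                           // <- temporary storage flushed into 'm' here
  std::cout << "nonzeros: " << m.nonZeros() << std::endl;   // 2
}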
* * This class temporarily represents a sparse matrix object using a generic map implementation allowing for From 715f6f049fb449df57484a36711d9fde8a5d1be0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 3 Jan 2016 21:56:30 +0100 Subject: [PATCH 02/52] Improve inline documentation of SparseCompressedBase and its derived classes --- Eigen/src/Core/Ref.h | 1 + Eigen/src/SparseCore/SparseCompressedBase.h | 10 +++++ Eigen/src/SparseCore/SparseMap.h | 41 ++++++++++++++++++++- Eigen/src/SparseCore/SparseRef.h | 25 +++++++++---- 4 files changed, 69 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index ae414204e..6e94181f3 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -207,6 +207,7 @@ template class Ref EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else + /** Implicit constructor from any dense expression */ template inline Ref(DenseBase& expr) #endif diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index c223e4f42..f78d7c24d 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -22,6 +22,16 @@ struct traits > : traits } // end namespace internal +/** \ingroup SparseCore_Module + * \class SparseCompressedBase + * \brief Common base class for sparse [compressed]-{row|column}-storage format. + * + * This class defines the common interface for all derived classes implementing the compressed sparse storage format, such as: + * - SparseMatrix + * - Ref + * - Map + * + */ template class SparseCompressedBase : public SparseMatrixBase diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 36c09ab0c..eb241c3e2 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -37,11 +37,15 @@ struct traits, Options, St }; } // end namespace internal - + template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class SparseMapBase; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseCompressedBase @@ -71,22 +75,33 @@ class SparseMapBase public: + /** \copydoc SparseMatrixBase::rows() */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + /** \copydoc SparseMatrixBase::cols() */ inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + /** \copydoc SparseMatrixBase::innerSize() */ inline Index innerSize() const { return m_innerSize; } + /** \copydoc SparseMatrixBase::outerSize() */ inline Index outerSize() const { return m_outerSize; } + /** \copydoc SparseCompressedBase::nonZeros */ inline Index nonZeros() const { return m_zero_nnz[1]; } + /** \copydoc SparseCompressedBase::isCompressed */ bool isCompressed() const { return m_innerNonZeros==0; } //---------------------------------------- // direct access interface + /** \copydoc SparseMatrix::valuePtr */ inline const Scalar* valuePtr() const { return m_values; } + /** \copydoc SparseMatrix::innerIndexPtr */ inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + /** \copydoc SparseMatrix::outerIndexPtr */ inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + /** \copydoc SparseMatrix::innerNonZeroPtr */ inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } //---------------------------------------- + /** \copydoc SparseMatrix::coeff */ inline Scalar coeff(Index row, Index col) const { const Index outer = IsRowMajor ? row : col; @@ -125,6 +140,10 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * class SparseMapBase + * \brief Common base class for writable Map and Ref instance of sparse matrix and vector. + */ template class SparseMapBase : public SparseMapBase @@ -185,9 +204,23 @@ class SparseMapBase inline SparseMapBase() {} }; +/** \ingroup SparseCore_Module + * + * \brief Specialization of class Map for SparseMatrix-like storage. + * + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. + * + * \sa class Map, class SparseMatrix, class Ref + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > +#else +template +class Map + : public SparseMapBase +#endif { public: typedef SparseMapBase Base; @@ -196,6 +229,12 @@ class Map, Options, StrideType> public: + /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, + * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. + * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * + * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages". + */ inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index 605ca42ba..a558230e7 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -108,20 +108,25 @@ protected: /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse matrix expression referencing an existing sparse expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data + * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of class SparseMatrix. * \tparam Options specifies whether the a standard compressed format is required \c Options is \c #StandardCompressedFormat, or \c 0. * The default is \c 0. 
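// A concrete sketch of the Map<SparseMatrix> constructor documented above: it
// wraps user-owned compressed-column buffers without copying them. The 3x3
// matrix [[1,0,2],[0,3,0],[0,0,4]] and its arrays are made up for the example.
#include <Eigen/SparseCore>
#include <iostream>

int main()
{
  int    outer[4]  = {0, 1, 2, 4};          // column start offsets
  int    inner[4]  = {0, 1, 0, 2};          // row index of each stored value
  double values[4] = {1.0, 3.0, 2.0, 4.0};  // stored values, column by column

  Eigen::Map<Eigen::SparseMatrix<double> > m(3, 3, 4, outer, inner, values);
  std::cout << m.coeff(0, 2) << " " << m.nonZeros() << std::endl;   // 2 4
}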
- * \tparam StrideType Only used for dense Ref * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase // yes, that's weird to use Derived here, but that works! +#endif { typedef SparseMatrix PlainObjectType; typedef internal::traits Traits; @@ -155,6 +160,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */ template inline Ref(SparseCompressedBase& expr) #endif @@ -225,19 +231,23 @@ class Ref, Options, StrideType /** - * \ingroup Sparse_Module + * \ingroup SparseCore_Module * * \brief A sparse vector expression referencing an existing sparse vector expression * - * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data - * \tparam Options Not used for SparseVector. - * \tparam StrideType Only used for dense Ref + * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of class SparseVector. * * \sa class Ref */ +#ifndef EIGEN_PARSED_BY_DOXYGEN template class Ref, Options, StrideType > : public internal::SparseRefBase, Options, StrideType > > +#else +template +class Ref + : public SparseMapBase +#endif { typedef SparseVector PlainObjectType; typedef internal::traits Traits; @@ -259,6 +269,7 @@ class Ref, Options, StrideType > template inline Ref(const SparseCompressedBase& expr) #else + /** Implicit constructor from any 1D sparse vector expression */ template inline Ref(SparseCompressedBase& expr) #endif From 515dee0bafc69dd79cfbf9c2831e707d2214390f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 16:29:26 -0800 Subject: [PATCH 03/52] Added a 'divup' util to compute the floor of the quotient of two integers --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index f28a9699d..d6ad65070 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,6 +24,11 @@ const T2& choose(Cond, const T1&, const T2& second) { return second; } +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return (x + y - 1) / y; +} + template struct max_n_1 { static const size_t size = n; }; From cfff40b1d48999d354be34c984c83f7d0f1ca5cb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 4 Jan 2016 17:25:00 -0800 Subject: [PATCH 04/52] Improved the performance of reductions on CUDA devices --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 31 ++++++ .../CXX11/src/Tensor/TensorReductionCuda.h | 101 +++++++++++++++++- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c30980a49..2ecdccdfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -337,6 +337,16 @@ struct FullReducer { #endif +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + assert(false && "Not implemented"); + } +}; + // 
Default outer reducer template struct OuterReducer { @@ -352,6 +362,9 @@ struct OuterReducer { template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif @@ -516,6 +529,23 @@ struct TensorEvaluator, Device> // Attempt to use an optimized reduction. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reducedDims[i]; + } else { + reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return false; + } + bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -615,6 +645,7 @@ struct TensorEvaluator, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 8e250867c..b046607b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -131,7 +131,102 @@ struct FullReducer { } }; -#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +extern __shared__ float temp[]; + +template +__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + float reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index 
col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { + const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + + temp[threadIdx.x] = reduced_val; + + __syncthreads(); + const int warp_id = threadIdx.x & 31; + if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); + if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); + if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); + if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); + if (warp_id < 1) { + reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); + atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + } + } + + __syncthreads(); + } +} + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same::value; + + template + static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce floats on a gpu device"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = 32; + + LAUNCH_CUDA_KERNEL((InnerReductionKernel), + num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + } +}; + template @@ -145,7 +240,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. 
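// For reference, what the InnerReducer specialization above computes, written
// as a plain CPU loop (a sketch of the semantics only, not of the CUDA code):
// for a row-major [P x R] input, each output coefficient reduces one
// contiguous run of R inner values. The block/grid geometry of the kernel is
// derived with the new divup() helper, which rounds the integer quotient up.
#include <vector>
#include <cstdio>

int main()
{
  const int P = 3, R = 4;                        // preserved x reduced extents
  std::vector<float> in(P * R, 1.0f);            // dummy input data

  std::vector<float> out(P, 0.0f);               // reducer.initialize()
  for (int p = 0; p < P; ++p)
    for (int r = 0; r < R; ++r)
      out[p] += in[p * R + r];                   // reducer.reduce(val, &acc)

  std::printf("%g %g %g\n", out[0], out[1], out[2]);   // 4 4 4
}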
- const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; @@ -189,8 +284,6 @@ struct OuterReducer { } }; -#undef DIVUP - #endif From 54bf582303407387f9a34d99c8993aa3255274ec Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 6 Jan 2016 11:59:24 +0100 Subject: [PATCH 05/52] bug #1143: Work-around gcc bug --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 6238676e5..70c987afa 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -516,7 +516,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ Col [col].start = p [col] ; Col [col].length = p [col+1] - p [col] ; - if (Col [col].length < 0) + if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; From ee738321aa6c13f327821f4a4b1aaa4ead635687 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 6 Jan 2016 14:49:40 +0100 Subject: [PATCH 06/52] rm remaining debug code --- Eigen/src/Core/util/Constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9e6816021..5f71ba3df 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum Foo { +enum { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */ From 213459d81850f98f3822624ae84c1f420f12092c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:47:45 -0800 Subject: [PATCH 07/52] Optimized the performance of broadcasting of scalars. --- .../CXX11/src/Tensor/TensorBroadcasting.h | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc64959e1..0c95e5c0b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -46,6 +46,21 @@ struct nested, 1, typename eval type; }; +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + } // end namespace internal @@ -103,7 +118,7 @@ struct TensorEvaluator, Device> // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. 
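// A hedged sketch of the work-around mentioned in the comment above, using the
// unsupported CXX11 Tensor API: a rank-0 reduction result is first reshaped to
// a one-element tensor and then broadcast (sizes are made up for the example).
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(1.5f);

  Eigen::array<Eigen::Index, 2> one_by_one = {{1, 1}};
  Eigen::array<Eigen::Index, 2> bcast      = {{2, 3}};

  // t.sum() is rank-0; reshape it to 1x1 before broadcasting it back to 2x3.
  Eigen::Tensor<float, 2> total = t.sum().reshape(one_by_one).broadcast(bcast);

  std::cout << total(0, 0) << " " << total(1, 2) << std::endl;   // 9 9
}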
EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -143,6 +158,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { @@ -214,6 +233,10 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { From 0cb2ca5de216f258cd3106cf18d4aada9064ce13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 6 Jan 2016 18:50:28 -0800 Subject: [PATCH 08/52] Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0c95e5c0b..9ec20a99e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -234,7 +234,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { if (internal::is_input_scalar::type>::value) { - return m_impl.coeff(0); + return internal::pset1(m_impl.coeff(0)); } if (static_cast(Layout) == static_cast(ColMajor)) { From 6639b7d6e86ef36f6f78cf51e36efa5a004154eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Jan 2016 18:45:19 -0800 Subject: [PATCH 09/52] Removed a couple of partial specialization that confuse nvcc and result in errors such as this: error: more than one partial specialization matches the template argument list of class "Eigen::internal::get<3, Eigen::internal::numeric_list>" "Eigen::internal::get>" "Eigen::internal::get>" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 3f149c6a3..4d99f786c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -109,11 +109,9 @@ template struct get; template struct get> : get> {}; template struct get<0, type_list> { typedef a type; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; template struct get<0, numeric_list> { constexpr static T value = a; }; -template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ From f9d71a172992cfda5e2733f9f4a6e12a14b9ed73 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 8 Jan 2016 22:24:45 +0100 Subject: [PATCH 10/52] extend matlab conversion table --- doc/AsciiQuickReference.txt | 110 +++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git 
a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index b5bdfa1f4..c604e575c 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -32,17 +32,19 @@ A << 1, 2, 3, // Initialize A. The elements can also be B << A, A, A; // B is three horizontally stacked A's. A.fill(10); // Fill A with all 10's. -// Eigen // Matlab -MatrixXd::Identity(rows,cols) // eye(rows,cols) -C.setIdentity(rows,cols) // C = eye(rows,cols) -MatrixXd::Zero(rows,cols) // zeros(rows,cols) -C.setZero(rows,cols) // C = ones(rows,cols) -MatrixXd::Ones(rows,cols) // ones(rows,cols) -C.setOnes(rows,cols) // C = ones(rows,cols) -MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). -C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 -VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' -v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +// Eigen // Matlab +MatrixXd::Identity(rows,cols) // eye(rows,cols) +C.setIdentity(rows,cols) // C = eye(rows,cols) +MatrixXd::Zero(rows,cols) // zeros(rows,cols) +C.setZero(rows,cols) // C = ones(rows,cols) +MatrixXd::Ones(rows,cols) // ones(rows,cols) +C.setOnes(rows,cols) // C = ones(rows,cols) +MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). +C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 +VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' +v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' +VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi + low,low+step*(size-1)) // Matrix slicing and blocks. All expressions listed here are read/write. @@ -85,13 +87,15 @@ P.bottomRightCorner() // P(end-rows+1:end, end-cols+1:end) R.row(i) = P.col(j); // R(i, :) = P(:, i) R.col(j1).swap(mat1.col(j2)); // R(:, [j1 j2]) = R(:, [j2, j1]) -// Views, transpose, etc; all read-write except for .adjoint(). +// Views, transpose, etc; // Eigen // Matlab R.adjoint() // R' -R.transpose() // R.' or conj(R') -R.diagonal() // diag(R) +R.transpose() // R.' or conj(R') // Read-write +R.diagonal() // diag(R) // Read-write x.asDiagonal() // diag(x) -R.transpose().colwise().reverse(); // rot90(R) +R.transpose().colwise().reverse() // rot90(R) // Read-write +R.replicate(i,j) // repmat(P,i,j) + // All the same as Matlab, but matlab doesn't have *= style operators. // Matrix-vector. Matrix-matrix. Matrix-scalar. @@ -103,37 +107,39 @@ a *= M; R = P + Q; R = P/s; R -= Q; R /= s; // Vectorized operations on each element independently -// Eigen // Matlab -R = P.cwiseProduct(Q); // R = P .* Q -R = P.array() * s.array();// R = P .* s -R = P.cwiseQuotient(Q); // R = P ./ Q -R = P.array() / Q.array();// R = P ./ Q -R = P.array() + s.array();// R = P + s -R = P.array() - s.array();// R = P - s -R.array() += s; // R = R + s -R.array() -= s; // R = R - s -R.array() < Q.array(); // R < Q -R.array() <= Q.array(); // R <= Q -R.cwiseInverse(); // 1 ./ P -R.array().inverse(); // 1 ./ P -R.array().sin() // sin(P) -R.array().cos() // cos(P) -R.array().pow(s) // P .^ s -R.array().square() // P .^ 2 -R.array().cube() // P .^ 3 -R.cwiseSqrt() // sqrt(P) -R.array().sqrt() // sqrt(P) -R.array().exp() // exp(P) -R.array().log() // log(P) -R.cwiseMax(P) // max(R, P) -R.array().max(P.array()) // max(R, P) -R.cwiseMin(P) // min(R, P) -R.array().min(P.array()) // min(R, P) -R.cwiseAbs() // abs(P) -R.array().abs() // abs(P) -R.cwiseAbs2() // abs(P.^2) -R.array().abs2() // abs(P.^2) -(R.array() < s).select(P,Q); // (R < s ? 
P : Q) +// Eigen // Matlab +R = P.cwiseProduct(Q); // R = P .* Q +R = P.array() * s.array(); // R = P .* s +R = P.cwiseQuotient(Q); // R = P ./ Q +R = P.array() / Q.array(); // R = P ./ Q +R = P.array() + s.array(); // R = P + s +R = P.array() - s.array(); // R = P - s +R.array() += s; // R = R + s +R.array() -= s; // R = R - s +R.array() < Q.array(); // R < Q +R.array() <= Q.array(); // R <= Q +R.cwiseInverse(); // 1 ./ P +R.array().inverse(); // 1 ./ P +R.array().sin() // sin(P) +R.array().cos() // cos(P) +R.array().pow(s) // P .^ s +R.array().square() // P .^ 2 +R.array().cube() // P .^ 3 +R.cwiseSqrt() // sqrt(P) +R.array().sqrt() // sqrt(P) +R.array().exp() // exp(P) +R.array().log() // log(P) +R.cwiseMax(P) // max(R, P) +R.array().max(P.array()) // max(R, P) +R.cwiseMin(P) // min(R, P) +R.array().min(P.array()) // min(R, P) +R.cwiseAbs() // abs(P) +R.array().abs() // abs(P) +R.cwiseAbs2() // abs(P.^2) +R.array().abs2() // abs(P.^2) +(R.array() < s).select(P,Q ); // (R < s ? P : Q) +R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) + // Reductions. int r, c; @@ -164,12 +170,12 @@ x.dot(y) // dot(x, y) x.cross(y) // cross(x, y) Requires #include //// Type conversion -// Eigen // Matlab -A.cast(); // double(A) -A.cast(); // single(A) -A.cast(); // int32(A) -A.real(); // real(A) -A.imag(); // imag(A) +// Eigen // Matlab +A.cast(); // double(A) +A.cast(); // single(A) +A.cast(); // int32(A) +A.real(); // real(A) +A.imag(); // imag(A) // if the original type equals destination type, no work is done // Note that for most operations Eigen requires all operands to have the same type: From 53749ff4152191d2f1bd56090a14f6474fe059c2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 13:53:40 -0800 Subject: [PATCH 11/52] Prevent nvcc from miscompiling the cuda metakernel. Unfortunately this reintroduces some compulation warnings but it's much better than having to deal with random assertion failures. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +----- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 ++++------------ .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 4d7570077..af140a68b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,14 +238,10 @@ struct GpuDevice { }; -#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else -#define LAUNCH_CUDA_KERNEL(...) \ - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif + // FIXME: Should be device and kernel specific. 
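// Two of the Matlab equivalences added to the quick reference above, checked
// on concrete values (the sizes and data are made up for the example):
#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Matlab: v = 2:3:11   ->   [2 5 8 11]
  int low = 2, step = 3, hi = 11;
  int size = (hi - low) / step + 1;
  Eigen::VectorXi v = Eigen::VectorXi::LinSpaced(size, low, low + step * (size - 1));

  // Matlab: R(Q==0) = P(Q==0), i.e. keep R where Q is non-zero
  Eigen::MatrixXd Q = Eigen::MatrixXd::Identity(3, 3);
  Eigen::MatrixXd P = Eigen::MatrixXd::Constant(3, 3, 7.0);
  Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3, 3);
  R = (Q.array() == 0).select(P, R);

  std::cout << v.transpose() << "\n" << R << std::endl;
}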
#ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c28078882..d93e1de1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,9 +213,8 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -228,17 +227,13 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run( LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } /*static*/ template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } #endif // __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index b046607b9..558d0c83d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,11 +115,11 @@ struct FullReducer { internal::is_same::value; template - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { assert(false && "Should only be called on floats"); } - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); From 3358dfd5dd6c98fda9be133d1c4958ac00221006 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:28:53 -0800 Subject: [PATCH 12/52] Reworked the dispatch of optimized cuda reduction kernels to workaround a nvcc bug that prevented the code from compiling in optimized mode in some cases --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 12 +++++------- 1 file changed, 5 
insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2ecdccdfd..1c721de6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -537,12 +536,12 @@ struct TensorEvaluator, Device> reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; } } - if (internal::InnerReducer::HasOptimizedImplementation && + if (internal::InnerReducer::HasOptimizedImplementation && (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } @@ -554,16 +553,15 @@ struct TensorEvaluator, Device> preserving_inner_dims &= m_reducedDims[i]; } } - if (internal::OuterReducer::HasOptimizedImplementation && + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } } -#endif return true; } From d726e864ac91a1b5d5128972ee9e169966f2fa51 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:38:14 -0800 Subject: [PATCH 13/52] Made it possible to use array of size 0 on CUDA devices --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index ab9c2ec3e..456b34d0b 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -132,13 +132,13 @@ template class array { return *static_cast(NULL); } - static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - array(std::initializer_list l) { + EIGEN_DEVICE_FUNC array(std::initializer_list l) { eigen_assert(l.size() == 0); } #endif From e76904af1b0f6aef80b68492534404f534ce0240 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 8 Jan 2016 16:50:57 -0800 Subject: [PATCH 14/52] 
Simplified the dispatch code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c721de6d..fd7064459 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -342,7 +342,7 @@ template struct InnerReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -352,7 +352,7 @@ template struct OuterReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -527,6 +527,7 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. +#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -562,6 +563,7 @@ struct TensorEvaluator, Device> return false; } } +#endif return true; } From 8b9dc9f0dfb44c2c4ad6d02fb88ecce0986cd154 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 9 Jan 2016 08:30:38 +0100 Subject: [PATCH 15/52] bug #1144: fix regression in x=y+A*x (aliasing), and move evaluator_traits::AssumeAliasing to evaluator_assume_aliasing. --- Eigen/src/Core/AssignEvaluator.h | 14 ++++---- Eigen/src/Core/CoreEvaluators.h | 8 ++--- Eigen/src/Core/ProductEvaluators.h | 25 ++++++-------- Eigen/src/Core/SelfAdjointView.h | 2 -- Eigen/src/Core/TriangularMatrix.h | 4 --- Eigen/src/Geometry/Homogeneous.h | 1 - Eigen/src/SparseCore/SparseSelfAdjointView.h | 2 -- Eigen/src/SparseQR/SparseQR.h | 1 - test/product.h | 35 +++++++++++++++----- test/product_large.cpp | 23 +++++++++++++ test/product_notemporary.cpp | 6 ++++ 11 files changed, 77 insertions(+), 44 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 9dfffbcc4..f6632de69 100755 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -682,9 +682,9 @@ template< typename DstXprType, typename SrcXprType, typename Functor, struct Assignment; -// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition. -// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated. -// So this intermediate function removes everything related to AssumeAliasing such that Assignment +// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition. +// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated. +// So this intermediate function removes everything related to "assume-aliasing" such that Assignment // does not has to bother about these annoying details. 
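// The user-visible consequence of the "assume-aliasing" dispatch described in
// the comment above, as a small sketch (sizes are made up): a product appearing
// on the right-hand side is evaluated into a temporary by default, so the
// destination may legally appear on both sides, exactly the x = y + A*x case of
// bug #1144 exercised by the new tests further below.
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::VectorXd y = Eigen::VectorXd::Random(4);
  Eigen::VectorXd x = Eigen::VectorXd::Random(4);
  Eigen::VectorXd z = x;                        // keep a copy of the old x

  x = y + A * x;                                // safe: A*x goes through a temporary
  std::cout << (x - (y + A * z)).norm() << std::endl;   // ~0

  Eigen::VectorXd w(4);
  w.noalias() = y + A * z;                      // no temporary when aliasing is ruled out
}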
template @@ -698,21 +698,21 @@ EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) call_assignment(dst, src, internal::assign_op()); } -// Deal with AssumeAliasing +// Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==1, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::AssumeAliasing==0, void*>::type = 0) +EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } -// by-pass AssumeAliasing +// by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index f97dc33de..8bd73b814 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -63,10 +63,6 @@ struct evaluator_traits_base // by default, get evaluator kind and shape from storage typedef typename storage_kind_to_evaluator_kind::StorageKind>::Kind Kind; typedef typename storage_kind_to_shape::StorageKind>::Shape Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; // Default evaluator traits @@ -75,6 +71,10 @@ struct evaluator_traits : public evaluator_traits_base { }; +template::Shape > +struct evaluator_assume_aliasing { + static const bool value = false; +}; // By default, we assume a unary expression: template diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 794038a2a..b2a0a4b4f 100755 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -38,10 +38,9 @@ struct evaluator > // Catch scalar * ( A * B ) and transform it to (A*scalar) * B // TODO we should apply that rule only if that's really helpful template -struct evaluator_traits, const Product > > - : evaluator_traits_base, const Product > > +struct evaluator_assume_aliasing, const Product > > { - enum { AssumeAliasing = 1 }; + static const bool value = true; }; template struct evaluator, const Product > > @@ -81,17 +80,8 @@ template< typename Lhs, typename Rhs, struct generic_product_impl; template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 1 }; -}; - -template -struct evaluator_traits > - : evaluator_traits_base > -{ - enum { AssumeAliasing = 0 }; +struct evaluator_assume_aliasing > { + static const bool value = true; }; // This is the default evaluator implementation for products: @@ -189,6 +179,13 @@ struct Assignment" expression to save one temporary // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct +// TODO enable it for "Dense ?= xpr - Product<>" as well. 
+ +template +struct evaluator_assume_aliasing, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; template struct assignment_from_xpr_plus_product diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 87e87ab3a..e709eb213 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -203,8 +203,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 7d6a97848..27845e89c 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -704,10 +704,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef typename glue_shapes::Shape, TriangularShape>::type Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; template diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h index 367fd3930..cd52b5470 100644 --- a/Eigen/src/Geometry/Homogeneous.h +++ b/Eigen/src/Geometry/Homogeneous.h @@ -304,7 +304,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef HomogeneousShape Shape; - static const int AssumeAliasing = 0; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 46c6ce1d3..975cefd28 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -211,8 +211,6 @@ struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseSelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; struct SparseSelfAdjoint2Sparse {}; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 4f26c19ca..0d448d02e 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -691,7 +691,6 @@ struct evaluator_traits > typedef typename SparseQRType::MatrixType MatrixType; typedef typename storage_kind_to_evaluator_kind::Kind Kind; typedef SparseShape Shape; - static const int AssumeAliasing = 0; }; template< typename DstXprType, typename SparseQRType> diff --git a/test/product.h b/test/product.h index 9dfff9303..bd92309d2 100644 --- a/test/product.h +++ b/test/product.h @@ -145,14 +145,31 @@ template void product(const MatrixType& m) VERIFY_IS_APPROX(res.col(r).noalias() = square * square.col(r), (square * square.col(r)).eval()); // inner product - Scalar x = square2.row(c) * square2.col(c2); - VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); - + { + Scalar x = square2.row(c) * square2.col(c2); + VERIFY_IS_APPROX(x, square2.row(c).transpose().cwiseProduct(square2.col(c2)).sum()); + } + // outer product - VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); - VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.leftCols(1) * 
m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); - VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + { + VERIFY_IS_APPROX(m1.col(c) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.row(r).transpose() * m1.col(c).transpose(), m1.block(r,0,1,cols).transpose() * m1.block(0,c,rows,1).transpose()); + VERIFY_IS_APPROX(m1.block(0,c,rows,1) * m1.row(r), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.block(r,0,1,cols), m1.block(0,c,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.leftCols(1) * m1.row(r), m1.block(0,0,rows,1) * m1.block(r,0,1,cols)); + VERIFY_IS_APPROX(m1.col(c) * m1.topRows(1), m1.block(0,c,rows,1) * m1.block(0,0,1,cols)); + } + + // Aliasing + { + ColVectorType x(cols); x.setRandom(); + ColVectorType z(x); + ColVectorType y(cols); y.setZero(); + ColSquareMatrixType A(cols,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z); + } } diff --git a/test/product_large.cpp b/test/product_large.cpp index 7207973c2..98f84c53b 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -9,6 +9,27 @@ #include "product.h" +template +void test_aliasing() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + typedef Matrix MatrixType; + typedef Matrix VectorType; + VectorType x(cols); x.setRandom(); + VectorType z(x); + VectorType y(rows); y.setZero(); + MatrixType A(rows,cols); A.setRandom(); + // CwiseBinaryOp + VERIFY_IS_APPROX(x = y + A*x, A*z); // OK because "y + A*x" is marked as "assume-aliasing" + x = z; + // CwiseUnaryOp + VERIFY_IS_APPROX(x = T(1.)*(A*x), A*z); // OK because 1*(A*x) is replaced by (1*A*x) which is a Product<> expression + x = z; + // VERIFY_IS_APPROX(x = y-A*x, -A*z); // Not OK in 3.3 because x is resized before A*x gets evaluated + x = z; +} + void test_product_large() { for(int i = 0; i < g_repeat; i++) { @@ -17,6 +38,8 @@ void test_product_large() CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + CALL_SUBTEST_1( test_aliasing() ); } #if defined EIGEN_TEST_PART_6 diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index ff93cb881..5a3f3a01a 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -43,10 +43,16 @@ template void product_notemporary(const MatrixType& m) r1 = internal::random(8,rows-r0); VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()), 1); + VERIFY_EVALUATION_COUNT( m3 = (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m1 * m2.adjoint(), 0); + VERIFY_EVALUATION_COUNT( m3 = s1 * (m1 * m2.transpose()), 1); +// VERIFY_EVALUATION_COUNT( m3 = m3 + s1 * (m1 * m2.transpose()), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0); + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1); + + VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0); From 
403a7cb6c34d163e4f120387b5dc5487d30bb1d5 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:39:13 -0500 Subject: [PATCH 16/52] Alternative way of forcing instantiation of device kernels without causing warnings or requiring device to device kernel invocations. This allows Tensorflow to work on SM 3.0 (ie, Amazon EC2) machines. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 10 ++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index af140a68b..359a01b8f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -242,6 +242,16 @@ struct GpuDevice { (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); +#ifndef __CUDA_ARCH__ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); +#else +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#endif + // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..9a66e81f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -506,7 +506,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..374edb605 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -116,7 +116,7 @@ struct FullReducer { template static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(false && "Should only be called on floats"); + eigen_assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; From 91678f489a89b1f4a393cd3b703e67922b5c4257 Mon Sep 17 00:00:00 2001 From: Jeremy Barnes Date: Sun, 10 Jan 2016 22:44:45 -0500 Subject: [PATCH 17/52] Cleaned up double-defined macro from last commit --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 359a01b8f..c74613873 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -238,10 +238,6 @@ struct GpuDevice { }; -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - #ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ From 780623261eedd996404795dfb7928e680408adb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 09:07:14 -0800 Subject: [PATCH 18/52] Re-enabled the optimized reduction CUDA code. --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index fd7064459..cea32d05f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -527,7 +527,6 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. 
-#if 0 else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { @@ -563,7 +562,6 @@ struct TensorEvaluator, Device> return false; } } -#endif return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 558d0c83d..198b3604c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; @@ -222,7 +222,7 @@ struct InnerReducer { const int num_per_thread = 128; const int num_blocks = 32; - LAUNCH_CUDA_KERNEL((InnerReductionKernel), + LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; @@ -279,7 +279,7 @@ struct OuterReducer { device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((OuterReductionKernel), + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; From 2ccb1c86342203cbc25a587590149d2cf5175900 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 10:36:37 -0800 Subject: [PATCH 19/52] Fixed a bug in the dispatch of optimized reduction kernels. --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index cea32d05f..7bd2326b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -438,19 +438,18 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Bitmap indicating if an input dimension is reduced or not. - array reduced; + // Build the bitmap indicating if an input dimension is reduced or not. for (int i = 0; i < NumInputDims; ++i) { - reduced[i] = false; + m_reduced[i] = false; } for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op.dims()[i] >= 0); eigen_assert(op.dims()[i] < NumInputDims); - reduced[op.dims()[i]] = true; + m_reduced[op.dims()[i]] = true; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer::run(input_dims, reduced, &m_dimensions, &m_reducedDims); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. 
if (NumOutputDims > 0) { @@ -485,7 +484,7 @@ struct TensorEvaluator, Device> int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { + if (m_reduced[i]) { m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { @@ -531,9 +530,9 @@ struct TensorEvaluator, Device> bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - reducing_inner_dims &= m_reducedDims[i]; + reducing_inner_dims &= m_reduced[i]; } else { - reducing_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; } } if (internal::InnerReducer::HasOptimizedImplementation && @@ -548,9 +547,9 @@ struct TensorEvaluator, Device> bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { - preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; } else { - preserving_inner_dims &= m_reducedDims[i]; + preserving_inner_dims &= m_reduced[i]; } } if (internal::OuterReducer::HasOptimizedImplementation && @@ -689,6 +688,8 @@ struct TensorEvaluator, Device> return startInput; } + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. From b523771a24320014abfec537b0f4b568c19882eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 14:25:43 -0800 Subject: [PATCH 20/52] Silenced several compilation warnings triggered by nvcc. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 48 ++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 ++-- .../CXX11/src/Tensor/TensorReductionCuda.h | 12 ++--- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c74613873..0f67f0f57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -10,7 +10,6 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - namespace Eigen { // This defines an interface that GPUDevice can take to use @@ -206,20 +205,45 @@ struct GpuDevice { #endif } - inline int getNumCudaMultiProcessors() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().multiProcessorCount; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerMultiProcessor() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerMultiProcessor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int sharedMemPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int 
sharedMemPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().sharedMemPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int majorDeviceVersion() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().major; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } // This function checks if the CUDA runtime recorded an error for the @@ -239,13 +263,13 @@ struct GpuDevice { }; #ifndef __CUDA_ARCH__ -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); #else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); #endif @@ -260,4 +284,4 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93e1de1b..d2ab70f2b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; template class TensorExecutor { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,7 +213,7 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -232,7 +232,7 @@ inline void TensorExecutor::run(const Expression& /*static*/ template -inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 3fa3d5c3c..867654aff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,8 +115,8 @@ struct FullReducer { internal::is_same::value; template - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - eigen_assert(false && "Should only be called on floats"); + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -210,11 +210,11 @@ struct InnerReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -264,11 +264,11 @@ struct OuterReducer { internal::is_same::value; template - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; From 0504c56ea7b1dc4d804580cbc498b189ffd06b6e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:49:21 -0800 Subject: [PATCH 21/52] Silenced a nvcc compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 867654aff..91b6e68dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -119,7 +119,7 @@ struct FullReducer { assert(false && "Should only be called on floats"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); From 01c55d37e69ca3c45f4390b7e15c310028e8b8ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 15:53:19 -0800 Subject: [PATCH 22/52] Deleted unused variable. 
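Note: the variable removed below was computed but never read, which triggers set-but-unused warnings under nvcc and clang. A rough sketch of the two usual remedies, assuming the diagnostic fires on a local such as num_coeffs (option 2 uses the EIGEN_UNUSED_VARIABLE helper already employed elsewhere in this series):

    // Option 1 (what this patch does): drop the dead computation entirely.
    // Option 2: keep the value for readability but mark it as intentionally unused.
    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    EIGEN_UNUSED_VARIABLE(num_coeffs)
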
--- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 91b6e68dd..59404dba5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -217,7 +217,6 @@ struct InnerReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; From 4f7714d72cb8a34dd42081a6ece310c984392354 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:01:00 -0800 Subject: [PATCH 23/52] Enabled the use of fixed dimensions from within a cuda kernel. --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index f3c9a3148..2692563a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -110,14 +110,14 @@ struct Sizes : internal::numeric_list { return internal::arg_prod(Indices...); } - Sizes() { } + EIGEN_DEVICE_FUNC Sizes() { } template - explicit Sizes(const array& /*indices*/) { + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex...) { } - explicit Sizes(std::initializer_list /*l*/) { + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { // todo: add assertion } #endif From f894736d61198eef9d6463d3cdde8dbe1171a56b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 16:42:18 -0800 Subject: [PATCH 24/52] Updated the tensor traits: the alignment is not part of the Flags enum anymore --- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 7a9568b36..2f06f8442 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -20,7 +20,7 @@ class compute_tensor_flags enum { is_dynamic_size_storage = 1, - aligned_bit = + is_aligned = ( ((Options&DontAlign)==0) && ( #if EIGEN_MAX_STATIC_ALIGN_BYTES>0 @@ -35,12 +35,12 @@ class compute_tensor_flags 0 #endif ) - ) ? AlignedBit : 0, - packet_access_bit = packet_traits::Vectorizable && aligned_bit ? PacketAccessBit : 0 + ), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? PacketAccessBit : 0 }; public: - enum { ret = packet_access_bit | aligned_bit}; + enum { ret = packet_access_bit}; }; @@ -86,7 +86,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; @@ -102,7 +102,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; From c5e6900400663901013dc48a3492e756b21c131e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:06:39 -0800 Subject: [PATCH 25/52] Silenced a few compilation warnings. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6 +++++- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 90ee50678..126a9e4ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1261,7 +1261,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,10 +1313,11 @@ struct TensorEvaluator + template EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) // rows in left side const Index m = this->m_i_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 0f67f0f57..5abdc489b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -275,10 +275,14 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. #ifdef __CUDACC__ -static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 59404dba5..89f055134 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -220,6 +220,8 @@ struct InnerReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = 32; + EIGEN_UNUSED_VARIABLE(block_size) + EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); From bbdabbb37976a37ab585d266691100bdd035f30b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:26:56 -0800 Subject: [PATCH 26/52] Made the blas utils usable from within a cuda kernel --- Eigen/src/Core/util/BlasUtil.h | 42 +++++++++++++++++----------------- Eigen/src/Core/util/Memory.h | 7 +++--- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index d00fa9707..498db3a70 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -123,18 +123,18 @@ template struct get_factor::R template class BlasVectorMapper { public: - EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) 
: m_data(data) {} - EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; } template - EIGEN_ALWAYS_INLINE Packet load(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const { return ploadt(m_data + i); } template - bool aligned(Index i) const { + EIGEN_DEVICE_FUNC bool aligned(Index i) const { return (size_t(m_data+i)%sizeof(Packet))==0; } @@ -148,25 +148,25 @@ class BlasLinearMapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } - EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { pstoret(m_data + i, p); } @@ -184,18 +184,18 @@ class blas_data_mapper { typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_ALWAYS_INLINE blas_data_mapper + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { return blas_data_mapper(&operator()(i, j), m_stride); } - EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(&operator()(i, j)); } @@ -205,28 +205,28 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { return ploadt(&operator()(i, j)); } template - EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); } template - EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { return pgather(&operator()(i, j), m_stride); } - const Index stride() const { return m_stride; } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } - Index firstAligned(Index size) const { + EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { if (size_t(m_data)%sizeof(Scalar)) { return -1; } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1a899ea6c..823e077af 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -524,7 +524,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_align * \sa first_default_aligned() */ template -inline Index first_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { static const Index ScalarSize = sizeof(Scalar); static const Index AlignmentSize = Alignment / ScalarSize; @@ -544,14 +544,15 @@ inline Index first_aligned(const Scalar* array, Index size) } else { - return std::min( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); + Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask; + return (first < size) ? first : size; } } /** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement. * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase) */ template -inline Index first_default_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) { typedef typename packet_traits::type DefaultPacketType; return first_aligned::alignment>(array, size); From bd7d901da9bd824ee2a7a94d3b0c8668d77f5ff2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Jan 2016 17:49:44 -0800 Subject: [PATCH 27/52] Reverted a previous change that tripped nvcc when compiling in debug mode. 
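Background (an interpretation, not spelled out in the original message): EIGEN_DEVICE_FUNC expands to __host__ __device__ under nvcc, while the evaluation entry points touched below launch kernels with the host-only <<<...>>> syntax, which debug builds appear to reject once the annotation is present. Where a member genuinely needs the annotation, the pattern used elsewhere in this series is to guard the host-only body, roughly:

    // Sketch of the host/device guard idiom (see the GpuDevice accessors patched earlier):
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    #ifndef __CUDA_ARCH__
      return stream_->deviceProperties().maxThreadsPerBlock;  // host-side query
    #else
      eigen_assert(false && "The default device should be used instead to generate kernel code");
      return 0;  // unreachable device stub
    #endif
    }
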
--- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 126a9e4ad..a5f3debc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1274,7 +1274,7 @@ struct TensorEvaluatorm_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { @@ -1313,7 +1313,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC + template void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; @@ -1362,7 +1362,7 @@ struct TensorEvaluator), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); } else { - const Index m_blocks = (m + 127) / 128; + const Index m_blocks = (m + 127) / 128; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 block_size(8, 32, 1); From d920d57f38e07739403a6c1e224c74fec5a36e6f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 11:32:27 -0800 Subject: [PATCH 28/52] Improved the performance of the contraction of a 2d tensor with a 1d tensor by a factor of 3 or more. This helps speedup LSTM neural networks. --- .../CXX11/src/Tensor/TensorContraction.h | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index eda93a1de..63d0c6f68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -32,7 +32,7 @@ enum { template + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -144,11 +144,11 @@ class SimpleTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } - Index firstAligned(Index size) const { - return size; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + return (Alignment == Aligned) ? 0 : size; } - Index stride() const { - return 1; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous) ? 
m_contract_strides[0] : 1; } protected: @@ -165,10 +165,10 @@ template - class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -181,6 +181,7 @@ template::type Packet; typedef typename packet_traits::half HalfPacket; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption @@ -192,7 +193,7 @@ templatecomputeIndex(i, j); eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); + return this->m_tensor.template packet(index); } const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); @@ -207,7 +208,7 @@ template::value <= 1 || !inner_dim_reordered) && (last - first) == (packet_size - 1)) { - return this->m_tensor.template packet(first); + return this->m_tensor.template packet(first); } EIGEN_ALIGN_MAX Scalar data[packet_size]; @@ -223,6 +224,7 @@ template(data); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { // whole method makes column major assumption @@ -230,7 +232,7 @@ template::size; if (half_packet_size == packet_size) { - return loadPacket(i, j); + return loadPacket(i, j); } EIGEN_ALIGN_MAX Scalar data[half_packet_size]; for (Index k = 0; k < half_packet_size; k++) { @@ -246,10 +248,10 @@ template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -260,13 +262,13 @@ class BaseTensorContractionMapper::type Packet; - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); return pload::type>(data); } - EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { return loadPacket(i, j); } @@ -304,14 +306,14 @@ class TensorContractionSubMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { @@ -325,12 +327,12 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), 
YOU_MADE_A_PROGRAMMING_MISTAKE); - return loadPacket(i); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } template - EIGEN_DEVICE_FUNC bool aligned(Index) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } @@ -741,17 +743,19 @@ struct TensorContractionEvaluatorBase typedef TensorEvaluator RightEvaluator; const Index lhs_packet_size = internal::packet_traits::size; const Index rhs_packet_size = internal::packet_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned; typedef internal::TensorContractionInputMapper LhsMapper; + false, lhs_alignment> LhsMapper; typedef internal::TensorContractionInputMapper RhsMapper; + rhs_inner_dim_reordered, rhs_alignment> RhsMapper; LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); From 79b69b7444cfae2f7631e873e822cdca6f4e355f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Jan 2016 15:21:09 -0800 Subject: [PATCH 29/52] Trigger the optimized matrix vector path more conservatively. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 63d0c6f68..72a378dfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -145,7 +145,10 @@ class SimpleTensorContractionMapper { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - return (Alignment == Aligned) ? 0 : size; + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; From 9f013a9d86ad5cf82939bfeab2223652a821c448 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Jan 2016 14:24:37 -0800 Subject: [PATCH 30/52] Properly record the rank of reduced tensors in the tensor traits. 
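In other words, a reduction expression now reports its true output rank, NumDimensions of the input minus the number of reduced dimensions, instead of inheriting the input rank. A minimal usage sketch (index and scalar types chosen for illustration only):

    // Reducing a rank-4 tensor over two dimensions yields a rank-2 expression:
    //   NumDimensions = 4 - array_size<Dims>::value = 4 - 2 = 2
    Eigen::Tensor<float, 4> t(2, 3, 5, 7);
    Eigen::array<int, 2> dims({1, 3});           // reduce dimensions 1 and 3
    Eigen::Tensor<float, 2> sums = t.sum(dims);  // result has shape (2, 5)
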
--- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index c783aab97..781a37e34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -134,7 +134,7 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 8028e71c0..2dc8815b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -24,11 +24,14 @@ template struct traits > : traits { - typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; }; template From 8fe2532e70a8e0261717003d96d4df41ab978756 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 09:29:48 -0800 Subject: [PATCH 31/52] Fixed a boundary condition bug in the outer reduction kernel --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 89f055134..54ab34ba1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -241,7 +241,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu } // Do the reduction. - const Index max_iter = divup(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = num_preserved_coeffs * numext::maxi(1, (num_coeffs_to_reduce - NumPerThread + 1)); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; From aed4cb1269d52d0ff0e69c8aa6d89c804185b18f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Jan 2016 21:45:14 -0800 Subject: [PATCH 32/52] Use warp shuffles instead of shared memory access to speedup the inner reduction kernel. 
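For reference, each step of the shuffle loop lets a lane read the partial result held by the lane `offset` positions above it in the same warp, halving the number of live partials, so the per-warp reduction no longer needs shared memory or __syncthreads(). A self-contained sketch of the same idiom for a plain sum (helper name illustrative; uses the pre-CUDA-9 __shfl_down, as the kernel below does):

    __inline__ __device__ float warpReduceSum(float val) {
      #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);  // fetch the value held by lane (laneId + offset)
      return val;                         // lane 0 ends up holding the warp-wide sum
    }
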
--- .../CXX11/src/Tensor/TensorReductionCuda.h | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 54ab34ba1..82ea09f07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -132,8 +132,6 @@ struct FullReducer { }; -extern __shared__ float temp[]; - template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, @@ -183,17 +181,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } - temp[threadIdx.x] = reduced_val; +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + } - __syncthreads(); - const int warp_id = threadIdx.x & 31; - if (warp_id < 16) reducer.reduce(temp[threadIdx.x + 16], &temp[threadIdx.x]); - if (warp_id < 8) reducer.reduce(temp[threadIdx.x + 8], &temp[threadIdx.x]); - if (warp_id < 4) reducer.reduce(temp[threadIdx.x + 4], &temp[threadIdx.x]); - if (warp_id < 2) reducer.reduce(temp[threadIdx.x + 2], &temp[threadIdx.x]); - if (warp_id < 1) { - reducer.reduce(temp[threadIdx.x + 1], &temp[threadIdx.x]); - atomicReduce(&(output[row]), temp[threadIdx.x], reducer); + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); } } @@ -224,7 +218,7 @@ struct InnerReducer { EIGEN_UNUSED_VARIABLE(num_blocks) LAUNCH_CUDA_KERNEL((InnerReductionKernel), - num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; From 0461f0153ef66fb95d16b620675c97107ef7a342 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Jan 2016 11:22:16 -0800 Subject: [PATCH 33/52] Made it possible to compare tensor dimensions inside a CUDA kernel. 
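Concretely, dimensions_match() and the sizes_match_below_dim helpers it relies on are now annotated with EIGEN_DEVICE_FUNC (i.e. __host__ __device__ under nvcc), so a guard like the following, sketched with hypothetical operand names, can sit on a code path that is also compiled for the device:

    // Previously nvcc rejected this inside device code as a call to a host-only function.
    eigen_assert(dimensions_match(lhs.dimensions(), rhs.dimensions()));
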
--- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 2692563a9..52569a359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -405,20 +405,20 @@ template struct sizes_match_below_dim { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return false; } }; template struct sizes_match_below_dim { - static inline bool run(Dims1& dims1, Dims2& dims2) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { return (array_get(dims1) == array_get(dims2)) & sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return true; } }; @@ -427,7 +427,7 @@ struct sizes_match_below_dim { template -bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } From 34057cff23bc3b9d59117af9bea0b3e098e3eaea Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Jan 2016 15:11:56 -0800 Subject: [PATCH 34/52] Fixed a race condition that could affect some reductions on CUDA devices. --- .../CXX11/src/Tensor/TensorReductionCuda.h | 72 ++++++++++++++++--- 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 82ea09f07..2da18b147 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -76,13 +76,24 @@ __device__ inline void atomicReduce(T* output, T accum, SumReducer&) { #endif } + +template +__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + template __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output) { const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - if (first_index == 0) { + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { *output = reducer.initialize(); } @@ -126,6 +137,14 @@ struct FullReducer { const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. 
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel), + 1, 32, 0, device, reducer.initialize(), 1, output); + } + LAUNCH_CUDA_KERNEL((FullReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } @@ -150,8 +169,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } } for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { @@ -211,11 +233,25 @@ struct InnerReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = 32; - EIGEN_UNUSED_VARIABLE(block_size) - EIGEN_UNUSED_VARIABLE(num_blocks) + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } LAUNCH_CUDA_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); @@ -229,9 +265,11 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu typename Self::CoeffReturnType* output) { const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - // Initialize the output values - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } } // Do the reduction. 
@@ -266,14 +304,26 @@ struct OuterReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int dyn_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini(max_blocks, dyn_blocks); + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + LAUNCH_CUDA_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } From 5b7713dd33f5d16282f69f10c33a739554762782 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:05:10 -0800 Subject: [PATCH 35/52] Record whether the underlying tensor storage can be accessed directly during the evaluation of an expression. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 ++++ unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2 ++ unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + 28 files changed, 53 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h 
b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index ad525bac8..dc6ca4909 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -78,7 +78,8 @@ class Tensor : public TensorBase0) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; static const int Options = Options_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 781a37e34..f1ec04c49 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -89,6 +89,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -210,6 +211,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index a41d4d265..10fac0cc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : @@ -152,6 +153,8 @@ struct TensorEvaluator, Device> return m_leftImpl.template packet(index); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + private: TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ec20a99e..efca7cd79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index abc3c92ca..a209e885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -145,6 +145,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -304,6 +305,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 3d153bb94..f57d2bb7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -125,6 +125,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -287,6 +288,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 3ca7daf32..877bcd0df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -162,6 +162,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess && internal::type_casting_traits::VectorizedCast, Layout = TensorEvaluator::Layout, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a82bfc0aa..367a152a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -306,6 +306,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -752,6 +753,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0157f6fab..0f8a98caf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Devi BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) @@ -250,6 +251,7 @@ struct TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ff4373f59..e7daf7304 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -98,6 +98,7 @@ struct TensorEvaluator, Device> PacketAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -140,7 +141,7 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 902f25247..f726585b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -148,6 +149,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -207,6 +209,7 @@ struct TensorEvaluator, Device> PacketAccess = internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC @@ -257,6 +260,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -312,6 +316,7 @@ struct TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -378,6 +383,7 @@ struct TensorEvaluator internal::packet_traits::HasBlend, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 215a4ebad..3bfaf6d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -135,6 +135,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a4d6ce6b3..28cb9f02d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, - }; + RawAccess = true + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c16bf7e67..c9b0b2f28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -92,6 +92,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, + RawAccess = true }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9316c9831..96f74b992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 11e510414..2ab332add 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -168,6 +168,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 5, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ae9e9f751..52d89ad01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,7 +91,8 @@ struct TensorEvaluator, Device> BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented - }; + RawAccess = false + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f612bbd45..a37516974 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -123,6 +123,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 5c759af09..6f69da34a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,7 +49,8 @@ template class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d8c923d74..0524cf0d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -110,6 +110,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -170,6 +171,7 @@ template PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -317,6 +319,7 @@ struct TensorEvaluator, Devi PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -545,6 +548,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 91e32d200..39a305a93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 8fb53f4f2..2cbb820b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 2dc8815b8..09ee0c2c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -428,6 +428,7 @@ struct 
TensorEvaluator, Device> PacketAccess = Self::InputPacketAccess && Op::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6b25b2ba0..57197d060 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -139,6 +139,7 @@ template class TensorRef : public TensorBase, Device> PacketAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) @@ -412,6 +414,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 10328c61f..846f81e0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -239,6 +240,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 15a22aa1b..c4adb7d4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -225,6 +226,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 97b6168a9..2c2eb6515 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -258,6 +259,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 6625c66d5..52b78b261 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -181,6 +181,7 @@ struct TensorEvaluator, D BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = NumDims == 6, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) From b3b722905f3df26a34cdda4f2cee74aa62403040 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:09:47 -0800 Subject: [PATCH 36/52] Improved code indentation --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 28cb9f02d..70282dd83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -54,7 +54,7 @@ class TensorFixedSize : public TensorBase m_storage; public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } From 6d472d83754e0f16db1deb69218e10c2b21268b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:22:05 -0800 Subject: [PATCH 37/52] Moved the contraction mapping code to its own file to make the code more manageable. --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorContraction.h | 357 ----------------- .../src/Tensor/TensorContractionMapper.h | 377 ++++++++++++++++++ 3 files changed, 378 insertions(+), 357 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7481a9ddb..1c5734383 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionCuda.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 72a378dfd..506696ae9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,363 +21,6 @@ namespace Eigen { */ namespace internal { -enum { - Rhs = 0, - Lhs = 1, -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. 
- */ -template -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - Index nocontract_val = left ? row : col; - Index linidx = 0; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? col : row; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if(array_size::value > 0) { - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; - Index linidx[2] = {0, 0}; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; - } - return IndexPair(linidx[0], linidx[1]); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - // Only claim alignment when we can compute the actual stride (ie when we're - // dealing with the lhs with inner_dim_contiguous. This is because the - // matrix-vector product relies on the stride when dealing with aligned inputs. - return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; - } - - protected: - const Tensor m_tensor; - const nocontract_t m_nocontract_strides; - const nocontract_t m_ij_strides; - const contract_t m_contract_strides; - const contract_t m_k_strides; -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - // current code assumes packet size must be a multiple of 2 - EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - - if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { - const Index index = this->computeIndex(i, j); - eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); - } - - const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); - const Index first = indexPair.first; - const Index last = indexPair.second; - - // We can always do optimized packet reads from left hand side right now, because - // the vertical matrix dimension on the left hand side is never contracting. - // On the right hand side we need to check if the contracting dimensions may have - // been shuffled first. 
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { - - return this->m_tensor.template packet(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(last); - - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); - } -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename packet_traits::type Packet; - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload::type>(data); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - } -}; - -template -class TensorContractionInputMapper; - -template -class TensorContractionSubMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - typedef TensorContractionInputMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - private: - const ParentMapper& m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } -}; - - - template struct traits > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 000000000..b25b34d61 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,377 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? 
row : col; + Index linidx = 0; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if(array_size::value > 0) { + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous) ? 
m_contract_strides[0] : 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + typedef typename packet_traits::type Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H From df79c00901e2c976f79a0b1518ed9a58a48f0501 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Jan 2016 17:24:08 -0800 Subject: [PATCH 38/52] Improved the formatting of the code --- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 52d89ad01..2798956ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,7 @@ struct TensorEvaluator, Device> Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false - }; + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) From 234a1094b7839017e6cf8e8f376995ee13775e00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 09:18:44 +0100 Subject: [PATCH 39/52] Add static assertion to y(), z(), w() accessors --- Eigen/src/Core/DenseCoeffsBase.h | 36 +++++++++++++++++++++++++----- Eigen/src/Core/util/StaticAssert.h | 1 + 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 28cc1432c..423ab167d 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -191,19 +191,31 @@ class DenseCoeffsBase : public EigenBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - y() const { return (*this)[1]; } + y() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). 
*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - z() const { return (*this)[2]; } + z() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - w() const { return (*this)[3]; } + w() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } /** \internal * \returns the packet of coefficients starting at the given row and column. It is your responsibility @@ -424,19 +436,31 @@ class DenseCoeffsBase : public DenseCoeffsBase=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - z() { return (*this)[2]; } + z() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - w() { return (*this)[3]; } + w() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } }; /** \brief Base class providing direct read-only coefficient access to matrices and arrays. diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 1fe365aa7..e174509e0 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -50,6 +50,7 @@ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, + OUT_OF_RANGE_ACCESS, YOU_MADE_A_PROGRAMMING_MISTAKE, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, From 0b7169d1f70b0c18ae14d2a2bb5f165b7b675d51 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:15:59 +0100 Subject: [PATCH 40/52] bug #1147: fix compilation of PastixSupport --- Eigen/PaStiXSupport | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport index 3411dface..de3a63b4d 100644 --- a/Eigen/PaStiXSupport +++ b/Eigen/PaStiXSupport @@ -12,7 +12,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include extern "C" { #include #include From db237d0c75abe70cf8ed0f21e7eaa66474f8a9bd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:49:01 +0100 Subject: [PATCH 41/52] bug #1145: fix PastixSupport LLT/LDLT wrappers (missing resize prior to calls to selfAdjointView) --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index c8cb2c0cc..d21495e81 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -581,6 +581,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { + out.resize(matrix.rows(), matrix.cols()); // Pastix supports only lower, column-major matrices out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); @@ -666,6 +667,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> > void grabMatrix(const MatrixType& matrix, ColSpMatrix& out) { // Pastix supports only lower, column-major matrices + out.resize(matrix.rows(), 
matrix.cols()); out.template selfadjointView() = matrix.template selfadjointView(); internal::c_to_fortran_numbering(out); } From 4c5e96aab6baf32b077d8824b4ecbe592caaa8a0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 18:56:17 +0100 Subject: [PATCH 42/52] bug #1148: silent Pastix by default --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index d21495e81..7c8622c97 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -268,7 +268,7 @@ void PastixBase::init() 0, 0, 0, 1, m_iparm.data(), m_dparm.data()); m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO; - m_iparm[IPARM_VERBOSE] = 2; + m_iparm[IPARM_VERBOSE] = API_VERBOSE_NOT; m_iparm[IPARM_ORDERING] = API_ORDER_SCOTCH; m_iparm[IPARM_INCOMPLETE] = API_NO; m_iparm[IPARM_OOC_LIMIT] = 2000; From ed8ade9c65bc25f2226946e4275d62f7ceb28213 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Jan 2016 19:01:24 +0100 Subject: [PATCH 43/52] bug #1149: fix Pastix*::*parm() --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 6 +++--- test/pastix_support.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 7c8622c97..d2ebfd7bb 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -184,7 +184,7 @@ class PastixBase : public SparseSolverBase * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array& dparm() + Array& dparm() { return m_dparm; } @@ -244,8 +244,8 @@ class PastixBase : public SparseSolverBase mutable ComputationInfo m_info; mutable pastix_data_t *m_pastixdata; // Data structure for pastix mutable int m_comm; // The MPI communicator identifier - mutable Matrix m_iparm; // integer vector for the input parameters - mutable Matrix m_dparm; // Scalar vector for the input parameters + mutable Array m_iparm; // integer vector for the input parameters + mutable Array m_dparm; // Scalar vector for the input parameters mutable Matrix m_perm; // Permutation vector mutable Matrix m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index 49239e3a5..b62f85739 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -27,6 +27,14 @@ template void test_pastix_T() check_sparse_spd_solving(pastix_llt_upper); check_sparse_spd_solving(pastix_ldlt_upper); check_sparse_square_solving(pastix_lu); + + // Some compilation check: + pastix_llt_lower.iparm(); + pastix_llt_lower.dparm(); + pastix_ldlt_lower.iparm(); + pastix_ldlt_lower.dparm(); + pastix_lu.iparm(); + pastix_lu.dparm(); } // There is no support for selfadjoint matrices with PaStiX. From 47076bf00ec32c41c0a7f2ba438361ea5f0256e4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 14:51:48 -0800 Subject: [PATCH 44/52] Reduce the register pressure exerted by the tensor mappers whenever possible. This improves the performance of the contraction of a matrix with a vector by about 35%. 
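
For reference, a minimal sketch of the kind of contraction the measurement refers to
(illustrative only, not part of the patch; sizes are arbitrary). The sub-mapper now folds
its vertical/horizontal offsets into the underlying data pointer once (the new
UseDirectOffsets/offsetBuffer path), instead of re-adding them and redoing the index
arithmetic on every coefficient access:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Rank-2 x rank-1 contraction, i.e. a matrix-vector product expressed
      // with the Tensor module; this is the workload the ~35% figure refers to.
      Eigen::Tensor<float, 2> m(1024, 1024);
      Eigen::Tensor<float, 1> v(1024);
      m.setRandom();
      v.setRandom();
      Eigen::array<Eigen::IndexPair<int>, 1> dims;
      dims[0] = Eigen::IndexPair<int>(1, 0);   // contract m's columns against v
      Eigen::Tensor<float, 1> r = m.contract(v, dims);
      return r.dimension(0) == 1024 ? 0 : 1;
    }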
--- .../CXX11/src/Tensor/TensorContraction.h | 5 +- .../src/Tensor/TensorContractionMapper.h | 109 ++++++++++++++++-- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 506696ae9..575ae7b54 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -128,6 +128,7 @@ struct TensorContractionEvaluatorBase PacketAccess = (internal::packet_traits::size > 1), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; // Most of the code is assuming that both input tensors are ColMajor. If the @@ -434,11 +435,11 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } protected: // Prevent assignment diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index b25b34d61..9b6d18090 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,6 +22,54 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ + +template struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + + private: + const Tensor m_tensor; +}; + +template struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return internal::ploadt_ro(m_data + index); + } + private: + typedef typename Tensor::Scalar Scalar; + const Scalar* m_data; +}; + template::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } @@ -148,7 +204,7 @@ class SimpleTensorContractionMapper { } protected: - const Tensor m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; @@ -270,12 +326,6 @@ class BaseTensorContractionMapper -class TensorContractionInputMapper; template::type Packet; typedef typename 
packet_traits::half HalfPacket; - typedef TensorContractionInputMapper ParentMapper; + typedef BaseTensorContractionMapper ParentMapper; typedef TensorContractionSubMapper Self; typedef Self LinearMapper; + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets + }; + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } return m_base_mapper(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadHalfPacket(i, 0); + } return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } @@ -324,6 +408,9 @@ class TensorContractionSubMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } @@ -333,7 +420,7 @@ class TensorContractionSubMapper { } private: - const ParentMapper& m_base_mapper; + ParentMapper m_base_mapper; const Index m_vert_offset; const Index m_horiz_offset; }; From 62f7e777117d35ee4d45e77af429246b1e4e1c12 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 00:02:59 +0100 Subject: [PATCH 45/52] add upper|lower case in incomplete_cholesky unit test --- test/incomplete_cholesky.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 435e2839a..7acad9872 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -15,16 +15,18 @@ template void test_incomplete_cholesky_T() { typedef SparseMatrix SparseMatrixType; - ConjugateGradient > > cg_illt_lower_amd; - ConjugateGradient > > cg_illt_lower_nat; - ConjugateGradient > > cg_illt_upper_amd; - ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_lower_amd; + ConjugateGradient > > cg_illt_lower_nat; + ConjugateGradient > > cg_illt_upper_amd; + ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_uplo_amd; CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_nat) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_amd) ); CALL_SUBTEST( check_sparse_spd_solving(cg_illt_upper_nat) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } void test_incomplete_cholesky() From 7ce932edd33c56baab0a5c5a8e0608c6345efd53 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Jan 2016 18:12:08 -0800 Subject: [PATCH 46/52] Small cleanup and small fix to the contraction of row major tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 575ae7b54..624e814e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -147,8 +147,6 @@ struct TensorContractionEvaluatorBase static const int ContractDims = internal::array_size::value; static const int NumDims = max_n_1::size; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; @@ -195,8 +193,8 @@ struct TensorContractionEvaluatorBase // We need to flip all the pairs of contracting indices as well as // reversing the dimensions. 
for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; } } @@ -504,9 +502,6 @@ struct TensorEvaluator::Dimensions>::value; static const int ContractDims = internal::array_size::value; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - typedef array contract_t; typedef array::size> left_nocontract_t; typedef array::size> right_nocontract_t; From 690bc950f70c61075d396671e63480bbd64bb297 Mon Sep 17 00:00:00 2001 From: Jan Prach Date: Wed, 20 Jan 2016 19:35:59 -0800 Subject: [PATCH 47/52] fix clang warnings "braces around scalar initializer" --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 12 ++++++------ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8 ++++---- .../Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h | 4 ++-- .../Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h | 2 +- unsupported/test/cxx11_tensor_broadcasting.cpp | 6 +++--- unsupported/test/cxx11_tensor_contraction.cpp | 2 +- unsupported/test/cxx11_tensor_map.cpp | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 4d99f786c..c1c57041f 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -404,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { template constexpr inline array h_array_zip(array a, array b, numeric_list) { - return array{{ Op::run(array_get(a), array_get(b))... }}; + return array{ Op::run(array_get(a), array_get(b))... }; } template @@ -432,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array a, array b) -> decl template constexpr inline array h_array_apply(array a, numeric_list) { - return array{{ Op::run(array_get(a))... }}; + return array{ Op::run(array_get(a))... }; } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index dc6ca4909..092e30c1f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -119,7 +119,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeff(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -159,7 +159,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return coeffRef(array{firstIndex, secondIndex, otherIndices...}); } #endif @@ -199,7 +199,7 @@ class Tensor : public TensorBaseoperator()(array{{firstIndex, secondIndex, otherIndices...}}); + return this->operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -266,7 +266,7 @@ class Tensor : public TensorBase{{firstIndex, secondIndex, otherIndices...}}); + return operator()(array{firstIndex, secondIndex, otherIndices...}); } #else EIGEN_DEVICE_FUNC @@ -342,7 +342,7 @@ class Tensor : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... 
otherDimensions) - : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) + : m_storage(internal::array_prod(array{firstDimension, otherDimensions...}), array{firstDimension, otherDimensions...}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -427,7 +427,7 @@ class Tensor : public TensorBase{{firstDimension, otherDimensions...}}); + resize(array{firstDimension, otherDimensions...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 52569a359..06cac3570 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -289,7 +289,7 @@ struct DSizes : array { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - (*this) = array{{firstDimension, otherDimensions...}}; + (*this) = array{firstDimension, otherDimensions...}; } #else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 70282dd83..7d0858d02 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -73,7 +73,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeff(array{firstIndex, otherIndices...}); } #endif @@ -105,7 +105,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return coeffRef(array{firstIndex, otherIndices...}); } #endif @@ -137,7 +137,7 @@ class TensorFixedSize : public TensorBaseoperator()(array{{firstIndex, otherIndices...}}); + return this->operator()(array{firstIndex, otherIndices...}); } #endif @@ -176,7 +176,7 @@ class TensorFixedSize : public TensorBase{{firstIndex, otherIndices...}}); + return operator()(array{firstIndex, otherIndices...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 6f69da34a..7233f4c89 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -141,10 +141,10 @@ template class TensorMap : public Tensor { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } @@ -228,10 +228,10 @@ template class TensorMap : public Tensor static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, 
otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{firstIndex, otherIndices...}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index bc4f2025f..1b9fe2779 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -55,7 +55,7 @@ class DynamicSGroup inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template @@ -90,7 +90,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; + return std::array{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }; } template diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 942293bd7..255c344b4 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -217,7 +217,7 @@ class StaticSGroup inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array{firstIndex, otherIndices...}); } template diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 2ddf47234..6fdefd66c 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting() TensorFixedSize, DataLayout> t2; t2 = t2.constant(20.0f); - Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); + Tensor t3 = t1 + t2.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); } - TensorMap, DataLayout> > t4(t2.data(), {{1}}); - Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); + TensorMap, DataLayout> > t4(t2.data(), {1}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..c5f3af73e 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -456,7 +456,7 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor result = mat1.contract(mat2, Eigen::array{{}}); + Tensor result = mat1.contract(mat2, Eigen::array{}); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index a8a095e38..dc0f8a5a2 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -130,7 +130,7 @@ static void test_3d() } TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); + TensorMap> mat4(mat2.data(), array{2, 3, 7}); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); From 34340458cbe33976559bf8fd73a9d4b2f747d611 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 14:29:45 +0100 Subject: [PATCH 48/52] bug #1151: remove useless critical section --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d830dfb96..d77fc2630 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -145,12 +145,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - #pragma omp critical - { for(Index i=0; i Date: Thu, 21 Jan 2016 20:18:51 +0100 Subject: [PATCH 49/52] Add numext::sqrt function to enable custom optimized implementation. This changeset adds two specializations for float/double on SSE. Those are mostly useful with GCC, for which std::sqrt adds an extra and costly check on the result of _mm_sqrt_*. Clang does not add this burden. In this changeset, only DenseBase::norm() makes use of it.
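For illustration only (this snippet is not part of the patch; the numext_like namespace, the main() harness, and the direct use of _mm_cvtss_f32 are assumptions made for this sketch), the idea reduces to a generic template that defers to std::sqrt plus an explicit SSE specialization:

#include <cmath>
#include <xmmintrin.h>

namespace numext_like {
  // Generic fallback: defer to std::sqrt (or an ADL-found overload), mirroring
  // the EIGEN_USING_STD_MATH(sqrt) idiom used by the generic numext::sqrt below.
  template<typename T> inline T sqrt(const T& x) { using std::sqrt; return sqrt(x); }

  // SSE specialization: a bare sqrtss, without the extra check that some
  // GCC/libstdc++ combinations wrap around std::sqrt(float).
  template<> inline float sqrt<float>(const float& x)
  { return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x))); }
}

int main()
{
  float squared_norm = 9.f;                                // e.g. the result of squaredNorm()
  return numext_like::sqrt(squared_norm) == 3.f ? 0 : 1;   // takes the sqrtss path
}

In the patch itself the specializations live in Eigen/src/Core/arch/SSE/MathFunctions.h and go through internal::pfirst, as shown below.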
--- Eigen/src/Core/Dot.h | 3 +-- Eigen/src/Core/MathFunctions.h | 20 ++++++++++++++++++-- Eigen/src/Core/arch/SSE/MathFunctions.h | 22 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index c5040c67b..ce42854cd 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -99,8 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala template inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { - EIGEN_USING_STD_MATH(sqrt) - return sqrt(squaredNorm()); + return numext::sqrt(squaredNorm()); } /** \returns an expression of the quotient of *this by its own norm. diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4d5e1acb8..1c7b28a4b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -954,8 +954,8 @@ T (ceil)(const T& x) return ceil(x); } -// Log base 2 for 32 bits positive integers. -// Conveniently returns 0 for x==0. +/** Log base 2 for 32 bits positive integers. + * Conveniently returns 0 for x==0. */ inline int log2(int x) { eigen_assert(x>=0); @@ -969,6 +969,22 @@ inline int log2(int x) return table[(v * 0x07C4ACDDU) >> 27]; } +/** \returns the square root of \a x. + * + * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * but slightly faster for float/double and some compilers (e.g., gcc), thanks to + * specializations when SSE is enabled. + * + * Its usage is justified in performance critical functions, like norm/normalize. + */ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sqrt(const T &x) +{ + EIGEN_USING_STD_MATH(sqrt); + return sqrt(x); +} + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 3b8b7303f..0dd52f96e 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -518,6 +518,28 @@ Packet2d prsqrt(const Packet2d& x) { } // end namespace internal +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC + return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); +#else + return internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); +#endif +} + +} // end namespace numext + } // end namespace Eigen #endif // EIGEN_MATH_FUNCTIONS_SSE_H From 7cae8918c019feabf6c143c430d0cd82c74aeec3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:30:32 +0100 Subject: [PATCH 50/52] Fix compilation on old gcc+AVX --- Eigen/src/Core/arch/SSE/MathFunctions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 0dd52f96e..74f6abc37 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -524,7 +524,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float &x) { - return internal::pfirst(_mm_sqrt_ss(_mm_set_ss(x))); + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); } template<> @@ -532,9 +532,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double &x) { #if EIGEN_COMP_GNUC - return internal::pfirst(__builtin_ia32_sqrtsd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else - return
internal::pfirst(_mm_sqrt_pd(_mm_set_sd(x))); + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); #endif } From ee37eb4eed09fe35be2acc3699e80f49a44ea99a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Jan 2016 20:43:42 +0100 Subject: [PATCH 51/52] bug #977: avoid division by 0 in normalize() and normalized(). --- Eigen/src/Core/Dot.h | 19 ++++++++++++++++--- test/adjoint.cpp | 9 +++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index ce42854cd..221fc3224 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -102,7 +102,10 @@ inline typename NumTraits::Scalar>::Real Matr return numext::sqrt(squaredNorm()); } -/** \returns an expression of the quotient of *this by its own norm. +/** \returns an expression of the quotient of \c *this by its own norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. * * \only_for_vectors * @@ -114,19 +117,29 @@ MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; _Nested n(derived()); - return n / n.norm(); + RealScalar z = n.squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + return n / numext::sqrt(z); + else + return n; } /** Normalizes the vector, i.e. divides it by its own norm. * * \only_for_vectors * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. + * * \sa norm(), normalized() */ template inline void MatrixBase::normalize() { - *this /= norm(); + RealScalar z = squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + derived() /= numext::sqrt(z); } //---------- implementation of other norms ---------- diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 3b2a53c91..b1e69c2e5 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -42,6 +42,15 @@ template<> struct adjoint_specific { VERIFY_IS_APPROX(v1, v1.norm() * v3); VERIFY_IS_APPROX(v3, v1.normalized()); VERIFY_IS_APPROX(v3.norm(), RealScalar(1)); + + // check null inputs + VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); + RealScalar very_small = (std::numeric_limits::min)(); + VERIFY( (v1*very_small).norm() == 0 ); + VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); + v3 = v1*very_small; + v3.normalize(); + VERIFY_IS_APPROX(v3, (v1*very_small)); // check compatibility of dot and adjoint ref = NumTraits::IsInteger ? 
0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm())); From c33479324c2a24094acf78c776655a6474c3bcca Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Jan 2016 17:08:11 -0800 Subject: [PATCH 52/52] Fixed a constness bug --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 0524cf0d8..11284315c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -146,7 +146,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; }
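Read together with the RawAccess/data() change to the contraction evaluator at the top of this series, the one-line fix above keeps data() returning a plain Scalar* (CoeffReturnType may carry a const qualifier and thus mismatch the evaluator interface). A loose standalone sketch of that forwarding pattern, with hypothetical OwningEval/ReshapeEval types rather than the real evaluator interfaces:

#include <vector>

// Hypothetical evaluator that materializes its result into a buffer and
// exposes it through data(); a null pointer means no raw buffer is available.
struct OwningEval {
  std::vector<float> m_result;
  float* data() { return m_result.empty() ? nullptr : m_result.data(); }
};

// Hypothetical wrapping evaluator (think of a reshape, which leaves the memory
// layout untouched): it forwards data() with the same pointer type, so callers
// that branch on data() != nullptr can read the underlying buffer directly.
struct ReshapeEval {
  OwningEval& m_impl;
  float* data() { return m_impl.data(); }
};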