From 34d9fce93bd5f1521017402154a8ab915af8fcb8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:33:33 -0800 Subject: [PATCH 01/37] Avoid unnecessary float to double conversions. --- Eigen/src/Core/arch/CUDA/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index ad66399e0..4dda63188 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; + float tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; kernel.packet[1].x = tmp; From 554116bec1b19d417521e9bee767d6b57813492a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:45:31 -0800 Subject: [PATCH 02/37] Added EIGEN_DEVICE_FUNC to make the prototype of the EigenBase override match that of DenseBase --- Eigen/src/Core/EigenBase.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index f76995af9..ccc122cfc 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -128,6 +128,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +137,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +146,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); From b1fc7c9a09e2aee938036ddba14e870c9658c791 
Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:48:30 -0800 Subject: [PATCH 03/37] Added missing EIGEN_DEVICE_FUNC qualifiers. --- Eigen/src/Core/ArrayBase.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index af5fb2566..9da960f08 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); From ed4dc9d01aa46688e12ef1e0772145c1b222602c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:57:01 -0800 Subject: [PATCH 04/37] Declared the plset, ploadt_ro, and ploaddup packet primitives as usable within a gpu kernel --- Eigen/src/Core/GenericPacketMath.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ac5552d3e..d19d5bbd2 
100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -231,7 +231,7 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. @@ -279,7 +279,7 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ @@ -487,7 +487,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. */ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } From 193939d6aaca2d8b4ee7cac9f0a89637596c692f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 17:11:47 -0800 Subject: [PATCH 05/37] Added missing EIGEN_DEVICE_FUNC qualifiers to several nullary op methods. 
--- Eigen/src/Core/CwiseNullaryOp.h | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index dd498f758..ddd607e38 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -150,7 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -192,7 +192,7 @@ DenseBase::Constant(Index rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +208,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -220,7 +220,7 @@ DenseBase::Constant(const Scalar& value) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename 
DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -232,7 +232,7 @@ DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -264,7 +264,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -276,7 +276,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. 
*/ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -286,7 +286,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +301,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +312,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = Constant(rows(), cols(), val); } @@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, 
MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,7 +356,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); @@ -380,7 +380,7 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); @@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, 
cols, Scalar(0)); @@ -446,7 +446,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +463,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +478,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +496,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,7 +529,7 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); @@ -553,7 +553,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename 
DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +576,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +593,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +608,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +622,7 @@ bool DenseBase::isOnes * \sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,7 +655,7 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); @@ -679,7 +679,7 @@ 
PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +696,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +771,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), 
i); @@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns 
an expression of the W axis unit vector (0,0,0,1) @@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } } // end namespace Eigen From 889c606f8fd242b1cf5e3c8f967ac7dad004775d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 17:17:47 -0800 Subject: [PATCH 06/37] Added missing EIGEN_DEVICE_FUNC to the SelfCwise binary ops --- Eigen/src/Core/SelfCwiseBinaryOp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72a5..50099df82 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,7 +15,7 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); @@ -23,7 +23,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); @@ -31,7 +31,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) } template 
-EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); @@ -39,7 +39,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); From 478a9f53be33c23ac5e22e0bb09cad7f719fedd4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 28 Feb 2017 09:32:45 +0100 Subject: [PATCH 07/37] Fix typo. --- unsupported/Eigen/CXX11/src/Tensor/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index fbb7f3bfc..38cdb9c69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -75,16 +75,16 @@ large enough to hold all the data. // Map a tensor of ints on top of stack-allocated storage. int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap t_4d(storage, 2, 4, 2, 8); + TensorMap> t_4d(storage, 2, 4, 2, 8); // The same storage can be viewed as a different tensor. // You can also pass the sizes as an array. - TensorMap t_2d(storage, 16, 8); + TensorMap> t_2d(storage, 16, 8); // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. 
Tensor> t_4x3; - TensorMap t_12(t_4x3, 12); + TensorMap> t_12(t_4x3, 12); #### Class TensorRef From 4e98a7b2f0501408751e2da5f24d65f642371226 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 28 Feb 2017 09:47:38 +0100 Subject: [PATCH 08/37] bug #1396: add some missing EIGEN_DEVICE_FUNC --- Eigen/src/Core/DenseBase.h | 6 +++--- Eigen/src/Core/MatrixBase.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index fc807577b..91a8511be 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -484,9 +484,9 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 675c94e12..200e57741 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -294,7 +294,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template - inline bool operator==(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. 
@@ -302,7 +302,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template - inline bool operator!=(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } NoAlias noalias(); From f3e9c42876de4b49bdb16b72b09a342d930ce6f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 09:46:30 -0800 Subject: [PATCH 09/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/Diagonal.h | 10 ++++++---- Eigen/src/Core/Transpose.h | 10 +++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 49e711257..c62f5ff21 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -184,7 +184,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -192,7 +192,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -210,7 +210,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -218,7 +218,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). 
*/ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -237,6 +237,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -246,6 +247,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 79b767bcc..ba7d6e629 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -168,7 +168,7 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC inline Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -180,7 +180,7 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -206,7 +206,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -281,7 +281,7 @@ struct inplace_transpose_selector { // non squ * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void 
DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -312,7 +312,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } From 33443ec2b0d116daa5210fc5e7982ca4a1598bc7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 09:50:10 -0800 Subject: [PATCH 10/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/Redux.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b6e8f8887..2b5b73bf7 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ protected: */ template template -typename internal::traits::Scalar +EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); @@ -422,7 +422,7 @@ DenseBase::redux(const Func& func) const * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { return derived().redux(Eigen::internal::scalar_min_op()); @@ -432,7 +432,7 @@ DenseBase::minCoeff() const * \warning the result is undefined if \c *this contains NaN. 
*/ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { return derived().redux(Eigen::internal::scalar_max_op()); @@ -445,7 +445,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +458,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +479,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +494,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); From e993c94f07faef161076c8710e8cb434174dd250 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 09:56:45 -0800 Subject: [PATCH 11/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/CommaInitializer.h | 4 ++-- Eigen/src/Core/CwiseBinaryOp.h | 5 ++--- Eigen/src/Core/CwiseNullaryOp.h | 4 ++-- Eigen/src/Core/Fuzzy.h | 6 +++--- Eigen/src/Core/NestByValue.h | 10 +++++----- Eigen/src/Core/ReturnByValue.h | 2 +- 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index d218e9814..35fdbb819 100644 --- 
a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -141,7 +141,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +149,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index a36765e39..bf2632d9e 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -158,7 +158,7 @@ public: */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index ddd607e38..144608ec2 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -131,7 +131,7 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> 
DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -170,7 +170,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h index 3e403a09d..43aa49b2b 100644 --- a/Eigen/src/Core/Fuzzy.h +++ b/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 13adf070e..01cf192e9 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -67,25 +67,25 @@ template class NestByValue } template - inline const PacketScalar packet(Index row, Index col) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const { return m_expression.template packet(row, col); } template - inline void writePacket(Index row, Index col, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) { 
m_expression.const_cast_derived().template writePacket(row, col, x); } template - inline const PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const { return m_expression.template packet(index); } template - inline void writePacket(Index index, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(index, x); } @@ -99,7 +99,7 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index c44b7673b..11dc86d07 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -79,7 +79,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); From 765f4cc4b4774fc114ab794e998c5ab4f2d733f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 11:57:00 -0800 Subject: [PATCH 12/37] Deleted extra: EIGEN_DEVICE_FUNC: the QR and Cholesky code isn't ready to run on GPU yet. 
--- Eigen/src/Cholesky/LDLT.h | 1 - Eigen/src/Cholesky/LLT.h | 1 - Eigen/src/QR/ColPivHouseholderQR.h | 1 - Eigen/src/QR/CompleteOrthogonalDecomposition.h | 2 +- Eigen/src/QR/FullPivHouseholderQR.h | 9 ++++----- Eigen/src/QR/HouseholderQR.h | 9 ++++----- 6 files changed, 9 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index fcee7b2e3..9b4fdb414 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -258,7 +258,6 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 87ca8d423..e6c02d803 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -200,7 +200,6 @@ template class LLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 0e47c8332..d35395d04 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -416,7 +416,6 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b70..13b61fcdb 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; #endif protected: diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc2..c31e47cc4 100644 --- 
a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -392,22 +392,21 @@ template class FullPivHouseholderQR * diagonal coefficient of U. */ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 3513d995c..762b21c36 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -204,28 +204,27 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. 
*/ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; From de7b0fdea9db957d2135c32e850ad069b64b5f1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 13:52:22 -0800 Subject: [PATCH 13/37] Made the TensorStorage class compile with clang 3.9 --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 2854a4a17..e6a666f78 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -31,12 +31,12 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template -class TensorStorage +template +class TensorStorage { private: static const std::size_t Size = FixedDimensions::total_size; @@ -66,7 +66,7 @@ class TensorStorage // pure dynamic -template +template class TensorStorage, Options_> { public: From 4a7df114c883eb17251f25b8b975c0ddf266acd6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 14:00:15 -0800 Subject: [PATCH 14/37] Added missing EIGEN_DEVICE_FUNC --- Eigen/src/Core/Assign.h | 2 +- Eigen/src/Core/Dot.h | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 53806ba33..655412efd 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const 
DenseBase& other) { enum{ diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 06ef18b8b..bb8e3fecc 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -90,7 +90,7 @@ MatrixBase::dot(const MatrixBase& other) const * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -102,7 +102,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +117,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +139,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_DEVICE_FUNC inline void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +160,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +185,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC inline void 
MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -257,9 +257,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { From c36bc2d445596d46c7f5a9271bfa69c79e2e1558 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 14:58:45 -0800 Subject: [PATCH 15/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/BooleanRedux.h | 6 +++--- Eigen/src/Core/DiagonalProduct.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index ed607d5d8..ccf519067 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -76,7 +76,7 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { @@ -100,7 +100,7 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { @@ -124,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index d372b938f..7911d1cd1 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { 
return Product(derived(),a_diagonal.derived()); From 857adbbd52bb1a36c913a828fb5f24b95deee965 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 16:42:00 -0800 Subject: [PATCH 16/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/Random.h | 2 +- Eigen/src/Core/Replicate.h | 4 ++-- Eigen/src/Core/Reverse.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h index 6faf789c7..486e9ed52 100644 --- a/Eigen/src/Core/Random.h +++ b/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index 9960ef884..0b2d6d743 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 0640cda2a..8b6b3ab03 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -114,7 +114,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +136,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ 
template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); } From c92406d6137e4c55fc608c6916cc20899d00cff4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:03:11 -0800 Subject: [PATCH 17/37] Silenced clang compilation warning. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index e6cee11ef..be8d69386 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -217,7 +217,10 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } From 7b61944669f23a20f6c850b9c07d930c049c6ede Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:05:28 -0800 Subject: [PATCH 18/37] Made most of the packet math primitives usable within CUDA kernel when compiling with clang --- Eigen/src/Core/arch/CUDA/PacketMath.h | 4 ++-- Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 4dda63188..8c46af09b 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ 
-167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const d return make_double2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { return make_float4(from[0], from[0], from[1], from[1]); } -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { return make_double2(from[0], from[0]); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 29c796647..14ec87da8 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -542,8 +542,8 @@ // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. 
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG From 3a3f040baa602e13fe36d2949712d4af1b2db354 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:06:15 -0800 Subject: [PATCH 19/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/VectorwiseOp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 4fe267e9f..893bc796f 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -670,7 +670,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +684,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); From c1d87ec110d9ee96da39d58c8f88481f5cb9d04c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 10:08:50 -0800 Subject: [PATCH 20/37] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/CoreEvaluators.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 412f5a661..54276b836 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -139,14 +139,14 @@ public: EIGEN_ONLY_USED_FOR_DEBUG(outerStride); 
eigen_internal_assert(outerStride==OuterStride); } - Index outerStride() const { return OuterStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } const Scalar *data; }; template class plainobjectbase_evaluator_data { public: plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: Index m_outerStride; From 1e2d046651102c57f5f4eca38ff7844e1b0ca6fd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 10:13:42 -0800 Subject: [PATCH 21/37] Silenced a couple of compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 + unsupported/Eigen/CXX11/src/util/EmulateArray.h | 1 + 2 files changed, 2 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3523e7c94..d23f2e4c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -23,6 +23,7 @@ struct static_val { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); eigen_assert(v == n); } }; diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 03169d591..573ca435a 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,6 +169,7 @@ template class array { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { + EIGEN_UNUSED_VARIABLE(l); eigen_assert(l.size() == 0); } #endif From 09ae0e6586b978ce1ea9960984e1228dfc8971b8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 11:47:47 -0800 Subject: [PATCH 22/37] Adjusted the EIGEN_DEVICE_FUNC qualifiers to make sure that: * they're used 
consistently between the declaration and the definition of a function * we avoid calling host only methods from host device methods. --- Eigen/src/Core/CoreEvaluators.h | 4 ++-- Eigen/src/Core/DiagonalMatrix.h | 4 ++-- Eigen/src/Core/GeneralProduct.h | 2 +- Eigen/src/Core/SelfAdjointView.h | 4 ++-- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 15 ++++++++------- .../Core/products/GeneralMatrixMatrixTriangular.h | 6 +++--- Eigen/src/Core/products/SelfadjointProduct.h | 2 +- Eigen/src/Core/products/SelfadjointRank2Update.h | 2 +- Eigen/src/Core/util/IntegralConstant.h | 4 ++-- Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h | 4 ++-- Eigen/src/LU/FullPivLU.h | 2 -- Eigen/src/SVD/SVDBase.h | 1 - 13 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 54276b836..15b361b38 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -134,7 +134,7 @@ private: // this helper permits to completely eliminate m_outerStride if it is known at compiletime. 
template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { EIGEN_ONLY_USED_FOR_DEBUG(outerStride); eigen_internal_assert(outerStride==OuterStride); @@ -145,7 +145,7 @@ public: template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index ecfdce8ef..4e8297ee6 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -273,7 +273,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 0f16cd8e3..b206b0a7a 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -428,7 +428,7 @@ MatrixBase::operator*(const MatrixBase &other) const template template const Product -MatrixBase::lazyProduct(const MatrixBase &other) const +EIGEN_DEVICE_FUNC MatrixBase::lazyProduct(const MatrixBase &other) const { enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic diff --git 
a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 504c98f0e..7e71fe3c0 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -322,7 +322,7 @@ public: /** This is the const version of MatrixBase::selfadjointView() */ template template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -339,7 +339,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 049890b25..a0011d4f9 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -164,7 +164,7 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 667ef09dc..ed80da36a 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -488,7 +488,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -554,7 +553,7 @@ template class 
TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -564,7 +563,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -573,7 +572,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -583,7 +582,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -598,7 +597,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -624,6 +623,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -633,6 +633,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -930,7 +931,7 @@ struct triangular_assignment_loop * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7122efa60..ad38bcf51 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -292,12 +292,12 @@ struct general_product_to_triangular_selector template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h 
b/Eigen/src/Core/products/SelfadjointProduct.h index f038d686f..39c5b59ff 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae364111..d395888e5 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -57,7 +57,7 @@ template struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index ae41015bd..78a4705cd 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -151,9 +151,9 @@ struct get_fixed_value,Default> { static const int value = N; }; -template Index get_runtime_value(const T &x) { return x; } +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } #if !EIGEN_HAS_CXX14 -template Index get_runtime_value(FixedInt (*)()) { return N; } +template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif // Cleanup integer/FixedInt/VariableAndFixedInt/etc types: diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0a..dbbd4806a 100644 --- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -85,7 +85,7 @@ MatrixBase::eigenvalues() const * \sa 
SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { typedef typename SelfAdjointView::PlainObject PlainObject; @@ -149,7 +149,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 03b6af706..ec61086d5 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -411,11 +411,9 @@ template class FullPivLU #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index cc90a3b75..429414797 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -212,7 +212,6 @@ public: #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif From a71943b9a432c8962f025b56313584f33111ace4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Mar 2017 10:47:29 -0800 Subject: [PATCH 23/37] Made the Tensor code compile with clang 3.9 --- .../CXX11/src/Tensor/TensorContractionCuda.h | 101 +++++++++--------- .../CXX11/src/Tensor/TensorReductionCuda.h | 1 - 2 files changed, 48 insertions(+), 54 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index d65dbb40f..c04b784a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, rhs_pf0; @@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ Index lhs_vert = base_m+threadIdx.x*4; @@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -593,7 
+592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; @@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, 
(threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = 
rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, typedef float2 LHS_MEM[64][32]; typedef float2 RHS_MEM[128][8]; - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 65638b6a8..edb0ab280 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -287,7 +287,6 @@ struct FullReductionLauncher< void>::type> { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index 
num_coeffs) { typedef typename Self::Index Index; - typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); From 1c03d43a5cc421da067f12fe02db9eedaa1d125d Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Mon, 6 Mar 2017 12:01:12 +0100 Subject: [PATCH 24/37] Fixed compilation with cuda-clang --- Eigen/src/Core/MathFunctions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 7a6b999af..5ec6c395e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -512,7 +512,7 @@ namespace std_fallback { template struct expm1_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH @@ -549,7 +549,7 @@ namespace std_fallback { template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH From 659087b622e94f7e35a56b7ed2cb01b024c80a7b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:02:34 +0100 Subject: [PATCH 25/37] bug #1400: fix stableNorm with EIGEN_DONT_ALIGN_STATICALLY --- Eigen/src/Core/StableNorm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index d2fe1e199..be04ed44d 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -170,7 +170,8 @@ MatrixBase::stableNorm() const enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization }; typedef 
typename internal::conditional, internal::evaluator::Alignment>, typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; From d9677185250d780b0011202621c90530937049d2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:16:39 +0100 Subject: [PATCH 26/37] do not include std header within extern C --- Eigen/src/misc/lapacke.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h index 8c7e79b03..dd6fd3b0a 100755 --- a/Eigen/src/misc/lapacke.h +++ b/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #include #ifndef lapack_int @@ -81,7 +77,7 @@ extern "C" { #endif #ifndef lapack_complex_float_real -#define lapack_complex_float_real(z) (creal(z)) +#define lapack_complex_float_rea@l(z) (creal(z)) #endif #ifndef lapack_complex_float_imag @@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif From e958c2baac15ccf0cf7b7919ef729b118e43a6ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:47:40 +0100 Subject: [PATCH 27/37] remove UTF8 symbols --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index db9878796..67518da9f 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -13,7 +13,7 @@ // Redistribution and use in source and binary forms, with or without // modification, are permitted. 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, From 5694315fbb7a2abc157cad284852b5e3df2f9576 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:53:47 +0100 Subject: [PATCH 28/37] remove UTF8 symbol --- Eigen/src/Core/DenseBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 91a8511be..fd933eed4 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -296,7 +296,7 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal + /** \internal * Copies \a other into *this without evaluating other. \returns a reference to *this. * \deprecated */ template From e5156e4d253fcc8ac13b0131973de3f56e810ab5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 11:25:58 +0100 Subject: [PATCH 29/37] fix typo --- Eigen/src/misc/lapacke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h index dd6fd3b0a..3d8e24f5a 100755 --- a/Eigen/src/misc/lapacke.h +++ b/Eigen/src/misc/lapacke.h @@ -77,7 +77,7 @@ #endif #ifndef lapack_complex_float_real -#define lapack_complex_float_rea@l(z) (creal(z)) +#define lapack_complex_float_real(z) (creal(z)) #endif #ifndef lapack_complex_float_imag From 970ff78294503896940fb760d948f1eed156250f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Mar 2017 16:16:53 +0100 Subject: [PATCH 30/37] bug #1401: fix compilation of "cond ? 
x : -x" with x an AutoDiffScalar --- unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 50fedf6ac..d2808860c 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -108,7 +108,9 @@ class AutoDiffScalar template AutoDiffScalar(const AutoDiffScalar& other #ifndef EIGEN_PARSED_BY_DOXYGEN - , typename internal::enable_if::type>::Scalar>::value,void*>::type = 0 + , typename internal::enable_if< + internal::is_same::type>::Scalar>::value + && internal::is_convertible::value , void*>::type = 0 #endif ) : m_value(other.value()), m_derivatives(other.derivatives()) From 344c2694a64494721e6d36d1197bde47c7d12af9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 9 Mar 2017 15:41:03 -0800 Subject: [PATCH 31/37] Make the non-blocking threadpool more flexible and less wasteful of CPU cycles for high-latency use-cases. * Adds a hint to ThreadPool allowing us to turn off spin waiting. Currently each reader and record yielder op in a graph creates a threadpool with a thread that spins for 1000 iterations through the work stealing loop before yielding. This is wasteful for such ops that process I/O. * This also changes the number of iterations through the steal loop to be inversely proportional to the number of threads. Since the time of each iteration is proportional to the number of threads, this yields roughly a constant spin time. * Implement a separate worker loop for the num_threads == 1 case since there is no point in going through the expensive steal loop. Moreover, since Steal() calls PopBack() on the victim queues it might reverse the order in which ops are executed, compared to the order in which they are scheduled, which is usually counter-productive for the types of I/O workloads the single thread pools tend to be used for. 
* Store num_threads in a member variable for simplicity and to avoid a data race between the thread creation loop and worker threads calling threads_.size(). --- .../src/ThreadPool/NonBlockingThreadPool.h | 163 ++++++++++++------ .../test/cxx11_non_blocking_thread_pool.cpp | 7 +- 2 files changed, 111 insertions(+), 59 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index ed1a761b6..e28afedb4 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -20,7 +20,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef RunQueue Queue; NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), + : num_threads_(num_threads), + allow_spinning_(true), + env_(env), threads_(num_threads), queues_(num_threads), coprimes_(num_threads), @@ -30,34 +32,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { done_(false), cancelled_(false), ec_(waiters_) { - waiters_.resize(num_threads); + Init(); + } - // Calculate coprimes of num_threads. - // Coprimes are used for a random walk over all threads in Steal - // and NonEmptyQueueIndex. Iteration is based on the fact that if we take - // a walk starting thread index t and calculate num_threads - 1 subsequent - // indices as (t + coprime) % num_threads, we will cover all threads without - // repetitions (effectively getting a presudo-random permutation of thread - // indices). - for (int i = 1; i <= num_threads; i++) { - unsigned a = i; - unsigned b = num_threads; - // If GCD(a, b) == 1, then a and b are coprimes. 
- while (b != 0) { - unsigned tmp = a; - a = b; - b = tmp % b; - } - if (a == 1) { - coprimes_.push_back(i); - } - } - for (int i = 0; i < num_threads; i++) { - queues_.push_back(new Queue()); - } - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); - } + NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, + Environment env = Environment()) + : num_threads_(num_threads), + allow_spinning_(allow_spinning), + env_(env), + threads_(num_threads), + queues_(num_threads), + coprimes_(num_threads), + waiters_(num_threads), + blocked_(0), + spinning_(0), + done_(false), + cancelled_(false), + ec_(waiters_) { + Init(); } ~NonBlockingThreadPoolTempl() { @@ -77,8 +69,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } // Join threads explicitly to avoid destruction order issues. - for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; - for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; + for (size_t i = 0; i < num_threads_; i++) delete threads_[i]; + for (size_t i = 0; i < num_threads_; i++) delete queues_[i]; } void Schedule(std::function fn) { @@ -125,7 +117,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } int NumThreads() const final { - return static_cast(threads_.size()); + return num_threads_; } int CurrentThreadId() const final { @@ -149,6 +141,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { }; Environment env_; + const int num_threads_; + const bool allow_spinning_; MaxSizeVector threads_; MaxSizeVector queues_; MaxSizeVector coprimes_; @@ -159,6 +153,37 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { std::atomic cancelled_; EventCount ec_; + void Init() { + waiters_.resize(num_threads_); + + // Calculate coprimes of num_threads_. + // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. 
Iteration is based on the fact that if we take + // a walk starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a presudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads_; i++) { + unsigned a = i; + unsigned b = num_threads_; + // If GCD(a, b) == 1, then a and b are coprimes. + while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads_; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads_; i++) { + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + } + // Main worker thread loop. void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); @@ -167,36 +192,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { pt->thread_id = thread_id; Queue* q = queues_[thread_id]; EventCount::Waiter* waiter = &waiters_[thread_id]; - while (!cancelled_) { - Task t = q->PopFront(); - if (!t.f) { - t = Steal(); + // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional + // to num_threads_ and we assume that new work is scheduled at a + // constant rate, so we set spin_count to 5000 / num_threads_. The + // constant was picked based on a fair dice roll, tune it. + const int spin_count = + allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0; + if (num_threads_ == 1) { + // For num_threads_ == 1 there is no point in going through the expensive + // steal loop. Moreover, since Steal() calls PopBack() on the victim + // queues it might reverse the order in which ops are executed compared to + // the order in which they are scheduled, which tends to be + // counter-productive for the types of I/O workloads the single thread + // pools tend to be used for. 
+ while (!cancelled_) { + Task t = q->PopFront(); + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = q->PopFront(); + } + } if (!t.f) { - // Leave one thread spinning. This reduces latency. - // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. - // Also, the time it takes to attempt to steal work 1000 times depends - // on the size of the thread pool. However the speed at which the user - // of the thread pool submit tasks is independent of the size of the - // pool. Consider a time based limit instead. - if (!spinning_ && !spinning_.exchange(true)) { - for (int i = 0; i < 1000 && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = Steal(); - } else { + if (!WaitForWork(waiter, &t)) { + return; + } + } + if (t.f) { + env_.ExecuteTask(t); + } + } + } else { + while (!cancelled_) { + Task t = q->PopFront(); + if (!t.f) { + t = Steal(); + if (!t.f) { + // Leave one thread spinning. This reduces latency. + if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = Steal(); + } else { + return; + } + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { return; } } - spinning_ = false; - } - if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; - } } } - } - if (t.f) { - env_.ExecuteTask(t); + if (t.f) { + env_.ExecuteTask(t); + } } } } @@ -244,7 +295,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // If we are shutting down and all worker threads blocked without work, // that's we are done. blocked_++; - if (done_ && blocked_ == threads_.size()) { + if (done_ && blocked_ == num_threads_) { ec_.CancelWait(waiter); // Almost done, but need to re-check queues. 
// Consider that all queues are empty and all worker threads are preempted diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 2c5765ce4..48cd2d4e4 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -23,11 +23,11 @@ static void test_create_destroy_empty_pool() } -static void test_parallelism() +static void test_parallelism(bool allow_spinning) { // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads); + NonBlockingThreadPool tp(kThreads, allow_spinning); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -119,6 +119,7 @@ static void test_cancel() void test_cxx11_non_blocking_thread_pool() { CALL_SUBTEST(test_create_destroy_empty_pool()); - CALL_SUBTEST(test_parallelism()); + CALL_SUBTEST(test_parallelism(true)); + CALL_SUBTEST(test_parallelism(false)); CALL_SUBTEST(test_cancel()); } From d56ab01094d30cf57150b454b8ebaf20ac7bc85e Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 10 Mar 2017 08:30:22 -0800 Subject: [PATCH 32/37] Use C++11 ctor forwarding to simplify code a bit. 
--- .../src/ThreadPool/NonBlockingThreadPool.h | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index e28afedb4..4fbd3af1e 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -19,22 +19,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef typename Environment::Task Task; typedef RunQueue Queue; - NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : num_threads_(num_threads), - allow_spinning_(true), - env_(env), - threads_(num_threads), - queues_(num_threads), - coprimes_(num_threads), - waiters_(num_threads), - blocked_(0), - spinning_(0), - done_(false), - cancelled_(false), - ec_(waiters_) { - Init(); - } - NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : num_threads_(num_threads), @@ -52,6 +36,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { Init(); } + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : NonBlockingThreadPoolTempl(num_threads, true, env) {} + ~NonBlockingThreadPoolTempl() { done_ = true; From bfd7bf9c5b4ebc932a339200bb4a455eadcf6d28 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 10 Mar 2017 08:48:20 -0800 Subject: [PATCH 33/37] Get rid of Init(). 
--- .../src/ThreadPool/NonBlockingThreadPool.h | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 4fbd3af1e..9dcc9dab7 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -19,6 +19,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef typename Environment::Task Task; typedef RunQueue Queue; + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : NonBlockingThreadPoolTempl(num_threads, true, env) {} + NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : num_threads_(num_threads), @@ -33,11 +36,35 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { done_(false), cancelled_(false), ec_(waiters_) { - Init(); - } + waiters_.resize(num_threads_); - NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : NonBlockingThreadPoolTempl(num_threads, true, env) {} + // Calculate coprimes of num_threads_. + // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. Iteration is based on the fact that if we take + // a walk starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a presudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads_; i++) { + unsigned a = i; + unsigned b = num_threads_; + // If GCD(a, b) == 1, then a and b are coprimes. 
+ while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads_; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads_; i++) { + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + } ~NonBlockingThreadPoolTempl() { done_ = true; @@ -140,37 +167,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { std::atomic cancelled_; EventCount ec_; - void Init() { - waiters_.resize(num_threads_); - - // Calculate coprimes of num_threads_. - // Coprimes are used for a random walk over all threads in Steal - // and NonEmptyQueueIndex. Iteration is based on the fact that if we take - // a walk starting thread index t and calculate num_threads - 1 subsequent - // indices as (t + coprime) % num_threads, we will cover all threads without - // repetitions (effectively getting a presudo-random permutation of thread - // indices). - for (int i = 1; i <= num_threads_; i++) { - unsigned a = i; - unsigned b = num_threads_; - // If GCD(a, b) == 1, then a and b are coprimes. - while (b != 0) { - unsigned tmp = a; - a = b; - b = tmp % b; - } - if (a == 1) { - coprimes_.push_back(i); - } - } - for (int i = 0; i < num_threads_; i++) { - queues_.push_back(new Queue()); - } - for (int i = 0; i < num_threads_; i++) { - threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); - } - } - // Main worker thread loop. 
void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); From f0f35911181cc7e2089a3319d966dcbd3596461b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 14 Mar 2017 14:16:53 -0700 Subject: [PATCH 34/37] Made the reduction code compile with cuda-clang --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c841786b8..e341e2e9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,6 +11,17 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && defined(__CUDA__) +#define KERNEL_FRIEND friend __global__ +#else +#define KERNEL_FRIEND friend +#endif +#endif + + namespace Eigen { @@ -681,15 +692,15 @@ struct TensorEvaluator, template friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template 
KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif #if defined(EIGEN_USE_SYCL) From 61160a21d23880749bce3b0d630d9880f70af6e5 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 15 Mar 2017 06:57:25 -0400 Subject: [PATCH 35/37] ARM prefetch fixes: Implement prefetch on ARM64. Do not clobber cc on ARM32. --- Eigen/src/Core/arch/NEON/PacketMath.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 84a56bdcc..aede4a6d5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -51,14 +51,17 @@ typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function -// which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC +#if EIGEN_ARCH_ARM64 + // __builtin_prefetch tends to do nothing on ARM64 compilers because the + // prefetch instructions there are too detailed for __builtin_prefetch to map + // meaningfully to them. 
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !EIGEN_ARCH_ARM64 - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching #define EIGEN_ARM_PREFETCH(ADDR) From 89fd0c38812b024734eeacf9c23ed3714c8b0f93 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 15 Mar 2017 15:18:03 +0100 Subject: [PATCH 36/37] better check array index before using it --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 933cd564b..da85b4d6e 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; /* get pivot column from head of minimum degree list */ - while (head [min_score] == COLAMD_EMPTY && min_score < n_col) + while (min_score < n_col && head [min_score] == COLAMD_EMPTY) { min_score++ ; } From fd7db52f9b3b1227719c6d8281ff5e5198aaae82 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Mar 2017 20:02:39 -0700 Subject: [PATCH 37/37] Silenced compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index f335edf7d..c46a778b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,6 +50,7 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {