From 34d9fce93bd5f1521017402154a8ab915af8fcb8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 16:33:33 -0800
Subject: [PATCH 01/37] Avoid unecessary float to double conversions.

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index ad66399e0..4dda63188 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
-  double tmp = kernel.packet[0].y;
+  float tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
   kernel.packet[1].x = tmp;
 

From 554116bec1b19d417521e9bee767d6b57813492a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 16:45:31 -0800
Subject: [PATCH 02/37] Added EIGEN_DEVICE_FUNC to make the prototype of the
 EigenBase override match that of DenseBase

---
 Eigen/src/Core/EigenBase.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index f76995af9..ccc122cfc 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -128,6 +128,7 @@ template<typename Derived> struct EigenBase
   */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived());
@@ -136,6 +137,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -144,6 +146,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

From b1fc7c9a09e2aee938036ddba14e870c9658c791 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 16:48:30 -0800
Subject: [PATCH 03/37] Added missing EIGEN_DEVICE_FUNC qualifiers.

---
 Eigen/src/Core/ArrayBase.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index af5fb2566..9da960f08 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());

From ed4dc9d01aa46688e12ef1e0772145c1b222602c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 16:57:01 -0800
Subject: [PATCH 04/37] Declared the plset, ploadt_ro, and ploaddup packet
 primitives as usable within a gpu kernel

---
 Eigen/src/Core/GenericPacketMath.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index ac5552d3e..d19d5bbd2 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -231,7 +231,7 @@ pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(
   * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
   * Currently, this function is only used for scalar * complex products.
   */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
 /** \internal \returns a packet with elements of \a *from quadrupled.
@@ -279,7 +279,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 }
 
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 plset(const typename unpacket_traits<Packet>::type& a) { return a; }
 
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@@ -487,7 +487,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
   * by the current computation.
   */
 template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
 {
   return ploadt<Packet, LoadMode>(from);
 }

From 193939d6aaca2d8b4ee7cac9f0a89637596c692f Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 17:11:47 -0800
Subject: [PATCH 05/37] Added missing EIGEN_DEVICE_FUNC qualifiers to several
 nullary op methods.

---
 Eigen/src/Core/CwiseNullaryOp.h | 80 ++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index dd498f758..ddd607e38 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
   * \sa LinSpaced(Scalar,Scalar)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
   * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
   * Special version for fixed size types which does not require the size parameter.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
@@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
   *
   * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
   return isApproxToConstant(val, prec);
@@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
   * \sa setConstant(), Constant(), class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
   setConstant(val);
 }
@@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
   * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
   return derived() = Constant(rows(), cols(), val);
 }
@@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
   resize(size);
@@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
   resize(rows, cols);
@@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
   * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return setLinSpaced(size(), low, high);
@@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
   * \sa Zero(), Zero(Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(0));
@@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
   * \sa Zero(), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
   return Constant(size, Scalar(0));
@@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
   * \sa Zero(Index), Zero(Index,Index)
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
   return Constant(Scalar(0));
@@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
   typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
@@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
   * \sa class CwiseNullaryOp, Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
   return setConstant(Scalar(0));
 }
@@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
   * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
   resize(newSize);
@@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
   * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
   * \sa Ones(), Ones(Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
   return Constant(rows, cols, Scalar(1));
@@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
   * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
   return Constant(newSize, Scalar(1));
@@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
   * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
   return Constant(Scalar(1));
@@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
   return isApproxToConstant(Scalar(1), prec);
@@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
   * \sa class CwiseNullaryOp, Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
   return setConstant(Scalar(1));
 }
@@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
   * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
   resize(newSize);
@@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
   * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
   resize(rows, cols);
@@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
   * \sa Identity(), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
   * \sa Identity(Index,Index), setIdentity(), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
   * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
   return internal::setIdentity_impl<Derived>::run(derived());
 }
@@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
   * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
   derived().resize(rows, cols);
   return setIdentity();
@@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
   * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }
 
 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }
 
 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }
 
 /** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
   * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 
 } // end namespace Eigen

From 889c606f8fd242b1cf5e3c8f967ac7dad004775d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 27 Feb 2017 17:17:47 -0800
Subject: [PATCH 06/37] Added missing EIGEN_DEVICE_FUNC to the SelfCwise binary
 ops

---
 Eigen/src/Core/SelfCwiseBinaryOp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 719ed72a5..50099df82 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -15,7 +15,7 @@ namespace Eigen {
 // TODO generalize the scalar type of 'other'
 
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
@@ -23,7 +23,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 }
 
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
@@ -31,7 +31,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 }
 
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
@@ -39,7 +39,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 }
 
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());

From 478a9f53be33c23ac5e22e0bb09cad7f719fedd4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 28 Feb 2017 09:32:45 +0100
Subject: [PATCH 07/37] Fix typo.

---
 unsupported/Eigen/CXX11/src/Tensor/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fbb7f3bfc..38cdb9c69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -75,16 +75,16 @@ large enough to hold all the data.
 
     // Map a tensor of ints on top of stack-allocated storage.
     int storage[128];  // 2 x 4 x 2 x 8 = 128
-    TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8);
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
 
     // The same storage can be viewed as a different tensor.
     // You can also pass the sizes as an array.
-    TensorMap<int, 2> t_2d(storage, 16, 8);
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
 
     // You can also map fixed-size tensors.  Here we get a 1d view of
     // the 2d fixed-size tensor.
     Tensor<float, Sizes<4, 5>> t_4x3;
-    TensorMap<float, 1> t_12(t_4x3, 12);
+    TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
 
 
 #### Class TensorRef

From 4e98a7b2f0501408751e2da5f24d65f642371226 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 28 Feb 2017 09:47:38 +0100
Subject: [PATCH 08/37] bug #1396: add some missing EIGEN_DEVICE_FUNC

---
 Eigen/src/Core/DenseBase.h  | 6 +++---
 Eigen/src/Core/MatrixBase.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index fc807577b..91a8511be 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -484,9 +484,9 @@ template<typename Derived> class DenseBase
       return derived().coeff(0,0);
     }
 
-    bool all() const;
-    bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;
 
     typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
     typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 675c94e12..200e57741 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -294,7 +294,7 @@ template<typename Derived> class MatrixBase
       *          fuzzy comparison such as isApprox()
       * \sa isApprox(), operator!= */
     template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
     { return cwiseEqual(other).all(); }
 
     /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -302,7 +302,7 @@ template<typename Derived> class MatrixBase
       *          fuzzy comparison such as isApprox()
       * \sa isApprox(), operator== */
     template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
     { return cwiseNotEqual(other).any(); }
 
     NoAlias<Derived,Eigen::MatrixBase > noalias();

From f3e9c42876de4b49bdb16b72b09a342d930ce6f0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 09:46:30 -0800
Subject: [PATCH 09/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/Diagonal.h  | 10 ++++++----
 Eigen/src/Core/Transpose.h | 10 +++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 49e711257..c62f5ff21 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -184,7 +184,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
   *
   * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
   return DiagonalReturnType(derived());
@@ -192,7 +192,7 @@ MatrixBase<Derived>::diagonal()
 
 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
   return ConstDiagonalReturnType(derived());
@@ -210,7 +210,7 @@ MatrixBase<Derived>::diagonal() const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
   return DiagonalDynamicIndexReturnType(derived(), index);
@@ -218,7 +218,7 @@ MatrixBase<Derived>::diagonal(Index index)
 
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
   return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -237,6 +237,7 @@ MatrixBase<Derived>::diagonal(Index index) const
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
@@ -246,6 +247,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 79b767bcc..ba7d6e629 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -168,7 +168,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
   return TransposeReturnType(derived());
@@ -180,7 +180,7 @@ DenseBase<Derived>::transpose()
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
   return ConstTransposeReturnType(derived());
@@ -206,7 +206,7 @@ DenseBase<Derived>::transpose() const
   *
   * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
   return AdjointReturnType(this->transpose());
@@ -281,7 +281,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
   eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
                && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -312,7 +312,7 @@ inline void DenseBase<Derived>::transposeInPlace()
   *
   * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
   derived() = adjoint().eval();
 }

From 33443ec2b0d116daa5210fc5e7982ca4a1598bc7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 09:50:10 -0800
Subject: [PATCH 10/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/Redux.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index b6e8f8887..2b5b73bf7 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -407,7 +407,7 @@ protected:
   */
 template<typename Derived>
 template<typename Func>
-typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@@ -422,7 +422,7 @@ DenseBase<Derived>::redux(const Func& func) const
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
   return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@@ -432,7 +432,7 @@ DenseBase<Derived>::minCoeff() const
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
   return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@@ -445,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
   * \sa trace(), prod(), mean()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -458,7 +458,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@@ -479,7 +479,7 @@ DenseBase<Derived>::mean() const
   * \sa sum(), mean(), trace()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -494,7 +494,7 @@ DenseBase<Derived>::prod() const
   * \sa diagonal(), sum()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
   return derived().diagonal().sum();

From e993c94f07faef161076c8710e8cb434174dd250 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 09:56:45 -0800
Subject: [PATCH 11/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/CommaInitializer.h |  4 ++--
 Eigen/src/Core/CwiseBinaryOp.h    |  5 ++---
 Eigen/src/Core/CwiseNullaryOp.h   |  4 ++--
 Eigen/src/Core/Fuzzy.h            |  6 +++---
 Eigen/src/Core/NestByValue.h      | 10 +++++-----
 Eigen/src/Core/ReturnByValue.h    |  2 +-
 6 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index d218e9814..35fdbb819 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -141,7 +141,7 @@ struct CommaInitializer
   * \sa CommaInitializer::finished(), class CommaInitializer
   */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
   return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
   return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index a36765e39..bf2632d9e 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -158,7 +158,7 @@ public:
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_BINARY_OP_H
-
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index ddd607e38..144608ec2 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h
index 3e403a09d..43aa49b2b 100644
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
   */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
   * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
   */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const typename NumTraits<Scalar>::Real& other,
   const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
   */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 13adf070e..01cf192e9 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
     }
 
     template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
     {
       return m_expression.template packet<LoadMode>(row, col);
     }
 
     template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
     {
       m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
     }
 
     template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
+    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
     {
       return m_expression.template packet<LoadMode>(index);
     }
 
     template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
     {
       m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
     }
@@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
   */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
   return NestByValue<Derived>(derived());
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index c44b7673b..11dc86d07 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
 
 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
   other.evalTo(derived());
   return derived();

From 765f4cc4b4774fc114ab794e998c5ab4f2d733f9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 11:57:00 -0800
Subject: [PATCH 12/37] Deleted extra: EIGEN_DEVICE_FUNC: the QR and Cholesky
 code isn't ready to run on GPU yet.

---
 Eigen/src/Cholesky/LDLT.h                      | 1 -
 Eigen/src/Cholesky/LLT.h                       | 1 -
 Eigen/src/QR/ColPivHouseholderQR.h             | 1 -
 Eigen/src/QR/CompleteOrthogonalDecomposition.h | 2 +-
 Eigen/src/QR/FullPivHouseholderQR.h            | 9 ++++-----
 Eigen/src/QR/HouseholderQR.h                   | 9 ++++-----
 6 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index fcee7b2e3..9b4fdb414 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 87ca8d423..e6c02d803 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -200,7 +200,6 @@ template<typename _MatrixType, int _UpLo> class LLT
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 0e47c8332..d35395d04 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -416,7 +416,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 34c637b70..13b61fcdb 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
-  EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const;
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
 #endif
 
  protected:
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index e489bddc2..c31e47cc4 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -392,22 +392,21 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *          diagonal coefficient of U.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
     void computeInPlace();
-    
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 3513d995c..762b21c36 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -204,28 +204,27 @@ template<typename _MatrixType> class HouseholderQR
 
     inline Index rows() const { return m_qr.rows(); }
     inline Index cols() const { return m_qr.cols(); }
-    
+
     /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
       * 
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
 
     void computeInPlace();
-    
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     RowVectorType m_temp;

From de7b0fdea9db957d2135c32e850ad069b64b5f1e Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 13:52:22 -0800
Subject: [PATCH 13/37] Made the TensorStorage class compile with clang 3.9

---
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 2854a4a17..e6a666f78 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
   *
   * \sa Tensor
   */
-template<typename T, typename Dimensions, int Options_> class TensorStorage;
+template<typename T, typename Dimensions, int Options> class TensorStorage;
 
 
 // Pure fixed-size storage
-template<typename T, int Options_, typename FixedDimensions>
-class TensorStorage<T, FixedDimensions, Options_>
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
 {
  private:
   static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage<T, FixedDimensions, Options_>
 
 
 // pure dynamic
-template<typename T, int Options_, typename IndexType, int NumIndices_>
+template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
 {
   public:

From 4a7df114c883eb17251f25b8b975c0ddf266acd6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 14:00:15 -0800
Subject: [PATCH 14/37] Added missing EIGEN_DEVICE_FUNC

---
 Eigen/src/Core/Assign.h |  2 +-
 Eigen/src/Core/Dot.h    | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 53806ba33..655412efd 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
   enum{
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 06ef18b8b..bb8e3fecc 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -90,7 +90,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
   * \sa dot(), norm(), lpNorm()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
   return numext::real((*this).cwiseAbs2().sum());
 }
@@ -102,7 +102,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
   * \sa lpNorm(), dot(), squaredNorm()
   */
 template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
   return numext::sqrt(squaredNorm());
 }
@@ -117,7 +117,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
   * \sa norm(), normalize()
   */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
   typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -139,7 +139,7 @@ MatrixBase<Derived>::normalized() const
   * \sa norm(), normalized()
   */
 template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::normalize()
 {
   RealScalar z = squaredNorm();
   // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -160,7 +160,7 @@ inline void MatrixBase<Derived>::normalize()
   * \sa stableNorm(), stableNormalize(), normalized()
   */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
   typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -185,7 +185,7 @@ MatrixBase<Derived>::stableNormalized() const
   * \sa stableNorm(), stableNormalized(), normalize()
   */
 template<typename Derived>
-inline void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::stableNormalize()
 {
   RealScalar w = cwiseAbs().maxCoeff();
   RealScalar z = (derived()/w).squaredNorm();
@@ -257,9 +257,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {

From c36bc2d445596d46c7f5a9271bfa69c79e2e1558 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 14:58:45 -0800
Subject: [PATCH 15/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/BooleanRedux.h    | 6 +++---
 Eigen/src/Core/DiagonalProduct.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index ed607d5d8..ccf519067 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
   * \sa any(), Cwise::operator<()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
@@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
   * \sa all()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
@@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
   * \sa all(), any()
   */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index d372b938f..7911d1cd1 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
   return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());

From 857adbbd52bb1a36c913a828fb5f24b95deee965 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 16:42:00 -0800
Subject: [PATCH 16/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/Random.h    | 2 +-
 Eigen/src/Core/Replicate.h | 4 ++--
 Eigen/src/Core/Reverse.h   | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 6faf789c7..486e9ed52 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
   * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
   */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
   return *this = Random(rows(), cols());
 }
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 9960ef884..0b2d6d743 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
   return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
   * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
   */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
   return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 0640cda2a..8b6b3ab03 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
   *
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
   return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
   *
   * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
   if(cols()>rows())
   {
@@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
   *
   * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
   internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }

From c92406d6137e4c55fc608c6916cc20899d00cff4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 17:03:11 -0800
Subject: [PATCH 17/37] Silenced clang compilation warning.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index e6cee11ef..be8d69386 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -217,7 +217,10 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 

From 7b61944669f23a20f6c850b9c07d930c049c6ede Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 17:05:28 -0800
Subject: [PATCH 18/37] Made most of the packet math primitives usable within
 CUDA kernel when compiling with clang

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 4 ++--
 Eigen/src/Core/util/Macros.h          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 4dda63188..8c46af09b 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
   return make_double2(from[0], from[1]);
 }
 
-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
   return make_float4(from[0], from[0], from[1], from[1]);
 }
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
   return make_double2(from[0], from[0]);
 }
 
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 29c796647..14ec87da8 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -542,8 +542,8 @@
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
 
 #ifdef NDEBUG
 # ifndef EIGEN_NO_DEBUG

From 3a3f040baa602e13fe36d2949712d4af1b2db354 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 28 Feb 2017 17:06:15 -0800
Subject: [PATCH 19/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/VectorwiseOp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 4fe267e9f..893bc796f 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
   * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
   return ColwiseReturnType(derived());
@@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
   * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
   return RowwiseReturnType(derived());

From c1d87ec110d9ee96da39d58c8f88481f5cb9d04c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 1 Mar 2017 10:08:50 -0800
Subject: [PATCH 20/37] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/CoreEvaluators.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 412f5a661..54276b836 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -139,14 +139,14 @@ public:
     EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
     eigen_internal_assert(outerStride==OuterStride);
   }
-  Index outerStride() const { return OuterStride; }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
   const Scalar *data;
 };
 
 template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
 public:
   plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
-  Index outerStride() const { return m_outerStride; }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
   const Scalar *data;
 protected:
   Index m_outerStride;

From 1e2d046651102c57f5f4eca38ff7844e1b0ca6fd Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 1 Mar 2017 10:13:42 -0800
Subject: [PATCH 21/37] Silenced a couple of compilation warnings

---
 unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 +
 unsupported/Eigen/CXX11/src/util/EmulateArray.h    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94..d23f2e4c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 03169d591..573ca435a 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
     eigen_assert(l.size() == 0);
   }
 #endif

From 09ae0e6586b978ce1ea9960984e1228dfc8971b8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 1 Mar 2017 11:47:47 -0800
Subject: [PATCH 22/37] Adjusted the EIGEN_DEVICE_FUNC qualifiers to make sure
 that:   * they're used consistently between the declaration and the
 definition of a function   * we avoid calling host only methods from host
 device methods.

---
 Eigen/src/Core/CoreEvaluators.h                   |  4 ++--
 Eigen/src/Core/DiagonalMatrix.h                   |  4 ++--
 Eigen/src/Core/GeneralProduct.h                   |  2 +-
 Eigen/src/Core/SelfAdjointView.h                  |  4 ++--
 Eigen/src/Core/SolveTriangular.h                  |  2 +-
 Eigen/src/Core/TriangularMatrix.h                 | 15 ++++++++-------
 .../Core/products/GeneralMatrixMatrixTriangular.h |  6 +++---
 Eigen/src/Core/products/SelfadjointProduct.h      |  2 +-
 Eigen/src/Core/products/SelfadjointRank2Update.h  |  2 +-
 Eigen/src/Core/util/IntegralConstant.h            |  4 ++--
 Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h     |  4 ++--
 Eigen/src/LU/FullPivLU.h                          |  2 --
 Eigen/src/SVD/SVDBase.h                           |  1 -
 13 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 54276b836..15b361b38 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -134,7 +134,7 @@ private:
 // this helper permits to completely eliminate m_outerStride if it is known at compiletime.
 template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
 public:
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+  EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
   {
     EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
     eigen_internal_assert(outerStride==OuterStride);
@@ -145,7 +145,7 @@ public:
 
 template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
 public:
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
   EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
   const Scalar *data;
 protected:
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index ecfdce8ef..4e8297ee6 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
 
     EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-    
+
     EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
     EIGEN_DEVICE_FUNC
@@ -273,7 +273,7 @@ class DiagonalWrapper
   * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
   **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
   return DiagonalWrapper<const Derived>(derived());
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 0f16cd8e3..b206b0a7a 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -428,7 +428,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 template<typename Derived>
 template<typename OtherDerived>
 const Product<Derived,OtherDerived,LazyProduct>
-MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
   enum {
     ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 504c98f0e..7e71fe3c0 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -322,7 +322,7 @@ public:
 /** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
   return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@@ -339,7 +339,7 @@ MatrixBase<Derived>::selfadjointView() const
   */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
   return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 049890b25..a0011d4f9 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 667ef09dc..ed80da36a 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -488,7 +488,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \sa TriangularView::solveInPlace()
       */
     template<int Side, typename Other>
-    EIGEN_DEVICE_FUNC
     inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
     solve(const MatrixBase<Other>& other) const;
 
@@ -554,7 +553,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -564,7 +563,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@@ -573,7 +572,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
@@ -583,7 +582,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
   internal::call_assignment_no_alias(derived(), other.derived());
@@ -598,7 +597,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
   evalToLazy(other.derived());
 }
@@ -624,6 +623,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
   */
 template<typename Derived>
 template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
@@ -633,6 +633,7 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
@@ -930,7 +931,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
   other.derived().resize(this->rows(), this->cols());
   internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 7122efa60..ad38bcf51 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -292,12 +292,12 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
   eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-  
+
   general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-  
+
   return derived();
 }
 
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index f038d686f..39c5b59ff 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
 {
   selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index 2ae364111..d395888e5 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -57,7 +57,7 @@ template<bool Cond, typename T> struct conj_expr_if
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU, typename DerivedV>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
 {
   typedef internal::blas_traits<DerivedU> UBlasTraits;
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index ae41015bd..78a4705cd 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -151,9 +151,9 @@ struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
   static const int value = N;
 };
 
-template<typename T> Index get_runtime_value(const T &x) { return x; }
+template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
 #if !EIGEN_HAS_CXX14
-template<int N> Index get_runtime_value(FixedInt<N> (*)()) { return N; }
+template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
 #endif
 
 // Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index 4fec8af0a..dbbd4806a 100644
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -85,7 +85,7 @@ MatrixBase<Derived>::eigenvalues() const
   * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
   */
 template<typename MatrixType, unsigned int UpLo> 
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
 SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
 {
   typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
@@ -149,7 +149,7 @@ MatrixBase<Derived>::operatorNorm() const
   * \sa eigenvalues(), MatrixBase::operatorNorm()
   */
 template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
 SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
 {
   return eigenvalues().cwiseAbs().maxCoeff();
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 03b6af706..ec61086d5 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -411,11 +411,9 @@ template<typename _MatrixType> class FullPivLU
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
 
     template<bool Conjugate, typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index cc90a3b75..429414797 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -212,7 +212,6 @@ public:
   
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
-  EIGEN_DEVICE_FUNC
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
   #endif
 

From a71943b9a432c8962f025b56313584f33111ace4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 2 Mar 2017 10:47:29 -0800
Subject: [PATCH 23/37] Made the Tensor code compile with clang 3.9

---
 .../CXX11/src/Tensor/TensorContractionCuda.h  | 101 +++++++++---------
 .../CXX11/src/Tensor/TensorReductionCuda.h    |   1 -
 2 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f..c04b784a4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
 
   // prefetch registers
   float4 lhs_pf0, rhs_pf0;
@@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   }
 
 
-#define prefetch_lhs(reg, row, col)                   \
-    if (!CHECK_LHS_BOUNDARY) {                        \
-      if (col < k_size) {                             \
-        reg =lhs.loadPacket<Unaligned>(row, col);     \
-      }                                               \
-    } else {                                          \
-      if (col < k_size) {                             \
-        if (row + 3 < m_size) {                       \
-          reg =lhs.loadPacket<Unaligned>(row, col);   \
-        } else if (row + 2 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-          reg.z =lhs(row + 2, col);                   \
-        } else if (row + 1 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-        } else if (row  < m_size) {                   \
-          reg.x =lhs(row + 0, col);                   \
-        }                                             \
-      }                                               \
-    }                                                 \
+#define prefetch_lhs(reg, row, col)                            \
+    if (!CHECK_LHS_BOUNDARY) {                                 \
+      if (col < k_size) {                                      \
+        reg =lhs.template loadPacket<Unaligned>(row, col);     \
+      }                                                        \
+    } else {                                                   \
+      if (col < k_size) {                                      \
+        if (row + 3 < m_size) {                                \
+          reg =lhs.template loadPacket<Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+          reg.z =lhs(row + 2, col);                            \
+        } else if (row + 1 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+        } else if (row  < m_size) {                            \
+          reg.x =lhs(row + 0, col);                            \
+        }                                                      \
+      }                                                        \
+    }                                                          \
 
 
   Index lhs_vert = base_m+threadIdx.x*4;
@@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
     if (!CHECK_RHS_BOUNDARY) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
       } else if (rhs_vert + 2 < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
     } else {
       if (rhs_horiz0 < n_size) {
         if ((rhs_vert + 3) < k_size) {
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
         } else if ((rhs_vert + 2) < k_size) {
           rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
           rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
 
   // prefetch registers
   float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 
      if (!CHECK_LHS_BOUNDARY) {
       if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
       } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
       } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
       } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
       }
     } else {
       // just CHECK_LHS_BOUNDARY
       if (lhs_vert + 3 < m_size) {
         if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-          lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+          lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
         } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
         } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
         } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
         }
       } else if (lhs_vert + 2 < m_size) {
         if ((threadIdx.y/4+k+24) < k_size) {
@@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
     if (!CHECK_RHS_BOUNDARY) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
       } else if (rhs_vert + 2 < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
       if (rhs_horiz1 < n_size) {
         if ((rhs_vert + 3) < k_size) {
           // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-          rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+          rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
         } else if (rhs_vert + 2 < k_size) {
           // just CHECK_RHS_BOUNDARY
           rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
       } else if (rhs_horiz0 < n_size) {
         if ((rhs_vert + 3) < k_size) {
           // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
         } else if ((rhs_vert + 2) < k_size) {
           // just CHECK_RHS_BOUNDARY
           rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
   typedef float2 LHS_MEM[64][32];
   typedef float2 RHS_MEM[128][8];
 
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 65638b6a8..edb0ab280 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,7 +287,6 @@ struct FullReductionLauncher<
     void>::type> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

From 1c03d43a5cc421da067f12fe02db9eedaa1d125d Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov@google.com>
Date: Mon, 6 Mar 2017 12:01:12 +0100
Subject: [PATCH 24/37] Fixed compilation with cuda-clang

---
 Eigen/src/Core/MathFunctions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 7a6b999af..5ec6c395e 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -512,7 +512,7 @@ namespace std_fallback {
 
 template<typename Scalar>
 struct expm1_impl {
-  static inline Scalar run(const Scalar& x)
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     #if EIGEN_HAS_CXX11_MATH
@@ -549,7 +549,7 @@ namespace std_fallback {
 
 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     #if EIGEN_HAS_CXX11_MATH

From 659087b622e94f7e35a56b7ed2cb01b024c80a7b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 7 Mar 2017 10:02:34 +0100
Subject: [PATCH 25/37] bug #1400: fix stableNorm with
 EIGEN_DONT_ALIGN_STATICALLY

---
 Eigen/src/Core/StableNorm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index d2fe1e199..be04ed44d 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -170,7 +170,8 @@ MatrixBase<Derived>::stableNorm() const
   enum {
     CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
                 || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
-               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
   };
   typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
                                                    typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;

From d9677185250d780b0011202621c90530937049d2 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 7 Mar 2017 10:16:39 +0100
Subject: [PATCH 26/37] do not include std header within extern C

---
 Eigen/src/misc/lapacke.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index 8c7e79b03..dd6fd3b0a 100755
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -43,10 +43,6 @@
 #include "lapacke_config.h"
 #endif
 
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
 #include <stdlib.h>
 
 #ifndef lapack_int
@@ -81,7 +77,7 @@ extern "C" {
 #endif
 
 #ifndef lapack_complex_float_real
-#define lapack_complex_float_real(z)       (creal(z))
+#define lapack_complex_float_rea@l(z)       (creal(z))
 #endif
 
 #ifndef lapack_complex_float_imag
@@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im );
 
 #endif
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
 #ifndef LAPACKE_malloc
 #define LAPACKE_malloc( size ) malloc( size )
 #endif

From e958c2baac15ccf0cf7b7919ef729b118e43a6ed Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 7 Mar 2017 10:47:40 +0100
Subject: [PATCH 27/37] remove UTF8 symbols

---
 Eigen/src/Core/arch/CUDA/Half.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index db9878796..67518da9f 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -13,7 +13,7 @@
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

From 5694315fbb7a2abc157cad284852b5e3df2f9576 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 7 Mar 2017 10:53:47 +0100
Subject: [PATCH 28/37] remove UTF8 symbol

---
 Eigen/src/Core/DenseBase.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 91a8511be..fd933eed4 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
-    /** \ínternal
+    /** \internal
       * Copies \a other into *this without evaluating other. \returns a reference to *this.
       * \deprecated */
     template<typename OtherDerived>

From e5156e4d253fcc8ac13b0131973de3f56e810ab5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 7 Mar 2017 11:25:58 +0100
Subject: [PATCH 29/37] fix typo

---
 Eigen/src/misc/lapacke.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index dd6fd3b0a..3d8e24f5a 100755
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -77,7 +77,7 @@
 #endif
 
 #ifndef lapack_complex_float_real
-#define lapack_complex_float_rea@l(z)       (creal(z))
+#define lapack_complex_float_real(z)       (creal(z))
 #endif
 
 #ifndef lapack_complex_float_imag

From 970ff78294503896940fb760d948f1eed156250f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 8 Mar 2017 16:16:53 +0100
Subject: [PATCH 30/37] bug #1401: fix compilation of "cond ? x : -x" with x an
 AutoDiffScalar

---
 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 50fedf6ac..d2808860c 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -108,7 +108,9 @@ class AutoDiffScalar
     template<typename OtherDerType>
     AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
+    , typename internal::enable_if<
+            internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value
+        &&  internal::is_convertible<OtherDerType,DerType>::value , void*>::type = 0
 #endif
     )
       : m_value(other.value()), m_derivatives(other.derivatives())

From 344c2694a64494721e6d36d1197bde47c7d12af9 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Thu, 9 Mar 2017 15:41:03 -0800
Subject: [PATCH 31/37] Make the non-blocking threadpool more flexible and less
 wasteful of CPU cycles for high-latency use-cases.

* Adds a hint to ThreadPool allowing us to turn off spin waiting. Currently each reader and record yielder op in a graph creates a threadpool with a thread that spins for 1000 iterations through the work stealing loop before yielding. This is wasteful for such ops that process I/O.

* This also changes the number of iterations through the steal loop to be inversely proportional to the number of threads. Since the time of each iteration is proportional to the number of threads, this yields roughly a constant spin time.

* Implement a separate worker loop for the num_threads == 1 case since there is no point in going through the expensive steal loop. Moreover, since Steal() calls PopBack() on the victim queues it might reverse the order in which ops are executed, compared to the order in which they are scheduled, which is usually counter-productive for the types of I/O workloads the single thread pools tend to be used for.

* Store num_threads in a member variable for simplicity and to avoid a data race between the thread creation loop and worker threads calling threads_.size().
---
 .../src/ThreadPool/NonBlockingThreadPool.h    | 163 ++++++++++++------
 .../test/cxx11_non_blocking_thread_pool.cpp   |   7 +-
 2 files changed, 111 insertions(+), 59 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index ed1a761b6..e28afedb4 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,7 +20,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   typedef RunQueue<Task, 1024> Queue;
 
   NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
-      : env_(env),
+      : num_threads_(num_threads),
+        allow_spinning_(true),
+        env_(env),
         threads_(num_threads),
         queues_(num_threads),
         coprimes_(num_threads),
@@ -30,34 +32,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         done_(false),
         cancelled_(false),
         ec_(waiters_) {
-    waiters_.resize(num_threads);
+    Init();
+  }
 
-    // Calculate coprimes of num_threads.
-    // Coprimes are used for a random walk over all threads in Steal
-    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
-    // a walk starting thread index t and calculate num_threads - 1 subsequent
-    // indices as (t + coprime) % num_threads, we will cover all threads without
-    // repetitions (effectively getting a presudo-random permutation of thread
-    // indices).
-    for (int i = 1; i <= num_threads; i++) {
-      unsigned a = i;
-      unsigned b = num_threads;
-      // If GCD(a, b) == 1, then a and b are coprimes.
-      while (b != 0) {
-        unsigned tmp = a;
-        a = b;
-        b = tmp % b;
-      }
-      if (a == 1) {
-        coprimes_.push_back(i);
-      }
-    }
-    for (int i = 0; i < num_threads; i++) {
-      queues_.push_back(new Queue());
-    }
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
-    }
+  NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
+                             Environment env = Environment())
+      : num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        env_(env),
+        threads_(num_threads),
+        queues_(num_threads),
+        coprimes_(num_threads),
+        waiters_(num_threads),
+        blocked_(0),
+        spinning_(0),
+        done_(false),
+        cancelled_(false),
+        ec_(waiters_) {
+    Init();
   }
 
   ~NonBlockingThreadPoolTempl() {
@@ -77,8 +69,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     }
 
     // Join threads explicitly to avoid destruction order issues.
-    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
-    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+    for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
+    for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
   }
 
   void Schedule(std::function<void()> fn) {
@@ -125,7 +117,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
   int NumThreads() const final {
-    return static_cast<int>(threads_.size());
+    return num_threads_;
   }
 
   int CurrentThreadId() const final {
@@ -149,6 +141,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   };
 
   Environment env_;
+  const int num_threads_;
+  const bool allow_spinning_;
   MaxSizeVector<Thread*> threads_;
   MaxSizeVector<Queue*> queues_;
   MaxSizeVector<unsigned> coprimes_;
@@ -159,6 +153,37 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   std::atomic<bool> cancelled_;
   EventCount ec_;
 
+  void Init() {
+    waiters_.resize(num_threads_);
+
+    // Calculate coprimes of num_threads_.
+    // Coprimes are used for a random walk over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a walk starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    for (int i = 1; i <= num_threads_; i++) {
+      unsigned a = i;
+      unsigned b = num_threads_;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
+    }
+    for (int i = 0; i < num_threads_; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads_; i++) {
+      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+  }
+
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
     PerThread* pt = GetPerThread();
@@ -167,36 +192,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     pt->thread_id = thread_id;
     Queue* q = queues_[thread_id];
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    while (!cancelled_) {
-      Task t = q->PopFront();
-      if (!t.f) {
-        t = Steal();
+    // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
+    // to num_threads_ and we assume that new work is scheduled at a
+    // constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since Steal() calls PopBack() on the victim
+      // queues it might reverse the order in which ops are executed compared to
+      // the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q->PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q->PopFront();
+          }
+        }
         if (!t.f) {
-          // Leave one thread spinning. This reduces latency.
-          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
-          // Also, the time it takes to attempt to steal work 1000 times depends
-          // on the size of the thread pool. However the speed at which the user
-          // of the thread pool submit tasks is independent of the size of the
-          // pool. Consider a time based limit instead.
-          if (!spinning_ && !spinning_.exchange(true)) {
-            for (int i = 0; i < 1000 && !t.f; i++) {
-              if (!cancelled_.load(std::memory_order_relaxed)) {
-                t = Steal();
-              } else {
+          if (!WaitForWork(waiter, &t)) {
+            return;
+          }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q->PopFront();
+        if (!t.f) {
+          t = Steal();
+          if (!t.f) {
+            // Leave one thread spinning. This reduces latency.
+            if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+              for (int i = 0; i < spin_count && !t.f; i++) {
+                if (!cancelled_.load(std::memory_order_relaxed)) {
+                  t = Steal();
+                } else {
+                  return;
+                }
+              }
+              spinning_ = false;
+            }
+            if (!t.f) {
+              if (!WaitForWork(waiter, &t)) {
                 return;
               }
             }
-            spinning_ = false;
-          }
-          if (!t.f) {
-            if (!WaitForWork(waiter, &t)) {
-              return;
-            }
           }
         }
-      }
-      if (t.f) {
-        env_.ExecuteTask(t);
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
       }
     }
   }
@@ -244,7 +295,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    if (done_ && blocked_ == threads_.size()) {
+    if (done_ && blocked_ == num_threads_) {
       ec_.CancelWait(waiter);
       // Almost done, but need to re-check queues.
       // Consider that all queues are empty and all worker threads are preempted
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 2c5765ce4..48cd2d4e4 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -23,11 +23,11 @@ static void test_create_destroy_empty_pool()
 }
 
 
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
 {
   // Test we never-ever fail to match available tasks with idle threads.
   const int kThreads = 16;  // code below expects that this is a multiple of 4
-  NonBlockingThreadPool tp(kThreads);
+  NonBlockingThreadPool tp(kThreads, allow_spinning);
   VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
   VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
   for (int iter = 0; iter < 100; ++iter) {
@@ -119,6 +119,7 @@ static void test_cancel()
 void test_cxx11_non_blocking_thread_pool()
 {
   CALL_SUBTEST(test_create_destroy_empty_pool());
-  CALL_SUBTEST(test_parallelism());
+  CALL_SUBTEST(test_parallelism(true));
+  CALL_SUBTEST(test_parallelism(false));
   CALL_SUBTEST(test_cancel());
 }

From d56ab01094d30cf57150b454b8ebaf20ac7bc85e Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 10 Mar 2017 08:30:22 -0800
Subject: [PATCH 32/37] Use C++11 ctor forwarding to simplify code a bit.

---
 .../src/ThreadPool/NonBlockingThreadPool.h    | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index e28afedb4..4fbd3af1e 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -19,22 +19,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
-  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
-      : num_threads_(num_threads),
-        allow_spinning_(true),
-        env_(env),
-        threads_(num_threads),
-        queues_(num_threads),
-        coprimes_(num_threads),
-        waiters_(num_threads),
-        blocked_(0),
-        spinning_(0),
-        done_(false),
-        cancelled_(false),
-        ec_(waiters_) {
-    Init();
-  }
-
   NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
                              Environment env = Environment())
       : num_threads_(num_threads),
@@ -52,6 +36,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     Init();
   }
 
+  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+      : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+
   ~NonBlockingThreadPoolTempl() {
     done_ = true;
 

From bfd7bf9c5b4ebc932a339200bb4a455eadcf6d28 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 10 Mar 2017 08:48:20 -0800
Subject: [PATCH 33/37] Get rid of Init().

---
 .../src/ThreadPool/NonBlockingThreadPool.h    | 66 +++++++++----------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 4fbd3af1e..9dcc9dab7 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -19,6 +19,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
+  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+      : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+
   NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
                              Environment env = Environment())
       : num_threads_(num_threads),
@@ -33,11 +36,35 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         done_(false),
         cancelled_(false),
         ec_(waiters_) {
-    Init();
-  }
+    waiters_.resize(num_threads_);
 
-  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
-      : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+    // Calculate coprimes of num_threads_.
+    // Coprimes are used for a random walk over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a walk starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    for (int i = 1; i <= num_threads_; i++) {
+      unsigned a = i;
+      unsigned b = num_threads_;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
+    }
+    for (int i = 0; i < num_threads_; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads_; i++) {
+      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+  }
 
   ~NonBlockingThreadPoolTempl() {
     done_ = true;
@@ -140,37 +167,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   std::atomic<bool> cancelled_;
   EventCount ec_;
 
-  void Init() {
-    waiters_.resize(num_threads_);
-
-    // Calculate coprimes of num_threads_.
-    // Coprimes are used for a random walk over all threads in Steal
-    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
-    // a walk starting thread index t and calculate num_threads - 1 subsequent
-    // indices as (t + coprime) % num_threads, we will cover all threads without
-    // repetitions (effectively getting a presudo-random permutation of thread
-    // indices).
-    for (int i = 1; i <= num_threads_; i++) {
-      unsigned a = i;
-      unsigned b = num_threads_;
-      // If GCD(a, b) == 1, then a and b are coprimes.
-      while (b != 0) {
-        unsigned tmp = a;
-        a = b;
-        b = tmp % b;
-      }
-      if (a == 1) {
-        coprimes_.push_back(i);
-      }
-    }
-    for (int i = 0; i < num_threads_; i++) {
-      queues_.push_back(new Queue());
-    }
-    for (int i = 0; i < num_threads_; i++) {
-      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
-    }
-  }
-
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
     PerThread* pt = GetPerThread();

From f0f35911181cc7e2089a3319d966dcbd3596461b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 14 Mar 2017 14:16:53 -0700
Subject: [PATCH 34/37] Made the reduction code compile with cuda-clang

---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h  | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c841786b8..e341e2e9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,6 +11,17 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && defined(__CUDA__)
+#define KERNEL_FRIEND friend __global__
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
 namespace Eigen {
 
 
@@ -681,15 +692,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+  template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
-  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+  template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
 #endif
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 
-  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
 #if defined(EIGEN_USE_SYCL)

From 61160a21d23880749bce3b0d630d9880f70af6e5 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Wed, 15 Mar 2017 06:57:25 -0400
Subject: [PATCH 35/37] ARM prefetch fixes: Implement prefetch on ARM64. Do not
 clobber cc on ARM32.

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 84a56bdcc..aede4a6d5 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -51,14 +51,17 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
+  // meaningfully to them.
+  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
   #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
   // by default no explicit prefetching
   #define EIGEN_ARM_PREFETCH(ADDR)

From 89fd0c38812b024734eeacf9c23ed3714c8b0f93 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 15 Mar 2017 15:18:03 +0100
Subject: [PATCH 36/37] better check array index before using it

---
 Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 933cd564b..da85b4d6e 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
 
     /* get pivot column from head of minimum degree list */
-    while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
+    while (min_score < n_col && head [min_score] == COLAMD_EMPTY)
     {
       min_score++ ;
     }

From fd7db52f9b3b1227719c6d8281ff5e5198aaae82 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 15 Mar 2017 20:02:39 -0700
Subject: [PATCH 37/37] Silenced compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index f335edf7d..c46a778b5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,6 +50,7 @@ template <DenseIndex DimId>
 struct DimensionId
 {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {