Mirror of https://gitlab.com/libeigen/eigen.git
Commit 093f2b3c01: Merge.
@@ -76,32 +76,6 @@ public:
#endif
};

// template<typename Lhs, typename Rhs> struct product_tag
// {
// private:
//
// typedef typename remove_all<Lhs>::type _Lhs;
// typedef typename remove_all<Rhs>::type _Rhs;
// enum {
// Rows = _Lhs::RowsAtCompileTime,
// Cols = _Rhs::ColsAtCompileTime,
// Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
// };
//
// enum {
// rows_select = Rows==1 ? int(Rows) : int(Large),
// cols_select = Cols==1 ? int(Cols) : int(Large),
// depth_select = Depth==1 ? int(Depth) : int(Large)
// };
// typedef product_type_selector<rows_select, cols_select, depth_select> selector;
//
// public:
// enum {
// ret = selector::ret
// };
//
// };

/* The following allows to select the kind of product at compile time
 * based on the three dimensions of the product.
 * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
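The commented-out product_tag helper and the product_type_selector specializations that follow encode a compile-time mapping from the three product dimensions, each classified as 1, Small, or Large, to a product kind. A minimal standalone sketch of the same idea, using made-up names (product_kind, Kind) rather than Eigen's actual types:

// Toy illustration, not Eigen's header: classify each dimension, then map the
// triple to a product kind via template specialization.
#include <iostream>

enum Size { One = 1, Small = 2, Large = 3 };
enum Kind { CoeffBased = 0, Gemv = 1, Gemm = 2 };

// Primary template: default to a coefficient-based product.
template <int Rows, int Cols, int Depth>
struct product_kind { enum { ret = CoeffBased }; };

// A couple of illustrative specializations mirroring the pattern below.
template <> struct product_kind<Large, One, Large>   { enum { ret = Gemv }; };
template <> struct product_kind<Large, Large, Large> { enum { ret = Gemm }; };

int main() {
  // Large x Large with a Large inner dimension selects the GEMM path.
  std::cout << product_kind<Large, Large, Large>::ret << "\n"; // prints 2
}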
@@ -125,8 +99,8 @@ template<> struct product_type_selector<Small,Small,Large> { enum
template<> struct product_type_selector<Large,Small,Large> { enum { ret = GemmProduct }; };
template<> struct product_type_selector<Small,Large,Large> { enum { ret = GemmProduct }; };
template<> struct product_type_selector<Large,Large,Large> { enum { ret = GemmProduct }; };
template<> struct product_type_selector<Large,Small,Small> { enum { ret = GemmProduct }; };
template<> struct product_type_selector<Small,Large,Small> { enum { ret = GemmProduct }; };
template<> struct product_type_selector<Large,Small,Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct product_type_selector<Small,Large,Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct product_type_selector<Large,Large,Small> { enum { ret = GemmProduct }; };

} // end namespace internal
@@ -134,6 +134,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
return static_cast<TgtPacket>(a);
}

template <typename SrcPacket, typename TgtPacket>
EIGEN_DEVICE_FUNC inline TgtPacket
pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
return static_cast<TgtPacket>(a);
}

/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
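The overload added in this hunk is the generic fallback for a four-to-one packet conversion: it converts only the first argument and ignores the rest, which is meaningful when the "packets" are plain scalars; vectorized targets supply their own pcast specializations elsewhere. A self-contained restatement of that fallback, for illustration only:

#include <iostream>

// Same shape as the generic overload above: extra source packets are unused.
template <typename SrcPacket, typename TgtPacket>
inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/,
                       const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
  return static_cast<TgtPacket>(a);
}

int main() {
  // Scalar fallback: only the first argument contributes to the result.
  float f = pcast<double, float>(1.5, 2.5, 3.5, 4.5);
  std::cout << f << "\n"; // prints 1.5
}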
@@ -134,7 +134,24 @@ struct lgamma_impl<double> {
 * Implementation of digamma (psi) *
 ****************************************************************************/

#ifdef EIGEN_HAS_C99_MATH
template <typename Scalar>
struct digamma_retval {
typedef Scalar type;
};

#ifndef EIGEN_HAS_C99_MATH

template <typename Scalar>
struct digamma_impl {
EIGEN_DEVICE_FUNC
static Scalar run(Scalar x) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
}
};

#else

/*
 *
@@ -202,14 +219,6 @@ struct digamma_impl_maybe_poly<double> {
}
};

#endif // EIGEN_HAS_C99_MATH

template <typename Scalar>
struct digamma_retval {
typedef Scalar type;
};

#ifdef EIGEN_HAS_C99_MATH
template <typename Scalar>
struct digamma_impl {
EIGEN_DEVICE_FUNC
@@ -178,7 +178,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// We also include a register-level block of the result (mx x nr).
// (In an ideal world only the lhs panel would stay in L1)
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
const Index old_k = k;
if(k>max_kc)
{
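The only change in this hunk is clamping max_kc to at least 1, so that a very small L1 budget can no longer yield a zero blocking size. A quick standalone check of the arithmetic, with made-up cache parameters (l1, k_sub, k_div, k_peeling are illustrative values, not Eigen's defaults):

#include <algorithm>
#include <cstdio>

int main() {
  // Chosen so the unclamped expression rounds down to zero.
  long l1 = 1024, k_sub = 1000, k_div = 8, k_peeling = 8;
  long unclamped = ((l1 - k_sub) / k_div) & ~(k_peeling - 1);                    // 3 & ~7 == 0
  long clamped   = std::max<long>(((l1 - k_sub) / k_div) & ~(k_peeling - 1), 1); // == 1
  std::printf("unclamped=%ld clamped=%ld\n", unclamped, clamped);
}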
@@ -20,6 +20,7 @@
#pragma warning( push )
#endif
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)

#elif defined __INTEL_COMPILER
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
// ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
@@ -32,6 +33,7 @@
#pragma warning push
#endif
#pragma warning disable 2196 279 1684 2259

#elif defined __clang__
// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
// this is really a stupid warning as it warns on compile-time expressions involving enums
@@ -39,6 +41,17 @@
#pragma clang diagnostic push
#endif
#pragma clang diagnostic ignored "-Wconstant-logical-operand"

#elif defined __NVCC__
// Disable the "statement is unreachable" message
#pragma diag_suppress code_is_unreachable
// Disable the "dynamic initialization in unreachable code" message
#pragma diag_suppress initialization_not_reachable
// Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them)
#pragma diag_suppress 2651
#pragma diag_suppress 2653
#pragma diag_suppress 2668
#pragma diag_suppress 2670
#endif

#endif // not EIGEN_WARNINGS_DISABLED
@@ -94,10 +94,6 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
template<typename Decomposition, typename Rhstype> class Solve;
template<typename XprType> class Inverse;

namespace internal {
template<typename Lhs, typename Rhs> struct product_tag;
}

template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;

template<typename Derived> class DiagonalBase;
@@ -8,6 +8,14 @@
#pragma warning pop
#elif defined __clang__
#pragma clang diagnostic pop
#elif defined __NVCC__
// Don't reenable the diagnostic messages, as it turns out these messages need
// to be disabled at the point of the template instantiation (i.e the user code)
// otherwise they'll be triggered by nvcc.
// #pragma diag_default code_is_unreachable
// #pragma diag_default initialization_not_reachable
// #pragma diag_default 2651
// #pragma diag_default 2653
#endif
#endif

@@ -73,8 +73,15 @@ public:
Index m_outerStart;
const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;

public:
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
protected:
// Disable assignment with clear error message.
// Note that simply removing operator= yields compilation errors with ICC+MSVC
template<typename T>
BlockImpl& operator=(const T&)
{
EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
return *this;
}
};

@@ -424,8 +431,6 @@ public:
friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;

Index nonZeros() const { return Dynamic; }

EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)

typename internal::ref_selector<XprType>::non_const_type m_matrix;
const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
@@ -433,6 +438,16 @@ public:
const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;

protected:
// Disable assignment with clear error message.
// Note that simply removing operator= yields compilation errors with ICC+MSVC
template<typename T>
BlockImpl& operator=(const T&)
{
EIGEN_STATIC_ASSERT(sizeof(T)==0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
return *this;
}

};

namespace internal {
@@ -13,32 +13,24 @@

#include "details.h"

// Define the explicit instantiation (e.g. necessary for the Intel compiler)
#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
#define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
#else
#define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
#endif

/**
 * This section contains a convenience MACRO which allows an easy specialization of
 * std::deque such that for data types with alignment issues the correct allocator
 * is used automatically.
 */
#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
namespace std \
{ \
template<typename _Ay> \
class deque<__VA_ARGS__, _Ay> \
template<> \
class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> > \
: public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
{ \
typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
public: \
typedef __VA_ARGS__ value_type; \
typedef typename deque_base::allocator_type allocator_type; \
typedef typename deque_base::size_type size_type; \
typedef typename deque_base::iterator iterator; \
typedef deque_base::allocator_type allocator_type; \
typedef deque_base::size_type size_type; \
typedef deque_base::iterator iterator; \
explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {} \
template<typename InputIterator> \
deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
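The stddeque_overload test added later in this commit shows the intended use of this macro: specialize std::deque for a fixed-size vectorizable Eigen type before instantiating it. A minimal sketch along those lines (assuming the Eigen headers are available; Vector4f matches one of the specializations used in the test):

#include <deque>
#include <Eigen/StdDeque>   // provides EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION
#include <Eigen/Geometry>

// Route std::deque<Eigen::Vector4f> through Eigen's aligned allocator.
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Eigen::Vector4f)

int main() {
  std::deque<Eigen::Vector4f> d;
  d.push_back(Eigen::Vector4f::Ones());   // elements get properly aligned storage
  return d.size() == 1 ? 0 : 1;
}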
@@ -12,32 +12,24 @@

#include "details.h"

// Define the explicit instantiation (e.g. necessary for the Intel compiler)
#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
#define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
#else
#define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
#endif

/**
 * This section contains a convenience MACRO which allows an easy specialization of
 * std::list such that for data types with alignment issues the correct allocator
 * is used automatically.
 */
#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
namespace std \
{ \
template<typename _Ay> \
class list<__VA_ARGS__, _Ay> \
template<> \
class list<__VA_ARGS__, std::allocator<__VA_ARGS__> > \
: public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
{ \
typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
public: \
typedef __VA_ARGS__ value_type; \
typedef typename list_base::allocator_type allocator_type; \
typedef typename list_base::size_type size_type; \
typedef typename list_base::iterator iterator; \
typedef list_base::allocator_type allocator_type; \
typedef list_base::size_type size_type; \
typedef list_base::iterator iterator; \
explicit list(const allocator_type& a = allocator_type()) : list_base(a) {} \
template<typename InputIterator> \
list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
@@ -21,7 +21,7 @@ i.e either row major or column major. The default is column major. Most arithmet
<td> Resize/Reserve</td>
<td>
\code
sm1.resize(m,n); //Change sm1 to a m x n matrix.
sm1.resize(m,n); // Change sm1 to a m x n matrix.
sm1.reserve(nnz); // Allocate room for nnz nonzeros elements.
\endcode
</td>
@@ -151,10 +151,10 @@ It is easy to perform arithmetic operations on sparse matrices provided that the
<td> Permutation </td>
<td>
\code
perm.indices(); // Reference to the vector of indices
perm.indices(); // Reference to the vector of indices
sm1.twistedBy(perm); // Permute rows and columns
sm2 = sm1 * perm; //Permute the columns
sm2 = perm * sm1; // Permute the columns
sm2 = sm1 * perm; // Permute the columns
sm2 = perm * sm1; // Permute the columns
\endcode
</td>
<td>
@@ -181,9 +181,9 @@ sm2 = perm * sm1; // Permute the columns

\section sparseotherops Other supported operations
<table class="manual">
<tr><th>Operations</th> <th> Code </th> <th> Notes</th> </tr>
<tr><th style="min-width:initial"> Code </th> <th> Notes</th> </tr>
<tr><td colspan="2">Sub-matrices</td></tr>
<tr>
<td>Sub-matrices</td>
<td>
\code
sm1.block(startRow, startCol, rows, cols);
@@ -193,25 +193,31 @@ sm2 = perm * sm1; // Permute the columns
sm1.bottomLeftCorner( rows, cols);
sm1.bottomRightCorner( rows, cols);
\endcode
</td> <td> </td>
</td><td>
Contrary to dense matrices, here <strong>all these methods are read-only</strong>.\n
See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices.
</td>
</tr>
<tr>
<td> Range </td>
<tr class="alt"><td colspan="2"> Range </td></tr>
<tr class="alt">
<td>
\code
sm1.innerVector(outer);
sm1.innerVectors(start, size);
sm1.leftCols(size);
sm2.rightCols(size);
sm1.middleRows(start, numRows);
sm1.middleCols(start, numCols);
sm1.col(j);
sm1.innerVector(outer); // RW
sm1.innerVectors(start, size); // RW
sm1.leftCols(size); // RW
sm2.rightCols(size); // RO because sm2 is row-major
sm1.middleRows(start, numRows); // RO because sm1 is column-major
sm1.middleCols(start, numCols); // RW
sm1.col(j); // RW
\endcode
</td>
<td>An inner vector is either a row (for row-major) or a column (for column-major). As stated earlier, the evaluation can be done in a matrix with different storage order </td>
<td>
An inner vector is either a row (for row-major) or a column (for column-major).\n
As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done in a matrix with different storage order.
</td>
</tr>
<tr><td colspan="2"> Triangular and selfadjoint views</td></tr>
<tr>
<td> Triangular and selfadjoint views</td>
<td>
\code
sm2 = sm1.triangularview<Lower>();
@@ -222,26 +228,30 @@ sm2 = perm * sm1; // Permute the columns
\code
\endcode </td>
</tr>
<tr>
<td>Triangular solve </td>
<tr class="alt"><td colspan="2">Triangular solve </td></tr>
<tr class="alt">
<td>
\code
dv2 = sm1.triangularView<Upper>().solve(dv1);
dv2 = sm1.topLeftCorner(size, size).triangularView<Lower>().solve(dv1);
dv2 = sm1.topLeftCorner(size, size)
.triangularView<Lower>().solve(dv1);
\endcode
</td>
<td> For general sparse solve, Use any suitable module described at \ref TopicSparseSystems </td>
</tr>
<tr><td colspan="2"> Low-level API</td></tr>
<tr>
<td> Low-level API</td>
<td>
\code
sm1.valuePtr(); // Pointer to the values
sm1.innerIndextr(); // Pointer to the indices.
sm1.outerIndexPtr(); //Pointer to the beginning of each inner vector
sm1.valuePtr(); // Pointer to the values
sm1.innerIndextr(); // Pointer to the indices.
sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
\endcode
</td>
<td> If the matrix is not in compressed form, makeCompressed() should be called before. Note that these functions are mostly provided for interoperability purposes with external libraries. A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
<td>
If the matrix is not in compressed form, makeCompressed() should be called before.\n
Note that these functions are mostly provided for interoperability purposes with external libraries.\n
A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
</tr>
</table>
*/
@@ -241,11 +241,11 @@ In the following \em sm denotes a sparse matrix, \em sv a sparse vector, \em dm
sm1.real() sm1.imag() -sm1 0.5*sm1
sm1+sm2 sm1-sm2 sm1.cwiseProduct(sm2)
\endcode
However, a strong restriction is that the storage orders must match. For instance, in the following example:
However, <strong>a strong restriction is that the storage orders must match</strong>. For instance, in the following example:
\code
sm4 = sm1 + sm2 + sm3;
\endcode
sm1, sm2, and sm3 must all be row-major or all column major.
sm1, sm2, and sm3 must all be row-major or all column-major.
On the other hand, there is no restriction on the target matrix sm4.
For instance, this means that for computing \f$ A^T + A \f$, the matrix \f$ A^T \f$ must be evaluated into a temporary matrix of compatible storage order:
\code
|
||||
\endcode
|
||||
|
||||
|
||||
\subsection TutorialSparse_SubMatrices Block operations
|
||||
|
||||
Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction.
|
||||
However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as <tt>block(...)</tt> and <tt>corner*(...)</tt>. The available API for write-access to a SparseMatrix are summarized below:
|
||||
\code
|
||||
SparseMatrix<double,ColMajor> sm1;
|
||||
sm1.col(j) = ...;
|
||||
sm1.leftCols(ncols) = ...;
|
||||
sm1.middleCols(j,ncols) = ...;
|
||||
sm1.rightCols(ncols) = ...;
|
||||
|
||||
SparseMatrix<double,RowMajor> sm2;
|
||||
sm2.row(i) = ...;
|
||||
sm2.topRows(nrows) = ...;
|
||||
sm2.middleRows(i,nrows) = ...;
|
||||
sm2.bottomRows(nrows) = ...;
|
||||
\endcode
|
||||
|
||||
In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage.
|
||||
|
||||
\subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views
|
||||
|
||||
Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side:
|
||||
|
@@ -226,7 +226,9 @@ ei_add_test(geo_homogeneous)
ei_add_test(stdvector)
ei_add_test(stdvector_overload)
ei_add_test(stdlist)
ei_add_test(stdlist_overload)
ei_add_test(stddeque)
ei_add_test(stddeque_overload)
ei_add_test(sparse_basic)
ei_add_test(sparse_block)
ei_add_test(sparse_vector)
test/stddeque_overload.cpp (new file, 159 lines)
@@ -0,0 +1,159 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
|
||||
// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#include <Eigen/StdDeque>
|
||||
#include <Eigen/Geometry>
|
||||
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Vector4f)
|
||||
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix2f)
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4f)
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Matrix4d)
|
||||
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3f)
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Affine3d)
|
||||
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaternionf)
|
||||
EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
|
||||
|
||||
template<typename MatrixType>
|
||||
void check_stddeque_matrix(const MatrixType& m)
|
||||
{
|
||||
typename MatrixType::Index rows = m.rows();
|
||||
typename MatrixType::Index cols = m.cols();
|
||||
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
|
||||
std::deque<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
|
||||
v[5] = x;
|
||||
w[6] = v[5];
|
||||
VERIFY_IS_APPROX(w[6], v[5]);
|
||||
v = w;
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(w[i], v[i]);
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
v[20] = x;
|
||||
VERIFY_IS_APPROX(v[20], x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(v[21], y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(v[22], x);
|
||||
VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
|
||||
|
||||
// do a lot of push_back such that the deque gets internally resized
|
||||
// (with memory reallocation)
|
||||
MatrixType* ref = &w[0];
|
||||
for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
|
||||
v.push_back(w[i%w.size()]);
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY(v[i]==w[(i-23)%w.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TransformType>
|
||||
void check_stddeque_transform(const TransformType&)
|
||||
{
|
||||
typedef typename TransformType::MatrixType MatrixType;
|
||||
TransformType x(MatrixType::Random()), y(MatrixType::Random());
|
||||
std::deque<TransformType> v(10), w(20, y);
|
||||
v[5] = x;
|
||||
w[6] = v[5];
|
||||
VERIFY_IS_APPROX(w[6], v[5]);
|
||||
v = w;
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(w[i], v[i]);
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
v[20] = x;
|
||||
VERIFY_IS_APPROX(v[20], x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(v[21], y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(v[22], x);
|
||||
|
||||
// do a lot of push_back such that the deque gets internally resized
|
||||
// (with memory reallocation)
|
||||
TransformType* ref = &w[0];
|
||||
for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
|
||||
v.push_back(w[i%w.size()]);
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY(v[i].matrix()==w[(i-23)%w.size()].matrix());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename QuaternionType>
|
||||
void check_stddeque_quaternion(const QuaternionType&)
|
||||
{
|
||||
typedef typename QuaternionType::Coefficients Coefficients;
|
||||
QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
|
||||
std::deque<QuaternionType> v(10), w(20, y);
|
||||
v[5] = x;
|
||||
w[6] = v[5];
|
||||
VERIFY_IS_APPROX(w[6], v[5]);
|
||||
v = w;
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(w[i], v[i]);
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
v[20] = x;
|
||||
VERIFY_IS_APPROX(v[20], x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(v[21], y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(v[22], x);
|
||||
|
||||
// do a lot of push_back such that the deque gets internally resized
|
||||
// (with memory reallocation)
|
||||
QuaternionType* ref = &w[0];
|
||||
for(int i=0; i<30 || ((ref==&w[0]) && i<300); ++i)
|
||||
v.push_back(w[i%w.size()]);
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY(v[i].coeffs()==w[(i-23)%w.size()].coeffs());
|
||||
}
|
||||
}
|
||||
|
||||
void test_stddeque_overload()
|
||||
{
|
||||
// some non vectorizable fixed sizes
|
||||
CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
|
||||
CALL_SUBTEST_1(check_stddeque_matrix(Matrix3f()));
|
||||
CALL_SUBTEST_2(check_stddeque_matrix(Matrix3d()));
|
||||
|
||||
// some vectorizable fixed sizes
|
||||
CALL_SUBTEST_1(check_stddeque_matrix(Matrix2f()));
|
||||
CALL_SUBTEST_1(check_stddeque_matrix(Vector4f()));
|
||||
CALL_SUBTEST_1(check_stddeque_matrix(Matrix4f()));
|
||||
CALL_SUBTEST_2(check_stddeque_matrix(Matrix4d()));
|
||||
|
||||
// some dynamic sizes
|
||||
CALL_SUBTEST_3(check_stddeque_matrix(MatrixXd(1,1)));
|
||||
CALL_SUBTEST_3(check_stddeque_matrix(VectorXd(20)));
|
||||
CALL_SUBTEST_3(check_stddeque_matrix(RowVectorXf(20)));
|
||||
CALL_SUBTEST_3(check_stddeque_matrix(MatrixXcf(10,10)));
|
||||
|
||||
// some Transform
|
||||
CALL_SUBTEST_4(check_stddeque_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
|
||||
CALL_SUBTEST_4(check_stddeque_transform(Affine3f()));
|
||||
CALL_SUBTEST_4(check_stddeque_transform(Affine3d()));
|
||||
|
||||
// some Quaternion
|
||||
CALL_SUBTEST_5(check_stddeque_quaternion(Quaternionf()));
|
||||
CALL_SUBTEST_5(check_stddeque_quaternion(Quaterniond()));
|
||||
}
|
test/stdlist_overload.cpp (new file, 192 lines)
@@ -0,0 +1,192 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
|
||||
// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#include <Eigen/StdList>
|
||||
#include <Eigen/Geometry>
|
||||
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Vector4f)
|
||||
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix2f)
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4f)
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Matrix4d)
|
||||
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3f)
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Affine3d)
|
||||
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaternionf)
|
||||
EIGEN_DEFINE_STL_LIST_SPECIALIZATION(Quaterniond)
|
||||
|
||||
template <class Container, class Position>
|
||||
typename Container::iterator get(Container & c, Position position)
|
||||
{
|
||||
typename Container::iterator it = c.begin();
|
||||
std::advance(it, position);
|
||||
return it;
|
||||
}
|
||||
|
||||
template <class Container, class Position, class Value>
|
||||
void set(Container & c, Position position, const Value & value)
|
||||
{
|
||||
typename Container::iterator it = c.begin();
|
||||
std::advance(it, position);
|
||||
*it = value;
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
void check_stdlist_matrix(const MatrixType& m)
|
||||
{
|
||||
typename MatrixType::Index rows = m.rows();
|
||||
typename MatrixType::Index cols = m.cols();
|
||||
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
|
||||
std::list<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
|
||||
typename std::list<MatrixType>::iterator itv = get(v, 5);
|
||||
typename std::list<MatrixType>::iterator itw = get(w, 6);
|
||||
*itv = x;
|
||||
*itw = *itv;
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
v = w;
|
||||
itv = v.begin();
|
||||
itw = w.begin();
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
++itv;
|
||||
++itw;
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
set(v, 20, x);
|
||||
VERIFY_IS_APPROX(*get(v, 20), x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(*get(v, 21), y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(*get(v, 22), x);
|
||||
|
||||
// do a lot of push_back such that the list gets internally resized
|
||||
// (with memory reallocation)
|
||||
MatrixType* ref = &(*get(w, 0));
|
||||
for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
|
||||
v.push_back(*get(w, i%w.size()));
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY((*get(v, i))==(*get(w, (i-23)%w.size())));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TransformType>
|
||||
void check_stdlist_transform(const TransformType&)
|
||||
{
|
||||
typedef typename TransformType::MatrixType MatrixType;
|
||||
TransformType x(MatrixType::Random()), y(MatrixType::Random());
|
||||
std::list<TransformType> v(10), w(20, y);
|
||||
typename std::list<TransformType>::iterator itv = get(v, 5);
|
||||
typename std::list<TransformType>::iterator itw = get(w, 6);
|
||||
*itv = x;
|
||||
*itw = *itv;
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
v = w;
|
||||
itv = v.begin();
|
||||
itw = w.begin();
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
++itv;
|
||||
++itw;
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
set(v, 20, x);
|
||||
VERIFY_IS_APPROX(*get(v, 20), x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(*get(v, 21), y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(*get(v, 22), x);
|
||||
|
||||
// do a lot of push_back such that the list gets internally resized
|
||||
// (with memory reallocation)
|
||||
TransformType* ref = &(*get(w, 0));
|
||||
for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
|
||||
v.push_back(*get(w, i%w.size()));
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY(get(v, i)->matrix()==get(w, (i-23)%w.size())->matrix());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename QuaternionType>
|
||||
void check_stdlist_quaternion(const QuaternionType&)
|
||||
{
|
||||
typedef typename QuaternionType::Coefficients Coefficients;
|
||||
QuaternionType x(Coefficients::Random()), y(Coefficients::Random());
|
||||
std::list<QuaternionType> v(10), w(20, y);
|
||||
typename std::list<QuaternionType>::iterator itv = get(v, 5);
|
||||
typename std::list<QuaternionType>::iterator itw = get(w, 6);
|
||||
*itv = x;
|
||||
*itw = *itv;
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
v = w;
|
||||
itv = v.begin();
|
||||
itw = w.begin();
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
VERIFY_IS_APPROX(*itw, *itv);
|
||||
++itv;
|
||||
++itw;
|
||||
}
|
||||
|
||||
v.resize(21);
|
||||
set(v, 20, x);
|
||||
VERIFY_IS_APPROX(*get(v, 20), x);
|
||||
v.resize(22,y);
|
||||
VERIFY_IS_APPROX(*get(v, 21), y);
|
||||
v.push_back(x);
|
||||
VERIFY_IS_APPROX(*get(v, 22), x);
|
||||
|
||||
// do a lot of push_back such that the list gets internally resized
|
||||
// (with memory reallocation)
|
||||
QuaternionType* ref = &(*get(w, 0));
|
||||
for(int i=0; i<30 || ((ref==&(*get(w, 0))) && i<300); ++i)
|
||||
v.push_back(*get(w, i%w.size()));
|
||||
for(unsigned int i=23; i<v.size(); ++i)
|
||||
{
|
||||
VERIFY(get(v, i)->coeffs()==get(w, (i-23)%w.size())->coeffs());
|
||||
}
|
||||
}
|
||||
|
||||
void test_stdlist_overload()
|
||||
{
|
||||
// some non vectorizable fixed sizes
|
||||
CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
|
||||
CALL_SUBTEST_1(check_stdlist_matrix(Matrix3f()));
|
||||
CALL_SUBTEST_2(check_stdlist_matrix(Matrix3d()));
|
||||
|
||||
// some vectorizable fixed sizes
|
||||
CALL_SUBTEST_1(check_stdlist_matrix(Matrix2f()));
|
||||
CALL_SUBTEST_1(check_stdlist_matrix(Vector4f()));
|
||||
CALL_SUBTEST_1(check_stdlist_matrix(Matrix4f()));
|
||||
CALL_SUBTEST_2(check_stdlist_matrix(Matrix4d()));
|
||||
|
||||
// some dynamic sizes
|
||||
CALL_SUBTEST_3(check_stdlist_matrix(MatrixXd(1,1)));
|
||||
CALL_SUBTEST_3(check_stdlist_matrix(VectorXd(20)));
|
||||
CALL_SUBTEST_3(check_stdlist_matrix(RowVectorXf(20)));
|
||||
CALL_SUBTEST_3(check_stdlist_matrix(MatrixXcf(10,10)));
|
||||
|
||||
// some Transform
|
||||
CALL_SUBTEST_4(check_stdlist_transform(Affine2f())); // does not need the specialization (2+1)^2 = 9
|
||||
CALL_SUBTEST_4(check_stdlist_transform(Affine3f()));
|
||||
CALL_SUBTEST_4(check_stdlist_transform(Affine3d()));
|
||||
|
||||
// some Quaternion
|
||||
CALL_SUBTEST_5(check_stdlist_quaternion(Quaternionf()));
|
||||
CALL_SUBTEST_5(check_stdlist_quaternion(Quaterniond()));
|
||||
}
|
@ -165,10 +165,10 @@ template <typename T> class array<T, 0> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array() { }
|
||||
EIGEN_STRONG_INLINE array() : dummy() { }
|
||||
|
||||
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) {
|
||||
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
|
||||
eigen_assert(l.size() == 0);
|
||||
}
|
||||
#endif
|
||||
|
@ -344,7 +344,7 @@ class TensorContractionSubMapper {
|
||||
enum {
|
||||
// We can use direct offsets iff the parent mapper supports then and we can compute the strides.
|
||||
// TODO: we should also enable direct offsets for the Rhs case.
|
||||
UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets
|
||||
UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
|
||||
|
@ -86,6 +86,26 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
|
||||
const TensorEvaluator& m_impl;
|
||||
};
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
|
||||
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
|
||||
PacketConverter(const TensorEvaluator& impl)
|
||||
: m_impl(impl) {}
|
||||
|
||||
template<int LoadMode, typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
|
||||
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
|
||||
|
||||
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
|
||||
SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
|
||||
SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
|
||||
SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
|
||||
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
const TensorEvaluator& m_impl;
|
||||
};
|
||||
|
||||
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
|
||||
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
|
||||
|
@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
|
||||
struct GpuDevice {
|
||||
// The StreamInterface is not owned: the caller is
|
||||
// responsible for its initialization and eventual destruction.
|
||||
explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
|
||||
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
|
||||
// TODO(bsteiner): This is an internal API, we should not expose it.
|
||||
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
|
||||
return stream_->stream();
|
||||
@ -246,6 +248,10 @@ struct GpuDevice {
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
// This function checks if the CUDA runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
inline bool ok() const {
|
||||
@ -259,7 +265,7 @@ struct GpuDevice {
|
||||
|
||||
private:
|
||||
const StreamInterface* stream_;
|
||||
|
||||
int max_blocks_;
|
||||
};
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
|
@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
|
||||
if (needs_assign)
|
||||
{
|
||||
const int block_size = device.maxCudaThreadsPerBlock();
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
|
||||
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
|
||||
@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
|
||||
if (needs_assign)
|
||||
{
|
||||
const int block_size = device.maxCudaThreadsPerBlock();
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
|
||||
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
|
||||
|
@ -33,18 +33,19 @@ struct TensorUInt128
|
||||
HIGH high;
|
||||
LOW low;
|
||||
|
||||
template<typename OTHER_HIGH, typename OTHER_LOW>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(int32_t x) : high(0), low(x) {
|
||||
TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), "high too wide");
|
||||
EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), "low too wide");
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
explicit TensorUInt128(const T& x) : high(0), low(x) {
|
||||
eigen_assert(x >= 0);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(uint32_t x) : high(0), low(x) { }
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(int64_t x) : high(0), low(x) {
|
||||
eigen_assert(x >= 0);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(uint64_t x) : high(0), low(x) { }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }
|
||||
|
||||
|
@ -158,7 +158,7 @@ if(CUDA_FOUND)
|
||||
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
||||
set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
|
||||
endif()
|
||||
set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30")
|
||||
set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30 -Xcudafe \"--display_error_number\"")
|
||||
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
|
||||
|
@ -22,16 +22,16 @@ using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
static void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
|
||||
Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
|
||||
Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
|
||||
Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
|
||||
t_left.setRandom();
|
||||
@ -84,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
cudaFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_m() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(k, 128, 128);
|
||||
test_cuda_contraction<RowMajor>(k, 128, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_k() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(128, k, 128);
|
||||
test_cuda_contraction<RowMajor>(128, k, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_n() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(128, 128, k);
|
||||
test_cuda_contraction<RowMajor>(128, 128, k);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_sizes() {
|
||||
int m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257 , 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int n_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
95, 96, 127, 129, 255,
|
||||
257, 511, 512, 513, 1023,
|
||||
1024, 1025};
|
||||
|
||||
for (int i = 0; i < 15; i++) {
|
||||
for (int j = 0; j < 15; j++) {
|
||||
for (int k = 0; k < 17; k++) {
|
||||
test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_cuda()
|
||||
{
|
||||
std::cout << "Calling contraction tests" << std::endl;
|
||||
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
|
||||
for (int k = 32; k < 256; k++) {
|
||||
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
|
||||
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
|
||||
}
|
||||
for (int k = 32; k < 256; k++) {
|
||||
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
|
||||
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
|
||||
}
|
||||
for (int k = 32; k < 256; k++) {
|
||||
CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
|
||||
CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
|
||||
}
|
||||
CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
|
||||
|
||||
int m_sizes[] = {31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025 };
|
||||
int n_sizes[] = {31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025 };
|
||||
CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
|
||||
CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
|
||||
|
||||
int k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
95, 96, 127, 129, 255,
|
||||
257, 511, 512, 513, 1023,
|
||||
1024, 1025};
|
||||
CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
|
||||
CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
|
||||
|
||||
for (int i = 0; i <15; i++)
|
||||
for (int j = 0; j < 15; j++)
|
||||
for (int k = 0; k < 17; k++) {
|
||||
CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
|
||||
CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
|
||||
}
|
||||
CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
|
||||
CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
|
||||
|
||||
CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
|
||||
CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
|
||||
}
|
||||
|
@ -109,19 +109,19 @@ struct GPUContext {
|
||||
|
||||
// The actual expression to evaluate
|
||||
template <typename Context>
|
||||
static void test_contextual_eval(Context* context)
|
||||
void test_contextual_eval(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
static void test_forced_contextual_eval(Context* context)
|
||||
void test_forced_contextual_eval(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
static void test_compound_assignment(Context* context)
|
||||
void test_compound_assignment(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = context->in1().constant(2.718f);
|
||||
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
|
||||
@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context)
|
||||
|
||||
|
||||
template <typename Context>
|
||||
static void test_contraction(Context* context)
|
||||
void test_contraction(Context* context)
|
||||
{
|
||||
Eigen::array<std::pair<int, int>, 2> dims;
|
||||
dims[0] = std::make_pair(1, 1);
|
||||
@ -145,7 +145,7 @@ static void test_contraction(Context* context)
|
||||
|
||||
|
||||
template <typename Context>
|
||||
static void test_1d_convolution(Context* context)
|
||||
void test_1d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,70);
|
||||
@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context)
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
static void test_2d_convolution(Context* context)
|
||||
void test_2d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,69);
|
||||
@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context)
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
static void test_3d_convolution(Context* context)
|
||||
void test_3d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(39,49,69);
|
||||
@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context)
|
||||
}
|
||||
|
||||
|
||||
static void test_cpu() {
|
||||
void test_cpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
@ -267,7 +267,7 @@ static void test_cpu() {
|
||||
}
|
||||
}
|
||||
|
||||
static void test_gpu() {
|
||||
void test_gpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
|
@ -127,7 +127,7 @@ void test_misc2() {
|
||||
|
||||
TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
|
||||
uint64_t actual = static_cast<uint64_t>(result);
|
||||
VERIFY_EQUAL(actual, expected);
|
||||
VERIFY_IS_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|