Merged eigen/eigen into default

Konstantinos Margaritis 2016-03-28 01:48:45 +03:00
commit f48011119e
45 changed files with 327 additions and 110 deletions

Eigen/src/Core/AssignEvaluator.h Executable file → Normal file

Eigen/src/Core/Assign_MKL.h Executable file → Normal file

@@ -60,6 +60,23 @@ template<typename T> struct GenericNumTraits
MulCost = 1
};
// Division is messy but important, because it is expensive and throughput
// varies significantly. The following numbers are based on min division
// throughput on Haswell.
template<bool Vectorized>
struct Div {
enum {
#ifdef EIGEN_VECTORIZE_AVX
AVX = true,
#else
AVX = false,
#endif
Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
};
};
typedef T Real;
typedef typename internal::conditional<
IsInteger,
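
The Div trait added in this hunk can be queried directly; the quotient and modulo functors updated further down in this commit consume it the same way. A minimal host-side sketch, assuming a checkout that already contains the hunk above (include path and printed values are illustrative only):

#include <iostream>
#include <Eigen/Core>

int main() {
  // Compile-time division-cost estimates from the Div trait defined above.
  // The bool parameter selects the vectorized (packet) estimate vs. the scalar one.
  std::cout << "float scalar div cost:     " << Eigen::NumTraits<float>::Div<false>::Cost << "\n";
  std::cout << "float vectorized div cost: " << Eigen::NumTraits<float>::Div<true>::Cost << "\n";
  std::cout << "int scalar div cost:       " << Eigen::NumTraits<int>::Div<false>::Cost << "\n";
  return 0;
}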

Eigen/src/Core/ProductEvaluators.h Executable file → Normal file

@@ -576,7 +576,7 @@ struct igammac_impl {
pkm1 = pk;
qkm2 = qkm1;
qkm1 = qk;
if (abs(pk) > big) {
if (numext::abs(pk) > big) {
pkm2 *= biginv;
pkm1 *= biginv;
qkm2 *= biginv;

Eigen/src/Core/VectorwiseOp.h Executable file → Normal file

@@ -63,38 +63,69 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);
// Class definition.
struct half : public __half {
EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
EIGEN_DEVICE_FUNC half() {}
// TODO(sesse): Should these conversions be marked as explicit?
EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
EIGEN_DEVICE_FUNC half(bool b)
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
EIGEN_DEVICE_FUNC half(const volatile half& h)
: __half(internal::raw_uint16_to_half(h.x)) {}
explicit EIGEN_DEVICE_FUNC half(bool b)
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
explicit EIGEN_DEVICE_FUNC half(int i)
: __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
explicit EIGEN_DEVICE_FUNC half(long l)
: __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
explicit EIGEN_DEVICE_FUNC half(long long ll)
: __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: __half(internal::float_to_half_rtne(f)) {}
explicit EIGEN_DEVICE_FUNC half(double d)
: __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
// +0.0 and -0.0 become false, everything else becomes true.
return static_cast<bool>(x & 0x7fff);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
return static_cast<signed char>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
return static_cast<unsigned char>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
return static_cast<short>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
return static_cast<unsigned short>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
return static_cast<int>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
return static_cast<unsigned int>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
return static_cast<long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
return static_cast<unsigned long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
return static_cast<long long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
return static_cast<unsigned long long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
return internal::half_to_float(*this);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
return internal::half_to_float(*this);
return static_cast<double>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
x = other.x;
return *this;
}
EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
x = other.x;
return *this;
}
EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
x = other.x;
return *this;
}
};
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
@@ -203,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
@@ -341,4 +378,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
}
#endif
#endif // EIGEN_HALF_CUDA_H
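
Since the constructors above are now explicit and the casts go through EIGEN_EXPLICIT_CAST, arithmetic-type conversions must be spelled out, and dividing by an Index runs through the float-precision overload added in this file. A small host-side sketch of the resulting usage; the include path is an assumption (Eigen::half is declared in the CUDA half header, which the CXX11 Tensor module pulls in):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::half h(1.5f);                      // explicit construction from float
  float f = static_cast<float>(h);          // explicit cast back to float
  Eigen::half third = h / Eigen::Index(3);  // divides in full float precision internally
  std::cout << f << " " << static_cast<float>(third) << "\n";  // 1.5 0.5
  return 0;
}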


@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __float22half2_rn(make_float2(a.x, a.y));
#else
half r1 = a.x;
half r2 = a.y;
half r1 = static_cast<half>(a.x);
half r2 = static_cast<half>(a.y);
half2 r;
r.x = 0;
r.x |= r1.x;


@@ -238,7 +238,13 @@ template<typename Scalar> struct scalar_hypot_op {
};
template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
enum
{
Cost = 3 * NumTraits<Scalar>::AddCost +
2 * NumTraits<Scalar>::MulCost +
2 * NumTraits<Scalar>::template Div<false>::Cost,
PacketAccess = false
};
};
/** \internal
@@ -297,9 +303,10 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
};
};
@@ -564,6 +571,10 @@ struct scalar_inverse_mult_op {
{ return internal::pdiv(pset1<Packet>(m_other),a); }
Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_inverse_mult_op<Scalar> >
{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
} // end namespace internal

Eigen/src/Core/products/GeneralMatrixVector_MKL.h Executable file → Normal file
Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h Executable file → Normal file
Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/ComplexSchur_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/GeneralizedEigenSolver.h Executable file → Normal file
Eigen/src/Eigenvalues/RealQZ.h Executable file → Normal file
Eigen/src/Eigenvalues/RealSchur_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h Executable file → Normal file
Eigen/src/PardisoSupport/PardisoSupport.h Executable file → Normal file
Eigen/src/QR/ColPivHouseholderQR_MKL.h Executable file → Normal file
Eigen/src/SVD/JacobiSVD.h Executable file → Normal file
Eigen/src/SparseLU/SparseLU.h Executable file → Normal file


@@ -333,7 +333,7 @@ template <typename Device, typename T> class BenchmarkSuite {
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 1> sum_along_dim;
sum_along_dim = 1;
sum_along_dim[0] = 1;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
@@ -356,7 +356,7 @@ template <typename Device, typename T> class BenchmarkSuite {
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
const Eigen::array<TensorIndex, 0> output_size;
Eigen::array<TensorIndex, 0> output_size;
TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);


@@ -12,7 +12,7 @@
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, D1, D2, D3); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
} \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
*/
*/


@@ -58,6 +58,10 @@
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
#ifdef M_PI
#undef M_PI
#endif
#define M_PI please_use_EIGEN_PI_instead_of_M_PI
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
@@ -331,11 +335,13 @@ inline bool test_isApprox(const std::complex<double>& a, const std::complex<doub
inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
#ifndef EIGEN_TEST_NO_LONGDOUBLE
inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
{ return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
#endif
#endif
#ifndef EIGEN_TEST_NO_LONGDOUBLE
inline bool test_isApprox(const long double& a, const long double& b)


@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_THREADS
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>


@@ -13,7 +13,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed.
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
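Code written against Eigen::array is unaffected by which branch of this #if is taken; the emulated class and std::array expose the same element access used throughout the Tensor module. A minimal sketch (the out-of-tree include path is an assumption):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Works with either the emulated array (old MSVC, nvcc) or std::array underneath.
  Eigen::array<Eigen::Index, 3> dims;
  dims[0] = 2; dims[1] = 3; dims[2] = 4;
  Eigen::Tensor<float, 3> t(dims[0], dims[1], dims[2]);
  t.setZero();
  std::cout << t.size() << "\n";  // 24
  return 0;
}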


@@ -607,7 +607,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
}
@@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
const Index padding_top_z, const Index padding_bottom_z,
const Index padding_top, const Index padding_bottom,
const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
}


@@ -20,7 +20,7 @@ template<typename Scalar, typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper, bool needs_edge_check>
__device__ EIGEN_STRONG_INLINE void
EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
const Index m_size, const Index n_size, const Index k_size) {
const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
Scalar rrow(7);
// Now x corresponds to k, y to m, and z to n
const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) {
__shared__ volatile Scalar lhs_shmem[72 * 64];
__shared__ volatile Scalar rhs_shmem[72 * 64];
__shared__ Scalar lhs_shmem[72 * 64];
__shared__ Scalar rhs_shmem[72 * 64];
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;


@@ -27,7 +27,7 @@ class ThreadPoolInterface {
class ThreadPool : public ThreadPoolInterface {
public:
// Construct a pool that contains "num_threads" threads.
explicit ThreadPool(int num_threads) {
explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
for (int i = 0; i < num_threads; i++) {
threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
}
@@ -110,55 +110,90 @@ class ThreadPool : public ThreadPoolInterface {
};
std::mutex mu_;
std::vector<std::thread*> threads_; // All threads
std::vector<Waiter*> waiters_; // Stack of waiting threads.
MaxSizeVector<std::thread*> threads_; // All threads
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
std::deque<std::function<void()>> pending_; // Queue of pending work
std::condition_variable empty_; // Signaled on pending_.empty()
bool exiting_ = false;
};
// Notification is an object that allows a user to to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object.
// but only one caller must call Notify() on the object.
class Notification {
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
public:
Notification() : notified_(false) {}
~Notification() {}
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
eigen_assert(((count << 1) >> 1) == count);
}
~Barrier() {
eigen_assert((state_>>1) == 0);
}
void Notify() {
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
if (v != 1) {
eigen_assert(((v + 2) & ~1) != 0);
return; // either count has not dropped to 0, or waiter is not waiting
}
std::unique_lock<std::mutex> l(mu_);
eigen_assert(!notified_);
notified_ = true;
cv_.notify_all();
}
void WaitForNotification() {
void Wait() {
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
if ((v >> 1) == 0) return;
std::unique_lock<std::mutex> l(mu_);
cv_.wait(l, [this]() { return notified_; } );
while (!notified_) {
cv_.wait(l);
}
}
private:
std::mutex mu_;
std::condition_variable cv_;
std::atomic<unsigned int> state_; // low bit is waiter flag
bool notified_;
};
// Notification is an object that allows a user to to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one caller must call Notify() on the object.
struct Notification : Barrier {
Notification() : Barrier(1) {};
};
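
A minimal host-side sketch of the intended Barrier usage, mirroring what the TensorExecutor and FullReducer changes below do with enqueue_with_barrier: each unit of work calls Notify() exactly once, and Wait() blocks until the count reaches zero. Plain std::thread is used here purely for illustration; the EIGEN_USE_THREADS guard and the out-of-tree include path are assumptions:

#define EIGEN_USE_THREADS
#include <iostream>
#include <thread>
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  const unsigned int num_tasks = 4;
  Eigen::Barrier barrier(num_tasks);
  std::vector<int> results(num_tasks, 0);
  std::vector<std::thread> workers;
  for (unsigned int i = 0; i < num_tasks; ++i) {
    workers.emplace_back([i, &results, &barrier]() {
      results[i] = static_cast<int>(i) * 2;  // the actual work
      barrier.Notify();                      // exactly one Notify() per task
    });
  }
  barrier.Wait();  // returns once all num_tasks notifications have arrived
  for (std::thread& w : workers) w.join();
  std::cout << results[0] << " " << results[3] << "\n";  // 0 6
  return 0;
}
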
// Runs an arbitrary function and then calls Notify() on the passed in
// Notification.
template <typename Function, typename... Args> struct FunctionWrapper
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
{
static void run(Notification* n, Function f, Args... args) {
f(args...);
n->Notify();
if (n) {
n->Notify();
}
}
};
static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
{
static void run(Barrier* b, Function f, Args... args) {
f(args...);
if (b) {
b->Notify();
}
}
};
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
if (n) {
n->WaitForNotification();
n->Wait();
}
}
@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
std::function<void()> func =
std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
pool_->Schedule(func);
return n;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
std::function<void()> func = std::bind(
&FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
pool_->Schedule(func);
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
std::function<void()> func = std::bind(f, args...);


@@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
EIGEN_UNUSED_VARIABLE(scalar);
eigen_assert(scalar == NULL);
return m_impl.evalSubExprsIfNeeded(m_buffer);
}


@@ -125,22 +125,18 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
MaxSizeVector<Notification*> results(numblocks);
for (int i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
Barrier barrier(numblocks);
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
}
if (numblocks * blocksize < size) {
EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
}
for (int i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
barrier.Wait();
}
evaluator.cleanup();
}


@@ -25,7 +25,20 @@ struct scalar_mod_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
/** \internal
* \brief Template functor to compute the modulo between 2 arrays.
*/
template <typename Scalar>
struct scalar_mod2_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
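
scalar_mod2_op is an internal functor, so the sketch below applies it explicitly through TensorBase::binaryExpr() just to show what it computes; an integral Scalar is assumed since operator% is not defined for floating-point types, and the include path is the usual out-of-tree one:

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<int, 1> a(5), b(5);
  a.setValues({10, 11, 12, 13, 14});
  b.setValues({3, 3, 5, 5, 7});
  // Elementwise a % b via the internal functor (illustrative use of internal API).
  Eigen::Tensor<int, 1> c = a.binaryExpr(b, Eigen::internal::scalar_mod2_op<int>());
  for (int i = 0; i < 5; ++i) std::cout << c(i) << " ";  // 1 2 2 3 0
  std::cout << "\n";
  return 0;
}
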
/** \internal


@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
template <typename Scalar, typename Device>
struct PacketType {
typedef typename internal::packet_traits<Scalar>::type type;
static const int size = internal::unpacket_traits<type>::size;
enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice


@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
{
public:
typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
@@ -143,7 +142,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
{
public:
typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;


@@ -253,15 +253,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
return;
} else {
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
MaxSizeVector<Notification*> results(numblocks);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
results.push_back(
device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
i * blocksize, blocksize, reducer, &shards[i]));
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
i * blocksize, blocksize, reducer, &shards[i]);
}
typename Self::CoeffReturnType finalShard;
@@ -271,11 +270,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
} else {
finalShard = reducer.initialize();
}
for (Index i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
for (Index i = 0; i < numblocks; ++i) {
barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
@@ -304,15 +300,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
return;
}
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
MaxSizeVector<Notification*> results(numblocks);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]));
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
if (numblocks * blocksize < num_coeffs) {
@@ -322,11 +318,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
finalShard = reducer.initialize();
}
for (Index i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
for (Index i = 0; i < numblocks; ++i) {
barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);


@@ -180,11 +180,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompa
inline void glRotate(const Rotation2D<float>& rot)
{
glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
}
inline void glRotate(const Rotation2D<double>& rot)
{
glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
}
template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)


@@ -11,10 +11,6 @@
#ifndef EIGEN_MATRIX_LOGARITHM
#define EIGEN_MATRIX_LOGARITHM
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279503L
#endif
namespace Eigen {
namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
else
{
// computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
}
}


@@ -298,8 +298,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
ComplexScalar logCurr = log(curr);
ComplexScalar logPrev = log(prev);
int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
}


@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_argmax)
ei_add_test(cxx11_tensor_shuffling)
ei_add_test(cxx11_tensor_striding)
ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_ref)
ei_add_test(cxx11_tensor_random)
@@ -175,9 +176,14 @@ endif()
# These tests needs nvcc
find_package(CUDA 7.0)
if(CUDA_FOUND AND EIGEN_TEST_NVCC)
# Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
# in the CUDA runtime
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")


@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// TODO(mdevin): Free the cuda memory.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU


@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
#if __cplusplus > 199711L
// std::erf, std::erfc, and so on where only added in c++11. We use them
// as a golden reference to validate the results produced by Eigen. Therefore
// we can only run these tests if we use a c++11 compiler.
CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_4(test_cuda_digamma<float>());
CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_5(test_cuda_igamma<double>());
CALL_SUBTEST_5(test_cuda_igammac<double>());
#endif
}


@@ -0,0 +1,81 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_USE_THREADS
#include <stdlib.h>
#include "main.h"
#include <Eigen/CXX11/Tensor>
#if EIGEN_OS_WIN || EIGEN_OS_WIN64
#include <windows.h>
void sleep(int seconds) {
Sleep(seconds*1000);
}
#else
#include <unistd.h>
#endif
namespace {
void WaitAndAdd(Eigen::Notification* n, int* counter) {
n->Wait();
*counter = *counter + 1;
}
} // namespace
static void test_notification_single()
{
ThreadPool thread_pool(1);
int counter = 0;
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
sleep(1);
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
// Unblock the thread
n.Notify();
sleep(1);
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
}
// Like test_notification_single() but enqueues multiple threads to
// validate that all threads get notified by Notify().
static void test_notification_multiple()
{
ThreadPool thread_pool(1);
int counter = 0;
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
sleep(1);
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
sleep(1);
VERIFY_IS_EQUAL(counter, 4);
}
void test_cxx11_tensor_notification()
{
CALL_SUBTEST(test_notification_single());
CALL_SUBTEST(test_notification_multiple());
}


@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
gpu_device.deallocate(d_res_float);
}
/*
void test_cuda_contractions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}*/
}
void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
CALL_SUBTEST_1(test_cuda_conversion());
CALL_SUBTEST_1(test_cuda_unary());
CALL_SUBTEST_1(test_cuda_elementwise());
// CALL_SUBTEST_2(test_cuda_contractions());
CALL_SUBTEST_2(test_cuda_contractions());
CALL_SUBTEST_3(test_cuda_reductions());
}
else {


@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
MatrixType scaledA;
RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
if (maxImagPartOfSpectrum >= 0.9 * M_PI)
scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
else
scaledA = A;


@@ -24,7 +24,7 @@ void test2dRotation(double tol)
s = std::sin(angle);
B << c, s, -s, c;
C = Apow(std::ldexp(angle,1) / M_PI);
C = Apow(std::ldexp(angle,1) / EIGEN_PI);
std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n';
VERIFY(C.isApprox(B, tol));
}