Commit f48011119e: Merged eigen/eigen into default
Mirror of https://gitlab.com/libeigen/eigen.git
Eigen/src/Core/AssignEvaluator.h: 0 line changes (executable file → normal file)
Eigen/src/Core/Assign_MKL.h: 0 line changes (executable file → normal file)
@@ -60,6 +60,23 @@ template<typename T> struct GenericNumTraits
     MulCost = 1
   };
 
+  // Division is messy but important, because it is expensive and throughput
+  // varies significantly. The following numbers are based on min division
+  // throughput on Haswell.
+  template<bool Vectorized>
+  struct Div {
+    enum {
+#ifdef EIGEN_VECTORIZE_AVX
+      AVX = true,
+#else
+      AVX = false,
+#endif
+      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
+                         Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
+    };
+  };
+
+
   typedef T Real;
   typedef typename internal::conditional<
     IsInteger,
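The Div trait added above gives Eigen's cost model a way to distinguish integer, scalar and vectorized division. A minimal sketch of how it can be queried once this patch is in place (illustrative only, not part of the commit; the expected values follow from the Cost expression above):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // Non-vectorized float division: IsInteger is false, so the cost is 8.
      std::cout << Eigen::NumTraits<float>::Div<false>::Cost << "\n";
      // 64-bit signed integer division is the most expensive entry in the table (24).
      std::cout << Eigen::NumTraits<long long>::Div<false>::Cost << "\n";
      return 0;
    }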
Eigen/src/Core/ProductEvaluators.h: 0 line changes (executable file → normal file)
@@ -576,7 +576,7 @@ struct igammac_impl {
     pkm1 = pk;
     qkm2 = qkm1;
     qkm1 = qk;
-    if (abs(pk) > big) {
+    if (numext::abs(pk) > big) {
       pkm2 *= biginv;
       pkm1 *= biginv;
       qkm2 *= biginv;
Eigen/src/Core/VectorwiseOp.h: 0 line changes (executable file → normal file)
@@ -63,38 +63,69 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);
 
 // Class definition.
 struct half : public __half {
-  EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
+  EIGEN_DEVICE_FUNC half() {}
 
-  // TODO(sesse): Should these conversions be marked as explicit?
-  EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
-  EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
-  EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
-  EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const volatile half& h)
       : __half(internal::raw_uint16_to_half(h.x)) {}
 
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  explicit EIGEN_DEVICE_FUNC half(int i)
+      : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
+  explicit EIGEN_DEVICE_FUNC half(long l)
+      : __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
+  explicit EIGEN_DEVICE_FUNC half(long long ll)
+      : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : __half(internal::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(double d)
+      : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
+
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
     // +0.0 and -0.0 become false, everything else becomes true.
     return static_cast<bool>(x & 0x7fff);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
     return static_cast<signed char>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
     return static_cast<unsigned char>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
     return static_cast<short>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
     return static_cast<unsigned short>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
     return static_cast<int>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
     return static_cast<unsigned int>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
     return static_cast<long>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
     return static_cast<unsigned long>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
     return static_cast<long long>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
     return static_cast<unsigned long long>(internal::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return internal::half_to_float(*this);
+    return static_cast<double>(internal::half_to_float(*this));
   }
 
   EIGEN_DEVICE_FUNC half& operator=(const half& other) {
     x = other.x;
     return *this;
   }
   EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
     x = other.x;
     return *this;
   }
   EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
     x = other.x;
     return *this;
   }
 };
 
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
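With the constructors above now explicit, numeric values no longer convert to half implicitly. A short usage sketch (illustrative, not part of the commit; it assumes Eigen::half is reachable through the unsupported CXX11 Tensor module, which pulls in this header):

    #include <unsupported/Eigen/CXX11/Tensor>

    void half_example() {
      Eigen::half a(1.5f);                  // construction from float must be spelled out
      Eigen::half b(2);                     // ...and so must construction from int
      float f = static_cast<float>(a);      // goes through EIGEN_EXPLICIT_CAST(float)
      bool nonzero = static_cast<bool>(b);  // +0.0 and -0.0 map to false, everything else to true
      (void)f; (void)nonzero;
    }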
@@ -203,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
 
 #endif  // Emulate support for half floats
 
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+}
+
 // Conversion routines, including fallbacks for the host or older CUDA.
 // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
 // these in hardware. If we need more performance on older/other CPUs, they are
@@ -341,4 +378,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
 
 } // end namespace std
 
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+}
+
+#endif
+
+
 #endif // EIGEN_HALF_CUDA_H
@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
 #else
-  half r1 = a.x;
-  half r2 = a.y;
+  half r1 = static_cast<half>(a.x);
+  half r2 = static_cast<half>(a.y);
   half2 r;
   r.x = 0;
   r.x |= r1.x;
@@ -238,7 +238,13 @@ template<typename Scalar> struct scalar_hypot_op {
 };
 template<typename Scalar>
 struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+  enum
+  {
+    Cost = 3 * NumTraits<Scalar>::AddCost +
+           2 * NumTraits<Scalar>::MulCost +
+           2 * NumTraits<Scalar>::template Div<false>::Cost,
+    PacketAccess = false
+  };
 };
 
 /** \internal
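For example, with the default float traits (AddCost = MulCost = 1) and the non-vectorized division cost of 8 from the Div trait introduced above, the estimate becomes 3*1 + 2*1 + 2*8 = 21, replacing the previous flat guess of 5 * MulCost = 5.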
@@ -297,9 +303,10 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
   enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
+    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
+    Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
   };
 };
 
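Under the same model, a non-vectorized float quotient is now rated at Div<false>::Cost = 8 instead of the old MulCost-based estimate of 2, and a vectorized double quotient at 16 with AVX (8 without), per the Div table above.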
@@ -564,6 +571,10 @@ struct scalar_inverse_mult_op {
   { return internal::pdiv(pset1<Packet>(m_other),a); }
   Scalar m_other;
 };
+template<typename Scalar>
+struct functor_traits<scalar_inverse_mult_op<Scalar> >
+{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
+
 
 } // end namespace internal
 
Eigen/src/Core/products/GeneralMatrixVector_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/Eigenvalues/ComplexSchur_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/Eigenvalues/GeneralizedEigenSolver.h: 0 line changes (executable file → normal file)
Eigen/src/Eigenvalues/RealQZ.h: 0 line changes (executable file → normal file)
Eigen/src/Eigenvalues/RealSchur_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/PardisoSupport/PardisoSupport.h: 0 line changes (executable file → normal file)
Eigen/src/QR/ColPivHouseholderQR_MKL.h: 0 line changes (executable file → normal file)
Eigen/src/SVD/JacobiSVD.h: 0 line changes (executable file → normal file)
Eigen/src/SparseLU/SparseLU.h: 0 line changes (executable file → normal file)
@@ -333,7 +333,7 @@ template <typename Device, typename T> class BenchmarkSuite {
 
 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array<TensorIndex, 1> sum_along_dim;
-    sum_along_dim = 1;
+    sum_along_dim[0] = 1;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -356,7 +356,7 @@ template <typename Device, typename T> class BenchmarkSuite {
     input_size[1] = n_;
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
         b_, input_size);
-    const Eigen::array<TensorIndex, 0> output_size;
+    Eigen::array<TensorIndex, 0> output_size;
     TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
 
@@ -12,7 +12,7 @@
     StopBenchmarkTiming(); \
     Eigen::CudaStreamDevice stream; \
     Eigen::GpuDevice device(&stream); \
-    BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
     cudaDeviceSynchronize(); \
     suite.FUNC(iters); \
   } \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
     StopBenchmarkTiming(); \
     Eigen::CudaStreamDevice stream; \
     Eigen::GpuDevice device(&stream); \
-    BenchmarkSuite<Eigen::GpuDevice, half> suite(device, D1, D2, D3); \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
     cudaDeviceSynchronize(); \
     suite.FUNC(iters); \
   } \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
     StopBenchmarkTiming(); \
     Eigen::CudaStreamDevice stream; \
     Eigen::GpuDevice device(&stream); \
-    BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
     cudaDeviceSynchronize(); \
     suite.FUNC(iters, DIM1, DIM2); \
   } \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
 BM_FuncWithKernelDimsGPU(convolution, 4, 7);
 BM_FuncWithKernelDimsGPU(convolution, 7, 64);
 BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-*/
\ No newline at end of file
+*/
@@ -58,6 +58,10 @@
 #define isnan(X) please_protect_your_isnan_with_parentheses
 #define isinf(X) please_protect_your_isinf_with_parentheses
 #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI
 
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
@@ -331,11 +335,13 @@ inline bool test_isApprox(const std::complex<double>& a, const std::complex<doub
 inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
 
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
 { return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
 inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
+#endif
 #endif
 
 #ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const long double& a, const long double& b)
@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
 #endif
 
 #ifdef EIGEN_USE_THREADS
+#include <atomic>
 #include <condition_variable>
 #include <deque>
 #include <mutex>
@@ -13,7 +13,7 @@
 
 
 // The array class is only available starting with cxx11. Emulate our own here
-// if needed.
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
 #if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
 
@@ -607,7 +607,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
     extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
                            const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
-                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
     }
 
@@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
                            const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
                            const Index padding_top_z, const Index padding_bottom_z,
                            const Index padding_top, const Index padding_bottom,
-                           const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
     }
 
@@ -20,7 +20,7 @@ template<typename Scalar, typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper, bool needs_edge_check>
 __device__ EIGEN_STRONG_INLINE void
 EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
                                const Index m_size, const Index n_size, const Index k_size) {
 
   const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   Scalar rrow(7);
 
   // Now x corresponds to k, y to m, and z to n
-  const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-  const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+  const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+  const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
 
 #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
 #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
 EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output,
                        const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ volatile Scalar lhs_shmem[72 * 64];
-  __shared__ volatile Scalar rhs_shmem[72 * 64];
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
 
   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
@@ -110,55 +110,90 @@ class ThreadPool : public ThreadPoolInterface {
   };
 
   std::mutex mu_;
-  std::vector<std::thread*> threads_;  // All threads
-  std::vector<Waiter*> waiters_;  // Stack of waiting threads.
+  MaxSizeVector<std::thread*> threads_;  // All threads
+  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
   std::deque<std::function<void()>> pending_;  // Queue of pending work
   std::condition_variable empty_;  // Signaled on pending_.empty()
   bool exiting_ = false;
 };
 
 
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object.
-// but only one caller must call Notify() on the object.
-class Notification {
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
  public:
-  Notification() : notified_(false) {}
-  ~Notification() {}
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() {
+    eigen_assert((state_>>1) == 0);
+  }
 
   void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
     std::unique_lock<std::mutex> l(mu_);
     eigen_assert(!notified_);
     notified_ = true;
     cv_.notify_all();
   }
 
-  void WaitForNotification() {
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
     std::unique_lock<std::mutex> l(mu_);
-    cv_.wait(l, [this]() { return notified_; } );
+    while (!notified_) {
+      cv_.wait(l);
+    }
   }
 
 private:
   std::mutex mu_;
   std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
   bool notified_;
 };
 
 
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
-template <typename Function, typename... Args> struct FunctionWrapper
+template <typename Function, typename... Args> struct FunctionWrapperWithNotification
 {
   static void run(Notification* n, Function f, Args... args) {
     f(args...);
-    n->Notify();
+    if (n) {
+      n->Notify();
+    }
   }
 };
 
-static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   if (n) {
-    n->WaitForNotification();
+    n->Wait();
   }
 }
 
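A small usage sketch of the new Barrier (illustrative, not part of the commit; it assumes the class is reachable the same way the notification test below reaches Eigen::Notification, i.e. by defining EIGEN_USE_THREADS and including the CXX11 Tensor module):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <thread>
    #include <vector>

    int main() {
      const unsigned int kTasks = 4;
      Eigen::Barrier barrier(kTasks);        // expects Notify() to be called kTasks times
      std::vector<std::thread> workers;
      for (unsigned int i = 0; i < kTasks; ++i) {
        workers.emplace_back([&barrier]() {
          // ... do some per-task work here ...
          barrier.Notify();                  // decrements the outstanding count
        });
      }
      barrier.Wait();                        // returns once all kTasks notifications arrived
      for (std::thread& t : workers) t.join();
      return 0;
    }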
@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
     std::function<void()> func =
-      std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
+      std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
     pool_->Schedule(func);
     return n;
   }
 
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
+                                                Args&&... args) const {
+    std::function<void()> func = std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
+    pool_->Schedule(func);
+  }
+
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
     std::function<void()> func = std::bind(f, args...);
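The hunks that follow all adopt the same pattern: create one Barrier per batch of tasks, enqueue each task with enqueue_with_barrier, then block on Wait(). A condensed sketch of that pattern (illustrative, not part of the commit; process_chunk and run_in_parallel are placeholder names):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    static void process_chunk(int chunk) {
      // ... work on one chunk of the problem ...
      (void)chunk;
    }

    void run_in_parallel(const Eigen::ThreadPoolDevice& device, int num_chunks) {
      Eigen::Barrier barrier(static_cast<unsigned int>(num_chunks));
      for (int i = 0; i < num_chunks; ++i) {
        device.enqueue_with_barrier(&barrier, &process_chunk, i);
      }
      barrier.Wait();  // all chunks have run once this returns
    }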
@@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
@@ -125,22 +125,18 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
       int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
       const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const Index numblocks = size / blocksize;
+      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
 
-      MaxSizeVector<Notification*> results(numblocks);
-      for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
+      Barrier barrier(numblocks);
+      for (unsigned int i = 0; i < numblocks; ++i) {
+        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
       }
 
       if (numblocks * blocksize < size) {
         EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
       }
 
-      for (int i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-
+      barrier.Wait();
     }
     evaluator.cleanup();
   }
@@ -25,7 +25,20 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+
+/** \internal
+  * \brief Template functor to compute the modulo between 2 arrays.
+  */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
 
 
 /** \internal
@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
 template <typename Scalar, typename Device>
 struct PacketType {
   typedef typename internal::packet_traits<Scalar>::type type;
-  static const int size = internal::unpacket_traits<type>::size;
+  enum { size = internal::unpacket_traits<type>::size };
 };
 
 // For CUDA packet types when using a GpuDevice
@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
 {
   public:
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
@@ -143,7 +142,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.template packet<LoadMode>(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
 {
   public:
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
@@ -253,15 +253,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       return;
     } else {
       const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-      const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+      const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
       eigen_assert(num_coeffs >= numblocks * blocksize);
 
-      MaxSizeVector<Notification*> results(numblocks);
+      Barrier barrier(numblocks);
       MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-      for (Index i = 0; i < numblocks; ++i) {
-        results.push_back(
-            device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
-                           i * blocksize, blocksize, reducer, &shards[i]));
+      for (unsigned int i = 0; i < numblocks; ++i) {
+        device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
+                                    i * blocksize, blocksize, reducer, &shards[i]);
       }
 
       typename Self::CoeffReturnType finalShard;
@@ -271,11 +270,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       } else {
         finalShard = reducer.initialize();
       }
-      for (Index i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-      for (Index i = 0; i < numblocks; ++i) {
+      barrier.Wait();
+      for (unsigned int i = 0; i < numblocks; ++i) {
         reducer.reduce(shards[i], &finalShard);
       }
       *output = reducer.finalize(finalShard);
@@ -304,15 +300,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
       return;
     }
     const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    MaxSizeVector<Notification*> results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-    for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
-                                       self, i * blocksize, blocksize, reducer,
-                                       &shards[i]));
+    for (unsigned int i = 0; i < numblocks; ++i) {
+      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+                                  self, i * blocksize, blocksize, reducer,
+                                  &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
     if (numblocks * blocksize < num_coeffs) {
@@ -322,11 +318,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
       finalShard = reducer.initialize();
     }
 
-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
-    for (Index i = 0; i < numblocks; ++i) {
+    barrier.Wait();
+    for (unsigned int i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
@@ -180,11 +180,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompa
 
 inline void glRotate(const Rotation2D<float>& rot)
 {
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
 }
 inline void glRotate(const Rotation2D<double>& rot)
 {
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
 }
 
 template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
@@ -11,10 +11,6 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM
 
-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
 namespace Eigen {
 
 namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
 
@@ -298,8 +298,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_argmax)
   ei_add_test(cxx11_tensor_shuffling)
   ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
@@ -175,9 +176,14 @@ endif()
 # These tests needs nvcc
 find_package(CUDA 7.0)
 if(CUDA_FOUND AND EIGEN_TEST_NVCC)
-  # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
 
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// TODO(mdevin): Free the cuda memory.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
 
@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
   CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
 
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on where only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
   CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_4(test_cuda_digamma<float>());
 
+
   CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_5(test_cuda_igamma<double>());
   CALL_SUBTEST_5(test_cuda_igammac<double>());
+#endif
 }
unsupported/test/cxx11_tensor_notification.cpp: 81 lines added (new file)
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <stdlib.h>
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include <unistd.h>
+#endif
+
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, int* counter) {
+  n->Wait();
+  *counter = *counter + 1;
+}
+
+}  // namespace
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+void test_cxx11_tensor_notification()
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}
@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }
 
-/*
+
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_float2);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
-}*/
+}
 
 
 void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
     CALL_SUBTEST_1(test_cuda_conversion());
     CALL_SUBTEST_1(test_cuda_unary());
     CALL_SUBTEST_1(test_cuda_elementwise());
-    // CALL_SUBTEST_2(test_cuda_contractions());
+    CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
   }
   else {
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
 
   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * M_PI)
-    scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
+    scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
   else
     scaledA = A;
 
@@ -24,7 +24,7 @@ void test2dRotation(double tol)
     s = std::sin(angle);
     B << c, s, -s, c;
 
-    C = Apow(std::ldexp(angle,1) / M_PI);
+    C = Apow(std::ldexp(angle,1) / EIGEN_PI);
     std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n';
     VERIFY(C.isApprox(B, tol));
   }