Merged eigen/eigen into default

Konstantinos Margaritis 2016-03-28 01:48:45 +03:00
commit f48011119e
45 changed files with 327 additions and 110 deletions

Eigen/src/Core/AssignEvaluator.h Executable file → Normal file

Eigen/src/Core/Assign_MKL.h Executable file → Normal file

@@ -60,6 +60,23 @@ template<typename T> struct GenericNumTraits
MulCost = 1
};
// Division is messy but important, because it is expensive and throughput
// varies significantly. The following numbers are based on min division
// throughput on Haswell.
template<bool Vectorized>
struct Div {
enum {
#ifdef EIGEN_VECTORIZE_AVX
AVX = true,
#else
AVX = false,
#endif
Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
};
};
typedef T Real;
typedef typename internal::conditional<
IsInteger,
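
The Div trait added in this hunk can be queried directly; the quotient and modulo functors updated further down in this commit consume it the same way. A minimal host-side sketch, assuming a checkout that already contains the hunk above (include path and printed values are illustrative only):

#include <iostream>
#include <Eigen/Core>

int main() {
  // Compile-time division-cost estimates from the Div trait defined above.
  // The bool parameter selects the vectorized (packet) estimate vs. the scalar one.
  std::cout << "float scalar div cost:     " << Eigen::NumTraits<float>::Div<false>::Cost << "\n";
  std::cout << "float vectorized div cost: " << Eigen::NumTraits<float>::Div<true>::Cost << "\n";
  std::cout << "int scalar div cost:       " << Eigen::NumTraits<int>::Div<false>::Cost << "\n";
  return 0;
}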

Eigen/src/Core/ProductEvaluators.h Executable file → Normal file

@@ -576,7 +576,7 @@ struct igammac_impl {
pkm1 = pk;
qkm2 = qkm1;
qkm1 = qk;
if (abs(pk) > big) {
if (numext::abs(pk) > big) {
pkm2 *= biginv;
pkm1 *= biginv;
qkm2 *= biginv;

Eigen/src/Core/VectorwiseOp.h Executable file → Normal file

@@ -63,38 +63,69 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);
// Class definition.
struct half : public __half {
EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
EIGEN_DEVICE_FUNC half() {}
// TODO(sesse): Should these conversions be marked as explicit?
EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
EIGEN_DEVICE_FUNC half(bool b)
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
EIGEN_DEVICE_FUNC half(const volatile half& h)
: __half(internal::raw_uint16_to_half(h.x)) {}
explicit EIGEN_DEVICE_FUNC half(bool b)
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
explicit EIGEN_DEVICE_FUNC half(int i)
: __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
explicit EIGEN_DEVICE_FUNC half(long l)
: __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
explicit EIGEN_DEVICE_FUNC half(long long ll)
: __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: __half(internal::float_to_half_rtne(f)) {}
explicit EIGEN_DEVICE_FUNC half(double d)
: __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
// +0.0 and -0.0 become false, everything else becomes true.
return static_cast<bool>(x & 0x7fff);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
return static_cast<signed char>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
return static_cast<unsigned char>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
return static_cast<short>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
return static_cast<unsigned short>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
return static_cast<int>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
return static_cast<unsigned int>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
return static_cast<long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
return static_cast<unsigned long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
return static_cast<long long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
return static_cast<unsigned long long>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
return internal::half_to_float(*this);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
return internal::half_to_float(*this);
return static_cast<double>(internal::half_to_float(*this));
}
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
x = other.x;
return *this;
}
EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
x = other.x;
return *this;
}
EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
x = other.x;
return *this;
}
};
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
@@ -203,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
@@ -341,4 +378,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
}
#endif
#endif // EIGEN_HALF_CUDA_H
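
Since the constructors above are now explicit and the casts go through EIGEN_EXPLICIT_CAST, arithmetic-type conversions must be spelled out, and dividing by an Index runs through the float-precision overload added in this file. A small host-side sketch of the resulting usage; the include path is an assumption (Eigen::half is declared in the CUDA half header, which the CXX11 Tensor module pulls in):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::half h(1.5f);                      // explicit construction from float
  float f = static_cast<float>(h);          // explicit cast back to float
  Eigen::half third = h / Eigen::Index(3);  // divides in full float precision internally
  std::cout << f << " " << static_cast<float>(third) << "\n";  // 1.5 0.5
  return 0;
}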


@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __float22half2_rn(make_float2(a.x, a.y));
#else
half r1 = a.x;
half r2 = a.y;
half r1 = static_cast<half>(a.x);
half r2 = static_cast<half>(a.y);
half2 r;
r.x = 0;
r.x |= r1.x;


@@ -238,7 +238,13 @@ template<typename Scalar> struct scalar_hypot_op {
};
template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
enum
{
Cost = 3 * NumTraits<Scalar>::AddCost +
2 * NumTraits<Scalar>::MulCost +
2 * NumTraits<Scalar>::template Div<false>::Cost,
PacketAccess = false
};
};
/** \internal
@@ -297,9 +303,10 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
};
};
@@ -564,6 +571,10 @@ struct scalar_inverse_mult_op {
{ return internal::pdiv(pset1<Packet>(m_other),a); }
Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_inverse_mult_op<Scalar> >
{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
} // end namespace internal

Eigen/src/Core/products/GeneralMatrixVector_MKL.h Executable file → Normal file
Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h Executable file → Normal file
Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/ComplexSchur_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/GeneralizedEigenSolver.h Executable file → Normal file
Eigen/src/Eigenvalues/RealQZ.h Executable file → Normal file
Eigen/src/Eigenvalues/RealSchur_MKL.h Executable file → Normal file
Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h Executable file → Normal file
Eigen/src/PardisoSupport/PardisoSupport.h Executable file → Normal file
Eigen/src/QR/ColPivHouseholderQR_MKL.h Executable file → Normal file
Eigen/src/SVD/JacobiSVD.h Executable file → Normal file
Eigen/src/SparseLU/SparseLU.h Executable file → Normal file


@@ -333,7 +333,7 @@ template <typename Device, typename T> class BenchmarkSuite {
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 1> sum_along_dim;
sum_along_dim = 1;
sum_along_dim[0] = 1;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
@@ -356,7 +356,7 @@ template <typename Device, typename T> class BenchmarkSuite {
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
const Eigen::array<TensorIndex, 0> output_size;
Eigen::array<TensorIndex, 0> output_size;
TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);


@@ -12,7 +12,7 @@
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, D1, D2, D3); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, half> suite(device, N); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
} \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
*/
*/


@@ -58,6 +58,10 @@
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
#ifdef M_PI
#undef M_PI
#endif
#define M_PI please_use_EIGEN_PI_instead_of_M_PI
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
@@ -331,11 +335,13 @@ inline bool test_isApprox(const std::complex<double>& a, const std::complex<doub
inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
#ifndef EIGEN_TEST_NO_LONGDOUBLE
inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
{ return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
#endif
#endif
#ifndef EIGEN_TEST_NO_LONGDOUBLE
inline bool test_isApprox(const long double& a, const long double& b)


@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_THREADS
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>


@@ -13,7 +13,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed.
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
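Code written against Eigen::array is unaffected by which branch of this #if is taken; the emulated class and std::array expose the same element access used throughout the Tensor module. A minimal sketch (the out-of-tree include path is an assumption):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Works with either the emulated array (old MSVC, nvcc) or std::array underneath.
  Eigen::array<Eigen::Index, 3> dims;
  dims[0] = 2; dims[1] = 3; dims[2] = 4;
  Eigen::Tensor<float, 3> t(dims[0], dims[1], dims[2]);
  t.setZero();
  std::cout << t.size() << "\n";  // 24
  return 0;
}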


@@ -607,7 +607,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
}
@@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
const Index padding_top_z, const Index padding_bottom_z,
const Index padding_top, const Index padding_bottom,
const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
}


@@ -20,7 +20,7 @@ template<typename Scalar, typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper, bool needs_edge_check>
__device__ EIGEN_STRONG_INLINE void
EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
const Index m_size, const Index n_size, const Index k_size) {
const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
Scalar rrow(7);
// Now x corresponds to k, y to m, and z to n
const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) {
__shared__ volatile Scalar lhs_shmem[72 * 64];
__shared__ volatile Scalar rhs_shmem[72 * 64];
__shared__ Scalar lhs_shmem[72 * 64];
__shared__ Scalar rhs_shmem[72 * 64];
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;


@@ -27,7 +27,7 @@ class ThreadPoolInterface {
class ThreadPool : public ThreadPoolInterface {
public:
// Construct a pool that contains "num_threads" threads.
explicit ThreadPool(int num_threads) {
explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
for (int i = 0; i < num_threads; i++) {
threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
}
@@ -110,55 +110,90 @@ class ThreadPool : public ThreadPoolInterface {
};
std::mutex mu_;
std::vector<std::thread*> threads_; // All threads
std::vector<Waiter*> waiters_; // Stack of waiting threads.
MaxSizeVector<std::thread*> threads_; // All threads
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
std::deque<std::function<void()>> pending_; // Queue of pending work
std::condition_variable empty_; // Signaled on pending_.empty()
bool exiting_ = false;
};
// Notification is an object that allows a user to to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object.
// but only one caller must call Notify() on the object.
class Notification {
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
public:
Notification() : notified_(false) {}
~Notification() {}
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
eigen_assert(((count << 1) >> 1) == count);
}
~Barrier() {
eigen_assert((state_>>1) == 0);
}
void Notify() {
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
if (v != 1) {
eigen_assert(((v + 2) & ~1) != 0);
return; // either count has not dropped to 0, or waiter is not waiting
}
std::unique_lock<std::mutex> l(mu_);
eigen_assert(!notified_);
notified_ = true;
cv_.notify_all();
}
void WaitForNotification() {
void Wait() {
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
if ((v >> 1) == 0) return;
std::unique_lock<std::mutex> l(mu_);
cv_.wait(l, [this]() { return notified_; } );
while (!notified_) {
cv_.wait(l);
}
}
private:
std::mutex mu_;
std::condition_variable cv_;
std::atomic<unsigned int> state_; // low bit is waiter flag
bool notified_;
};
// Notification is an object that allows a user to to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one caller must call Notify() on the object.
struct Notification : Barrier {
Notification() : Barrier(1) {};
};
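
A minimal host-side sketch of the intended Barrier usage, mirroring what the TensorExecutor and FullReducer changes below do with enqueue_with_barrier: each unit of work calls Notify() exactly once, and Wait() blocks until the count reaches zero. Plain std::thread is used here purely for illustration; the EIGEN_USE_THREADS guard and the out-of-tree include path are assumptions:

#define EIGEN_USE_THREADS
#include <iostream>
#include <thread>
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  const unsigned int num_tasks = 4;
  Eigen::Barrier barrier(num_tasks);
  std::vector<int> results(num_tasks, 0);
  std::vector<std::thread> workers;
  for (unsigned int i = 0; i < num_tasks; ++i) {
    workers.emplace_back([i, &results, &barrier]() {
      results[i] = static_cast<int>(i) * 2;  // the actual work
      barrier.Notify();                      // exactly one Notify() per task
    });
  }
  barrier.Wait();  // returns once all num_tasks notifications have arrived
  for (std::thread& w : workers) w.join();
  std::cout << results[0] << " " << results[3] << "\n";  // 0 6
  return 0;
}
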
// Runs an arbitrary function and then calls Notify() on the passed in
// Notification.
template <typename Function, typename... Args> struct FunctionWrapper
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
{
static void run(Notification* n, Function f, Args... args) {
f(args...);
n->Notify();
if (n) {
n->Notify();
}
}
};
static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
{
static void run(Barrier* b, Function f, Args... args) {
f(args...);
if (b) {
b->Notify();
}
}
};
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
if (n) {
n->WaitForNotification();
n->Wait();
}
}
@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
std::function<void()> func =
std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
pool_->Schedule(func);
return n;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
std::function<void()> func = std::bind(
&FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
pool_->Schedule(func);
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
std::function<void()> func = std::bind(f, args...);


@@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
EIGEN_UNUSED_VARIABLE(scalar);
eigen_assert(scalar == NULL);
return m_impl.evalSubExprsIfNeeded(m_buffer);
}


@@ -125,22 +125,18 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
MaxSizeVector<Notification*> results(numblocks);
for (int i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
Barrier barrier(numblocks);
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
}
if (numblocks * blocksize < size) {
EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
}
for (int i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
barrier.Wait();
}
evaluator.cleanup();
}


@@ -25,7 +25,20 @@ struct scalar_mod_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
/** \internal
* \brief Template functor to compute the modulo between 2 arrays.
*/
template <typename Scalar>
struct scalar_mod2_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
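
scalar_mod2_op is an internal functor, so the sketch below applies it explicitly through TensorBase::binaryExpr() just to show what it computes; an integral Scalar is assumed since operator% is not defined for floating-point types, and the include path is the usual out-of-tree one:

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<int, 1> a(5), b(5);
  a.setValues({10, 11, 12, 13, 14});
  b.setValues({3, 3, 5, 5, 7});
  // Elementwise a % b via the internal functor (illustrative use of internal API).
  Eigen::Tensor<int, 1> c = a.binaryExpr(b, Eigen::internal::scalar_mod2_op<int>());
  for (int i = 0; i < 5; ++i) std::cout << c(i) << " ";  // 1 2 2 3 0
  std::cout << "\n";
  return 0;
}
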
/** \internal


@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
template <typename Scalar, typename Device>
struct PacketType {
typedef typename internal::packet_traits<Scalar>::type type;
static const int size = internal::unpacket_traits<type>::size;
enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice


@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
{
public:
typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
@@ -143,7 +142,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
{
public:
typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;


@@ -253,15 +253,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
return;
} else {
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
MaxSizeVector<Notification*> results(numblocks);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
results.push_back(
device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
i * blocksize, blocksize, reducer, &shards[i]));
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
i * blocksize, blocksize, reducer, &shards[i]);
}
typename Self::CoeffReturnType finalShard;
@@ -271,11 +270,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
} else {
finalShard = reducer.initialize();
}
for (Index i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
for (Index i = 0; i < numblocks; ++i) {
barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
@@ -304,15 +300,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
return;
}
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
MaxSizeVector<Notification*> results(numblocks);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]));
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
if (numblocks * blocksize < num_coeffs) {
@@ -322,11 +318,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
finalShard = reducer.initialize();
}
for (Index i = 0; i < numblocks; ++i) {
wait_until_ready(results[i]);
delete results[i];
}
for (Index i = 0; i < numblocks; ++i) {
barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);


@@ -180,11 +180,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompa
inline void glRotate(const Rotation2D<float>& rot)
{
glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
}
inline void glRotate(const Rotation2D<double>& rot)
{
glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
}
template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)


@@ -11,10 +11,6 @@
#ifndef EIGEN_MATRIX_LOGARITHM
#define EIGEN_MATRIX_LOGARITHM
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279503L
#endif
namespace Eigen {
namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
else
{
// computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
}
}


@@ -298,8 +298,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
ComplexScalar logCurr = log(curr);
ComplexScalar logPrev = log(prev);
int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
}


@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_argmax)
ei_add_test(cxx11_tensor_shuffling)
ei_add_test(cxx11_tensor_striding)
ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_ref)
ei_add_test(cxx11_tensor_random)
@@ -175,9 +176,14 @@ endif()
# These tests needs nvcc
find_package(CUDA 7.0)
if(CUDA_FOUND AND EIGEN_TEST_NVCC)
# Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
# in the CUDA runtime
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")


@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// TODO(mdevin): Free the cuda memory.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU


@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
#if __cplusplus > 199711L
// std::erf, std::erfc, and so on where only added in c++11. We use them
// as a golden reference to validate the results produced by Eigen. Therefore
// we can only run these tests if we use a c++11 compiler.
CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_4(test_cuda_digamma<float>());
CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_5(test_cuda_igamma<double>());
CALL_SUBTEST_5(test_cuda_igammac<double>());
#endif
}


@@ -0,0 +1,81 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_USE_THREADS
#include <stdlib.h>
#include "main.h"
#include <Eigen/CXX11/Tensor>
#if EIGEN_OS_WIN || EIGEN_OS_WIN64
#include <windows.h>
void sleep(int seconds) {
Sleep(seconds*1000);
}
#else
#include <unistd.h>
#endif
namespace {
void WaitAndAdd(Eigen::Notification* n, int* counter) {
n->Wait();
*counter = *counter + 1;
}
} // namespace
static void test_notification_single()
{
ThreadPool thread_pool(1);
int counter = 0;
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
sleep(1);
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
// Unblock the thread
n.Notify();
sleep(1);
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
}
// Like test_notification_single() but enqueues multiple threads to
// validate that all threads get notified by Notify().
static void test_notification_multiple()
{
ThreadPool thread_pool(1);
int counter = 0;
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
sleep(1);
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
sleep(1);
VERIFY_IS_EQUAL(counter, 4);
}
void test_cxx11_tensor_notification()
{
CALL_SUBTEST(test_notification_single());
CALL_SUBTEST(test_notification_multiple());
}


@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
gpu_device.deallocate(d_res_float);
}
/*
void test_cuda_contractions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}*/
}
void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
CALL_SUBTEST_1(test_cuda_conversion());
CALL_SUBTEST_1(test_cuda_unary());
CALL_SUBTEST_1(test_cuda_elementwise());
// CALL_SUBTEST_2(test_cuda_contractions());
CALL_SUBTEST_2(test_cuda_contractions());
CALL_SUBTEST_3(test_cuda_reductions());
}
else {


@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
MatrixType scaledA;
RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
if (maxImagPartOfSpectrum >= 0.9 * M_PI)
scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
else
scaledA = A;


@@ -24,7 +24,7 @@ void test2dRotation(double tol)
s = std::sin(angle);
B << c, s, -s, c;
C = Apow(std::ldexp(angle,1) / M_PI);
C = Apow(std::ldexp(angle,1) / EIGEN_PI);
std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n';
VERIFY(C.isApprox(B, tol));
}