mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-15 07:10:37 +08:00
Merged in rmlarsen/eigen (pull request PR-178)
Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions.
This commit is contained in:
commit
eb669f989f
@ -10,9 +10,9 @@
|
|||||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||||
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||||
|
|
||||||
#if !defined(EIGEN_USE_GPU)
|
//#if !defined(EIGEN_USE_GPU)
|
||||||
#define EIGEN_USE_COST_MODEL
|
//#define EIGEN_USE_COST_MODEL
|
||||||
#endif
|
//#endif
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
|
@ -189,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
return loadConstant(m_data+index);
|
return loadConstant(m_data+index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||||
|
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
|
||||||
|
internal::unpacket_traits<PacketReturnType>::size);
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
|
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
|
|||||||
{
|
{
|
||||||
const Index size = array_prod(evaluator.dimensions());
|
const Index size = array_prod(evaluator.dimensions());
|
||||||
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
|
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
|
||||||
|
// Manually unroll this loop since compilers don't do it.
|
||||||
|
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
|
||||||
|
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
|
||||||
|
evaluator.evalPacket(i);
|
||||||
|
evaluator.evalPacket(i+PacketSize);
|
||||||
|
evaluator.evalPacket(i+2*PacketSize);
|
||||||
|
evaluator.evalPacket(i+3*PacketSize);
|
||||||
|
}
|
||||||
const Index VectorizedSize = (size / PacketSize) * PacketSize;
|
const Index VectorizedSize = (size / PacketSize) * PacketSize;
|
||||||
|
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
|
||||||
for (Index i = 0; i < VectorizedSize; i += PacketSize) {
|
|
||||||
evaluator.evalPacket(i);
|
evaluator.evalPacket(i);
|
||||||
}
|
}
|
||||||
for (Index i = VectorizedSize; i < size; ++i) {
|
for (Index i = VectorizedSize; i < size; ++i) {
|
||||||
@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
|
|||||||
#ifdef EIGEN_USE_THREADS
|
#ifdef EIGEN_USE_THREADS
|
||||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||||
struct EvalRange {
|
struct EvalRange {
|
||||||
static void run(Evaluator evaluator, const Index first, const Index last) {
|
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
||||||
eigen_assert(last > first);
|
Evaluator evaluator = *evaluator_in;
|
||||||
|
eigen_assert(last >= first);
|
||||||
for (Index i = first; i < last; ++i) {
|
for (Index i = first; i < last; ++i) {
|
||||||
evaluator.evalScalar(i);
|
evaluator.evalScalar(i);
|
||||||
}
|
}
|
||||||
@ -88,28 +96,34 @@ struct EvalRange {
|
|||||||
|
|
||||||
template <typename Evaluator, typename Index>
|
template <typename Evaluator, typename Index>
|
||||||
struct EvalRange<Evaluator, Index, true> {
|
struct EvalRange<Evaluator, Index, true> {
|
||||||
static void run(Evaluator evaluator, const Index first, const Index last) {
|
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
||||||
eigen_assert(last > first);
|
Evaluator evaluator = *evaluator_in;
|
||||||
|
eigen_assert(last >= first);
|
||||||
Index i = first;
|
Index i = first;
|
||||||
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
||||||
if (last - first >= PacketSize) {
|
if (last - first >= PacketSize) {
|
||||||
eigen_assert(first % PacketSize == 0);
|
eigen_assert(first % PacketSize == 0);
|
||||||
Index lastPacket = last - (last % PacketSize);
|
Index last_chunk_offset = last - 4 * PacketSize;
|
||||||
for (; i < lastPacket; i += PacketSize) {
|
// Manually unroll this loop since compilers don't do it.
|
||||||
|
for (; i <= last_chunk_offset; i += 4*PacketSize) {
|
||||||
|
evaluator.evalPacket(i);
|
||||||
|
evaluator.evalPacket(i+PacketSize);
|
||||||
|
evaluator.evalPacket(i+2*PacketSize);
|
||||||
|
evaluator.evalPacket(i+3*PacketSize);
|
||||||
|
}
|
||||||
|
last_chunk_offset = last - PacketSize;
|
||||||
|
for (; i <= last_chunk_offset; i += PacketSize) {
|
||||||
evaluator.evalPacket(i);
|
evaluator.evalPacket(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (; i < last; ++i) {
|
for (; i < last; ++i) {
|
||||||
evaluator.evalScalar(i);
|
evaluator.evalScalar(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Expression, bool Vectorizable>
|
template <typename Expression, bool Vectorizable>
|
||||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
|
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
typedef typename Expression::Index Index;
|
typedef typename Expression::Index Index;
|
||||||
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
|
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
|
||||||
@ -119,25 +133,35 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
|
|||||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||||
if (needs_assign)
|
if (needs_assign)
|
||||||
{
|
{
|
||||||
|
const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
|
||||||
const Index size = array_prod(evaluator.dimensions());
|
const Index size = array_prod(evaluator.dimensions());
|
||||||
|
int num_threads = device.numThreads();
|
||||||
static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
|
#ifdef EIGEN_USE_COST_MODEL
|
||||||
|
if (num_threads > 1) {
|
||||||
int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
|
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
|
||||||
|
size, evaluator.costPerCoeff(Vectorizable), num_threads);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (num_threads == 1) {
|
||||||
|
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
|
||||||
|
} else {
|
||||||
|
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
|
||||||
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
|
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
|
||||||
const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
|
const Index numblocks = size / blocksize;
|
||||||
|
|
||||||
Barrier barrier(numblocks);
|
Barrier barrier(numblocks);
|
||||||
for (unsigned int i = 0; i < numblocks; ++i) {
|
for (int i = 0; i < numblocks; ++i) {
|
||||||
device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
|
device.enqueue_with_barrier(
|
||||||
|
&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
|
||||||
|
&evaluator, i * blocksize, (i + 1) * blocksize);
|
||||||
}
|
}
|
||||||
|
if (numblocks * blocksize < size) {
|
||||||
if (static_cast<Index>(numblocks) * blocksize < size) {
|
EvalRange<Evaluator, Index, Vectorizable>::run(
|
||||||
EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
|
&evaluator, numblocks * blocksize, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier.Wait();
|
barrier.Wait();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -226,7 +250,6 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
|||||||
#endif // __CUDACC__
|
#endif // __CUDACC__
|
||||||
#endif // EIGEN_USE_GPU
|
#endif // EIGEN_USE_GPU
|
||||||
|
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -214,7 +214,7 @@ struct FullReducer {
|
|||||||
|
|
||||||
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
|
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
|
||||||
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
|
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||||
*output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
|
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -222,18 +222,19 @@ struct FullReducer {
|
|||||||
#ifdef EIGEN_USE_THREADS
|
#ifdef EIGEN_USE_THREADS
|
||||||
// Multithreaded full reducers
|
// Multithreaded full reducers
|
||||||
template <typename Self, typename Op,
|
template <typename Self, typename Op,
|
||||||
bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
|
||||||
struct FullReducerShard {
|
struct FullReducerShard {
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
|
||||||
typename Self::Index numValuesToReduce, Op& reducer,
|
typename Self::Index numValuesToReduce, Op& reducer,
|
||||||
typename Self::CoeffReturnType* output) {
|
typename Self::CoeffReturnType* output) {
|
||||||
*output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
|
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
|
||||||
self, firstIndex, numValuesToReduce, reducer);
|
self, firstIndex, numValuesToReduce, reducer);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename Self, typename Op>
|
// Multithreaded full reducer
|
||||||
struct FullReducer<Self, Op, ThreadPoolDevice, false> {
|
template <typename Self, typename Op, bool Vectorizable>
|
||||||
|
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
|
||||||
static const bool HasOptimizedImplementation = !Op::IsStateful;
|
static const bool HasOptimizedImplementation = !Op::IsStateful;
|
||||||
static const int PacketSize =
|
static const int PacketSize =
|
||||||
unpacket_traits<typename Self::PacketReturnType>::size;
|
unpacket_traits<typename Self::PacketReturnType>::size;
|
||||||
@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
|
|||||||
*output = reducer.finalize(reducer.initialize());
|
*output = reducer.finalize(reducer.initialize());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const std::size_t num_threads = device.numThreads();
|
#ifdef EIGEN_USE_COST_MODEL
|
||||||
|
const TensorOpCost cost =
|
||||||
|
self.m_impl.costPerCoeff(Vectorizable) +
|
||||||
|
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
|
||||||
|
PacketSize);
|
||||||
|
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
|
||||||
|
num_coeffs, cost, device.numThreads());
|
||||||
|
#else
|
||||||
|
const int num_threads = device.numThreads();
|
||||||
|
#endif
|
||||||
if (num_threads == 1) {
|
if (num_threads == 1) {
|
||||||
*output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
|
*output =
|
||||||
|
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
|
||||||
return;
|
return;
|
||||||
} else {
|
}
|
||||||
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
|
const Index blocksize =
|
||||||
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
|
std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
|
||||||
eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
|
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
|
||||||
|
eigen_assert(num_coeffs >= numblocks * blocksize);
|
||||||
|
|
||||||
Barrier barrier(numblocks);
|
Barrier barrier(numblocks);
|
||||||
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
|
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
|
||||||
for (unsigned int i = 0; i < numblocks; ++i) {
|
for (Index i = 0; i < numblocks; ++i) {
|
||||||
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
|
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
|
||||||
i * blocksize, blocksize, reducer, &shards[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
typename Self::CoeffReturnType finalShard;
|
|
||||||
if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
|
|
||||||
finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
|
|
||||||
self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
|
|
||||||
} else {
|
|
||||||
finalShard = reducer.initialize();
|
|
||||||
}
|
|
||||||
barrier.Wait();
|
|
||||||
for (unsigned int i = 0; i < numblocks; ++i) {
|
|
||||||
reducer.reduce(shards[i], &finalShard);
|
|
||||||
}
|
|
||||||
*output = reducer.finalize(finalShard);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Self, typename Op>
|
|
||||||
struct FullReducer<Self, Op, ThreadPoolDevice, true> {
|
|
||||||
static const bool HasOptimizedImplementation = !Op::IsStateful;
|
|
||||||
static const int PacketSize =
|
|
||||||
unpacket_traits<typename Self::PacketReturnType>::size;
|
|
||||||
|
|
||||||
// launch one reducer per thread and accumulate the result.
|
|
||||||
static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
|
|
||||||
typename Self::CoeffReturnType* output) {
|
|
||||||
typedef typename Self::Index Index;
|
|
||||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
|
||||||
if (num_coeffs == 0) {
|
|
||||||
*output = reducer.finalize(reducer.initialize());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const std::size_t num_threads = device.numThreads();
|
|
||||||
if (num_threads == 1) {
|
|
||||||
*output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
|
|
||||||
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
|
|
||||||
eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
|
|
||||||
|
|
||||||
Barrier barrier(numblocks);
|
|
||||||
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
|
|
||||||
for (unsigned int i = 0; i < numblocks; ++i) {
|
|
||||||
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
|
|
||||||
self, i * blocksize, blocksize, reducer,
|
self, i * blocksize, blocksize, reducer,
|
||||||
&shards[i]);
|
&shards[i]);
|
||||||
}
|
}
|
||||||
typename Self::CoeffReturnType finalShard;
|
typename Self::CoeffReturnType finalShard;
|
||||||
if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
|
if (numblocks * blocksize < num_coeffs) {
|
||||||
finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
|
finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
|
||||||
self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
|
self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
|
||||||
|
reducer);
|
||||||
} else {
|
} else {
|
||||||
finalShard = reducer.initialize();
|
finalShard = reducer.initialize();
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier.Wait();
|
barrier.Wait();
|
||||||
for (unsigned int i = 0; i < numblocks; ++i) {
|
|
||||||
|
for (Index i = 0; i < numblocks; ++i) {
|
||||||
reducer.reduce(shards[i], &finalShard);
|
reducer.reduce(shards[i], &finalShard);
|
||||||
}
|
}
|
||||||
*output = reducer.finalize(finalShard);
|
*output = reducer.finalize(finalShard);
|
||||||
@ -498,13 +464,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
|
static bool size_large_enough(Index total_size) {
|
||||||
|
#ifndef EIGEN_USE_COST_MODEL
|
||||||
|
return total_size > 1024 * 1024;
|
||||||
|
#else
|
||||||
|
return true || total_size;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
||||||
m_impl.evalSubExprsIfNeeded(NULL);
|
m_impl.evalSubExprsIfNeeded(NULL);
|
||||||
|
|
||||||
// Use the FullReducer if possible.
|
// Use the FullReducer if possible.
|
||||||
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
|
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
|
||||||
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
|
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
|
||||||
(!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
|
(!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
|
||||||
|
|
||||||
bool need_assign = false;
|
bool need_assign = false;
|
||||||
if (!data) {
|
if (!data) {
|
||||||
|
Loading…
Reference in New Issue
Block a user