Made it possible to limit the number of blocks that will be used to evaluate a tensor expression on a CUDA device. This makes it possible to set aside streaming multiprocessors for other computations.

Benoit Steiner 2016-02-01 12:46:32 -08:00
parent 264f8141f8
commit 6b5dff875e
2 changed files with 11 additions and 5 deletions

@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
   // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
     eigen_assert(stream);
   }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
   // TODO(bsteiner): This is an internal API, we should not expose it.
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
     return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
 #endif
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {
  private:
   const StreamInterface* stream_;
+  int max_blocks_;
 };
 #ifndef __CUDA_ARCH__
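Not part of the commit, but for context, here is a minimal usage sketch of the new two-argument constructor. The block cap of 32 and the tensor size are arbitrary choices for illustration; the rest follows the Tensor module's usual CudaStreamDevice / TensorMap workflow (compile with nvcc and EIGEN_USE_GPU).

#define EIGEN_USE_GPU
#include <cuda_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Wraps the default CUDA stream; the caller owns the StreamInterface
  // and must keep it alive for the lifetime of the GpuDevice.
  Eigen::CudaStreamDevice stream;
  // New in this commit: cap expression evaluation at 32 blocks, leaving
  // the remaining multiprocessors free for concurrent work.
  Eigen::GpuDevice device(&stream, /*num_blocks=*/32);

  const int n = 1 << 20;
  float* d_in;
  float* d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, n);

  // The executor now launches at most 32 blocks for this expression.
  out.device(device) = in * 2.0f + 1.0f;
  device.synchronize();

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}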

@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
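To see what the new cap does to the launch configuration, a standalone sketch of the same arithmetic (the device figures are assumed for illustration, not queried from hardware, and std::min/std::max stand in for numext::mini/numext::maxi):

#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative device figures (assumptions, not part of the commit):
  const int num_sms        = 16;       // device.getNumCudaMultiProcessors()
  const int threads_per_sm = 2048;     // device.maxCudaThreadsPerMultiProcessor()
  const int block_size     = 1024;     // device.maxCudaThreadsPerBlock()
  const int device_cap     = 8;        // device.maxBlocks(), set via GpuDevice(&stream, 8)
  const long size          = 1 << 20;  // elements in the expression

  // Occupancy-based limit, as before the commit: 16 * 2048 / 1024 = 32.
  const int occupancy_limit = num_sms * threads_per_sm / block_size;
  // New: the user-supplied cap can only lower the limit, here to 8.
  const int max_blocks = std::min(device_cap, occupancy_limit);
  // Blocks needed to give every element its own thread: 1024.
  const int blocks_needed = static_cast<int>((size + block_size - 1) / block_size);
  // Final grid size, clamped to at least one block: 8.
  const int num_blocks = std::max(std::min(max_blocks, blocks_needed), 1);

  std::printf("grid: %d blocks x %d threads\n", num_blocks, block_size);
  return 0;
}

Launching fewer blocks than the element count requires is safe here because Eigen's CUDA evaluation kernel iterates with a grid-stride loop, so the 8 blocks in this example still cover all of the work that 1024 blocks would have.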