Made it possible to limit the number of blocks that will be used to evaluate a tensor expression on a CUDA device. This makes it possible to set aside streaming multiprocessors for other computations.
parent 264f8141f8
commit 6b5dff875e
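For context, here is a minimal usage sketch of the API this commit introduces. Only the GpuDevice(stream, num_blocks) constructor and the maxBlocks() accessor come from the diff below; the stream setup, tensor shapes, and the block budget of 4 are illustrative assumptions.

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    // Doubles a device-resident buffer through the Tensor expression API,
    // passing a block budget that maxBlocks() will report to the executor
    // so the remaining multiprocessors stay free for concurrent kernels.
    void scale_on_gpu(float* gpu_out, float* gpu_in, int n) {
      Eigen::CudaStreamDevice stream;                     // wraps the default CUDA stream
      Eigen::GpuDevice device(&stream, /*num_blocks=*/4); // constructor added by this commit

      Eigen::TensorMap<Eigen::Tensor<float, 1>> in(gpu_in, n);
      Eigen::TensorMap<Eigen::Tensor<float, 1>> out(gpu_out, n);

      // TensorExecutor reads device.maxBlocks() when sizing the launch grid.
      out.device(device) = in * 2.0f;
    }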
@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
   // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
     eigen_assert(stream);
   }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }

   // TODO(bsteiner): This is an internal API, we should not expose it.
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
     return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
 #endif
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {

  private:
   const StreamInterface* stream_;
-
+  int max_blocks_;
 };

 #ifndef __CUDA_ARCH__
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
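To make the launch-size arithmetic in these two hunks concrete, here is the same computation spelled out with illustrative device figures; the numbers are assumptions for the sake of the walkthrough, not queried from real hardware, and std::min/std::max stand in for numext::mini/numext::maxi.

    #include <algorithm>
    #include <climits>
    #include <cstdio>

    int main() {
      const int block_size     = 1024;    // stand-in for device.maxCudaThreadsPerBlock()
      const int num_sms        = 16;      // stand-in for device.getNumCudaMultiProcessors()
      const int threads_per_sm = 2048;    // stand-in for device.maxCudaThreadsPerMultiProcessor()
      const int device_blocks  = INT_MAX; // device.maxBlocks() from the default constructor

      // As in the patched executor: 16 * 2048 / 1024 = 32 occupancy-derived blocks,
      // folded together with the device-level block setting.
      const int max_blocks = std::max(device_blocks, num_sms * threads_per_sm / block_size);

      // One thread per coefficient, rounded up to whole blocks, clamped to
      // max_blocks and to at least 1 so a size-0 tensor still gets a valid grid.
      const int size = 10000; // stand-in for array_prod(evaluator.dimensions())
      const int num_blocks = std::max(std::min(max_blocks, (size + block_size - 1) / block_size), 1);

      std::printf("num_blocks = %d\n", num_blocks); // (10000 + 1023) / 1024 = 10
      return 0;
    }

With the default INT_MAX budget the grid size is bounded only by the coefficient count; a GpuDevice constructed with an explicit num_blocks feeds that value into the same maxi instead.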