Made it possible to limit the number of blocks that will be used to evaluate a tensor expression on a CUDA device. This makes it possible to set aside streaming multiprocessors for other computations.

Benoit Steiner 2016-02-01 12:46:32 -08:00
parent 264f8141f8
commit 6b5dff875e
2 changed files with 11 additions and 5 deletions

@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
   // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
     eigen_assert(stream);
   }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
   // TODO(bsteiner): This is an internal API, we should not expose it.
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
     return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
 #endif
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {
  private:
   const StreamInterface* stream_;
+  int max_blocks_;
 };
 #ifndef __CUDA_ARCH__
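Not part of the commit, but for context, here is a minimal usage sketch of the new two-argument constructor. The block cap of 32 and the tensor size are arbitrary choices for illustration; the rest follows the Tensor module's usual CudaStreamDevice / TensorMap workflow (compile with nvcc and EIGEN_USE_GPU).

#define EIGEN_USE_GPU
#include <cuda_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Wraps the default CUDA stream; the caller owns the StreamInterface
  // and must keep it alive for the lifetime of the GpuDevice.
  Eigen::CudaStreamDevice stream;
  // New in this commit: cap expression evaluation at 32 blocks, leaving
  // the remaining multiprocessors free for concurrent work.
  Eigen::GpuDevice device(&stream, /*num_blocks=*/32);

  const int n = 1 << 20;
  float* d_in;
  float* d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, n);

  // The executor now launches at most 32 blocks for this expression.
  out.device(device) = in * 2.0f + 1.0f;
  device.synchronize();

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}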

@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
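To see what the new cap does to the launch configuration, a standalone sketch of the same arithmetic (the device figures are assumed for illustration, not queried from hardware, and std::min/std::max stand in for numext::mini/numext::maxi):

#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative device figures (assumptions, not part of the commit):
  const int num_sms        = 16;       // device.getNumCudaMultiProcessors()
  const int threads_per_sm = 2048;     // device.maxCudaThreadsPerMultiProcessor()
  const int block_size     = 1024;     // device.maxCudaThreadsPerBlock()
  const int device_cap     = 8;        // device.maxBlocks(), set via GpuDevice(&stream, 8)
  const long size          = 1 << 20;  // elements in the expression

  // Occupancy-based limit, as before the commit: 16 * 2048 / 1024 = 32.
  const int occupancy_limit = num_sms * threads_per_sm / block_size;
  // New: the user-supplied cap can only lower the limit, here to 8.
  const int max_blocks = std::min(device_cap, occupancy_limit);
  // Blocks needed to give every element its own thread: 1024.
  const int blocks_needed = static_cast<int>((size + block_size - 1) / block_size);
  // Final grid size, clamped to at least one block: 8.
  const int num_blocks = std::max(std::min(max_blocks, blocks_needed), 1);

  std::printf("grid: %d blocks x %d threads\n", num_blocks, block_size);
  return 0;
}

Launching fewer blocks than the element count requires is safe here because Eigen's CUDA evaluation kernel iterates with a grid-stride loop, so the 8 blocks in this example still cover all of the work that 1024 blocks would have.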