Mirror of https://gitlab.com/libeigen/eigen.git
Don't crash when attempting to reduce empty tensors.
commit 2dde1b1028
parent a792cd357d
@@ -238,7 +238,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
                          device.maxCudaThreadsPerMultiProcessor() / block_size;
   const Index size = array_prod(evaluator.dimensions());
   // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
   LAUNCH_CUDA_KERNEL(
       (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
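For context, the clamp around the new divup call is what keeps the launch configuration valid: the block count is capped at max_blocks and floored at 1, so even a zero-sized tensor launches a single (trivial) block rather than an invalid zero-block grid. A minimal standalone sketch of that clamping logic in plain C++, using std::min/std::max in place of Eigen's numext::mini/numext::maxi and illustrative values for block_size and max_blocks:

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for Eigen's divup<int>: ceiling division for non-negative ints.
int divup_int(int x, int y) { return (x + y - 1) / y; }

int main() {
  const int block_size = 1024;  // illustrative, e.g. what maxCudaThreadsPerBlock() might return
  const int max_blocks = 120;   // illustrative: SM count * threads per SM / block_size

  const int sizes[] = {0, 1, 1024, 1000000};
  for (int size : sizes) {
    // Clamp between 1 and max_blocks: at least one block even for an empty tensor.
    const int num_blocks =
        std::max(std::min(max_blocks, divup_int(size, block_size)), 1);
    std::printf("size=%d -> num_blocks=%d\n", size, num_blocks);
  }
  return 0;
}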
@@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
   return second;
 }
 
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+  return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return (x + y - 1) / y;
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <size_t n> struct max_n_1 {
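The divup helper introduced above is plain ceiling division for positive integers; the static_cast and the separate <T, X, Y> overload let callers mix operand types (for example a 64-bit coefficient count and an int divisor) while choosing the result type explicitly. A small self-contained sketch, with divup_sketch as a hypothetical stand-in for the helper added in this hunk:

#include <cassert>
#include <cstdint>

// Sketch of the divup idiom added above: ceiling division for positive integers.
// The <T, X, Y> overload lets callers mix operand types (e.g. a 64-bit Index
// numerator with an int divisor) and pick the result type explicitly.
template <typename T, typename X, typename Y>
T divup_sketch(const X x, const Y y) {
  return static_cast<T>((x + y - 1) / y);
}

int main() {
  assert(divup_sketch<int>(0, 256) == 0);    // empty input -> zero blocks before any clamping
  assert(divup_sketch<int>(1, 256) == 1);
  assert(divup_sketch<int>(256, 256) == 1);
  assert(divup_sketch<int>(257, 256) == 2);

  // A 64-bit coefficient count narrowed to an int block count via the cast.
  const std::int64_t num_coeffs = 1 << 20;   // 1048576 coefficients
  assert(divup_sketch<int>(num_coeffs, 256 * 128) == 32);
  return 0;
}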
@@ -134,9 +134,14 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   typedef typename Self::Index Index;
 
   const Index num_coeffs = array_prod(self.m_impl.dimensions());
+  // Don't crash when we're called with an input tensor of size 0.
+  if (num_coeffs == 0) {
+    return;
+  }
+
   const int block_size = 256;
   const int num_per_thread = 128;
-  const int num_blocks = numext::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
 
   if (num_blocks > 1) {
     // We initialize the outputs outside the reduction kernel when we can't be sure that there
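The guard added inside FullReducer above returns before any launch parameters are computed, so a tensor with a zero dimension never reaches the kernel launch; the float-based numext::ceil is also replaced by the integer divup. The following is an illustrative plain-C++ sketch of that control flow (run_full_reduction_sketch and its printed messages are invented for illustration and are not Eigen API):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Illustrative stand-in for the guard added above: bail out before computing
// any launch parameters when the input tensor is empty.
void run_full_reduction_sketch(const std::vector<int>& dimensions) {
  const long long num_coeffs = std::accumulate(
      dimensions.begin(), dimensions.end(), 1LL, std::multiplies<long long>());

  // Don't attempt a reduction over zero coefficients.
  if (num_coeffs == 0) {
    std::puts("empty tensor: nothing to reduce");
    return;
  }

  const int block_size = 256;
  const int num_per_thread = 128;
  // Same ceiling division the patch expresses as divup<int>(num_coeffs, block_size * num_per_thread).
  const int num_blocks = static_cast<int>(
      (num_coeffs + block_size * num_per_thread - 1) / (block_size * num_per_thread));
  std::printf("would launch %d block(s)\n", num_blocks);
}

int main() {
  run_full_reduction_sketch({4, 0, 8});   // a zero dimension makes the tensor empty
  run_full_reduction_sketch({64, 1024});  // 65536 coefficients -> 2 blocks
  return 0;
}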