From 100d7caf920920657c5c559e6b1e0a322d0d1f98 Mon Sep 17 00:00:00 2001
From: Ben Barsdell
Date: Mon, 18 Oct 2021 20:58:14 +1100
Subject: [PATCH] Avoid integer overflow in EigenMetaKernel indexing

- The current implementation computes `size + total_threads`, which can
  overflow and cause CUDA_ERROR_ILLEGAL_ADDRESS when size is close to
  the maximum representable value.
- The num_blocks calculation can also overflow due to the implementation
  of divup().
- This patch prevents these overflows and allows the kernel to work
  correctly for the full representable range of tensor sizes.
- Also adds relevant tests.
---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 42 ++++++++++++++---
 .../Eigen/CXX11/src/Tensor/TensorMeta.h     |  6 ++-
 unsupported/test/cxx11_tensor_gpu.cu        | 45 +++++++++++++++++++
 3 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 279be346e..9a45bc01f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,11 +553,39 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 };
 
 #if defined(EIGEN_GPUCC)
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  if (lhs > 0 && rhs > 0) {
+    return (lhs > highest - rhs) ? highest : lhs + rhs;
+  } else if (lhs < 0 && rhs < 0) {
+    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
+  } else {
+    return lhs + rhs;
+  }
+}
+
+#if !defined(EIGEN_USE_HIP)
+// Specialization for int32 using PTX intrinsic.
+template <>
+__device__ EIGEN_ALWAYS_INLINE int32_t saturate_add<int32_t>(int32_t lhs,
+                                                             int32_t rhs) {
+  // add.sat is only supported for s32.
+  int32_t result;
+  asm("add.sat.s32 %0, %1, %2;" : "=r"(result) : "r"(lhs), "r"(rhs));
+  return result;
+}
+#endif
+
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx;
+         i = saturate_add(i, step_size)) {
       eval.evalScalar(i);
     }
   }
@@ -573,10 +601,11 @@ struct EigenMetaKernelEval<Evaluator, StorageIndex, /*Vectorizable*/true> {
 
     // Use the vector path
     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
+         i = saturate_add(i, vectorized_step_size)) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
+         i = saturate_add(i, step_size)) {
       eval.evalScalar(i);
     }
   }
@@ -603,8 +632,11 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
   if (needs_assign) {
 
     const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int max_blocks =
+        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
+                                  device.maxGpuThreadsPerMultiProcessor(),
+                              NumTraits<StorageIndex>::highest()) /
+        block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b3f4a1ce9..cf891ebd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,13 +30,15 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0a37c0293..83b150d9c 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,6 +66,47 @@ void test_gpu_nullary() {
   gpuFree(d_in2);
 }
 
+// Tests that there are no indexing overflows when computing tensors with the
+// max representable size.
+template <typename IndexType,
+          IndexType N = (std::numeric_limits<IndexType>::max)()>
+void test_gpu_nullary_max_size()
+{
+  typedef int8_t DataType;
+  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
+  typedef Eigen::array<IndexType, 1> ArrayType;
+
+  const IndexType n = N;
+  TensorType in1((ArrayType(n)));
+  in1.setZero();
+
+  std::size_t in1_bytes = in1.size() * sizeof(DataType);
+
+  DataType* d_in1;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
+
+  TensorType new1((ArrayType(n)));
+
+  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
+                        gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (IndexType i = 0; i < n; ++i) {
+    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
+  }
+
+  gpuFree(d_in1);
+}
+
 void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1524,6 +1565,10 @@ void test_gpu_gamma_sample_der_alpha()
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
   CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
+  CALL_SUBTEST_1((test_gpu_nullary_max_size<
+      int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
   CALL_SUBTEST_1(test_gpu_elementwise_small());
   CALL_SUBTEST_1(test_gpu_elementwise());
   CALL_SUBTEST_1(test_gpu_props());
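
For illustration, a minimal standalone host-side sketch of the two overflow hazards the
patch addresses and the saturating-add remedy. It mirrors the generic saturate_add and
the new divup form from the hunks above, but the names saturate_add_sketch, divup_old,
divup_new and the chosen step value are hypothetical stand-ins, not part of the patch.

// Illustrative sketch only (not part of the patch).
#include <cstdint>
#include <iostream>
#include <limits>

// Mirrors the generic (non-PTX) saturate_add added in TensorExecutor.h.
template <typename Index>
Index saturate_add_sketch(Index lhs, Index rhs) {
  const Index highest = std::numeric_limits<Index>::max();
  const Index lowest  = std::numeric_limits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return (lhs > highest - rhs) ? highest : lhs + rhs;
  } else if (lhs < 0 && rhs < 0) {
    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
  }
  return lhs + rhs;
}

// Old divup: (x + y - 1) overflows when x is near the maximum (not called here).
// New divup: never forms a value larger than x, so it cannot overflow.
int32_t divup_old(int32_t x, int32_t y) { return (x + y - 1) / y; }
int32_t divup_new(int32_t x, int32_t y) { return x == 0 ? 0 : (x - 1) / y + 1; }

int main() {
  const int32_t size = std::numeric_limits<int32_t>::max();  // lastIdx for a max-size tensor
  const int32_t step = 1 << 20;  // assumed total_threads (blockDim.x * gridDim.x)

  // Pre-patch index update: `i += step` past INT32_MAX is signed overflow and in
  // practice wraps negative, so the grid-stride loop keeps running and touches
  // out-of-bounds addresses. The saturating update pins at INT32_MAX and the loop
  // exits; no valid index is skipped, because a skipped index would already have
  // exceeded `size`.
  int32_t i = size - 100;
  i = saturate_add_sketch(i, step);
  std::cout << "saturated index: " << i << "\n";  // prints 2147483647

  // Block-count arithmetic: divup_old(size, 1024) would overflow here; the new
  // form stays in range.
  std::cout << "divup_new(size, 1024) = " << divup_new(size, 1024) << "\n";
  return 0;
}

Inside the kernel itself the patch gets the same clamping behavior for int32_t from the
add.sat.s32 PTX instruction, avoiding the extra comparisons of the generic template.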