From 100d7caf920920657c5c559e6b1e0a322d0d1f98 Mon Sep 17 00:00:00 2001
From: Ben Barsdell
Date: Mon, 18 Oct 2021 20:58:14 +1100
Subject: [PATCH] Avoid integer overflow in EigenMetaKernel indexing

- The current implementation computes `size + total_threads`, which can
  overflow and cause CUDA_ERROR_ILLEGAL_ADDRESS when size is close to
  the maximum representable value.
- The num_blocks calculation can also overflow due to the implementation
  of divup().
- This patch prevents these overflows and allows the kernel to work
  correctly for the full representable range of tensor sizes.
- Also adds relevant tests.
---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 42 ++++++++++++++---
 .../Eigen/CXX11/src/Tensor/TensorMeta.h     |  6 ++-
 unsupported/test/cxx11_tensor_gpu.cu        | 45 +++++++++++++++++++
 3 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 279be346e..9a45bc01f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,11 +553,39 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 };
 
 #if defined(EIGEN_GPUCC)
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  if (lhs > 0 && rhs > 0) {
+    return (lhs > highest - rhs) ? highest : lhs + rhs;
+  } else if (lhs < 0 && rhs < 0) {
+    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
+  } else {
+    return lhs + rhs;
+  }
+}
+
+#if !defined(EIGEN_USE_HIP)
+// Specialization for int32 using PTX intrinsic.
+template <>
+__device__ EIGEN_ALWAYS_INLINE int32_t saturate_add<int32_t>(int32_t lhs,
+                                                             int32_t rhs) {
+  // add.sat is only supported for s32.
+  int32_t result;
+  asm("add.sat.s32 %0, %1, %2;" : "=r"(result) : "r"(lhs), "r"(rhs));
+  return result;
+}
+#endif
+
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx;
+         i = saturate_add(i, step_size)) {
       eval.evalScalar(i);
     }
   }
@@ -573,10 +601,11 @@ struct EigenMetaKernelEval<Evaluator, StorageIndex, /*Vectorizable*/true> {
 
     // Use the vector path
     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
+         i = saturate_add(i, vectorized_step_size)) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
+         i = saturate_add(i, step_size)) {
       eval.evalScalar(i);
     }
   }
@@ -603,8 +632,11 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
   if (needs_assign) {
 
     const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int max_blocks =
+        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
+                                  device.maxGpuThreadsPerMultiProcessor(),
+                              NumTraits<StorageIndex>::highest()) /
+        block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b3f4a1ce9..cf891ebd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,13 +30,15 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0a37c0293..83b150d9c 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,6 +66,47 @@ void test_gpu_nullary() {
   gpuFree(d_in2);
 }
 
+// Tests that there are no indexing overflows when computing tensors with the
+// max representable size.
+template <typename IndexType,
+          IndexType N = (std::numeric_limits<IndexType>::max)()>
+void test_gpu_nullary_max_size()
+{
+  typedef int8_t DataType;
+  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
+  typedef Eigen::array<IndexType, 1> ArrayType;
+
+  const IndexType n = N;
+  TensorType in1((ArrayType(n)));
+  in1.setZero();
+
+  std::size_t in1_bytes = in1.size() * sizeof(DataType);
+
+  DataType* d_in1;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
+
+  TensorType new1((ArrayType(n)));
+
+  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
+                        gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (IndexType i = 0; i < n; ++i) {
+    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
+  }
+
+  gpuFree(d_in1);
+}
+
 void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1524,6 +1565,10 @@ void test_gpu_gamma_sample_der_alpha()
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
   CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
+  CALL_SUBTEST_1((test_gpu_nullary_max_size<
+      int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
   CALL_SUBTEST_1(test_gpu_elementwise_small());
   CALL_SUBTEST_1(test_gpu_elementwise());
   CALL_SUBTEST_1(test_gpu_props());
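
For illustration, a minimal standalone host-side sketch of the two overflow hazards the
patch addresses and the saturating-add remedy. It mirrors the generic saturate_add and
the new divup form from the hunks above, but the names saturate_add_sketch, divup_old,
divup_new and the chosen step value are hypothetical stand-ins, not part of the patch.

// Illustrative sketch only (not part of the patch).
#include <cstdint>
#include <iostream>
#include <limits>

// Mirrors the generic (non-PTX) saturate_add added in TensorExecutor.h.
template <typename Index>
Index saturate_add_sketch(Index lhs, Index rhs) {
  const Index highest = std::numeric_limits<Index>::max();
  const Index lowest  = std::numeric_limits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return (lhs > highest - rhs) ? highest : lhs + rhs;
  } else if (lhs < 0 && rhs < 0) {
    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
  }
  return lhs + rhs;
}

// Old divup: (x + y - 1) overflows when x is near the maximum (not called here).
// New divup: never forms a value larger than x, so it cannot overflow.
int32_t divup_old(int32_t x, int32_t y) { return (x + y - 1) / y; }
int32_t divup_new(int32_t x, int32_t y) { return x == 0 ? 0 : (x - 1) / y + 1; }

int main() {
  const int32_t size = std::numeric_limits<int32_t>::max();  // lastIdx for a max-size tensor
  const int32_t step = 1 << 20;  // assumed total_threads (blockDim.x * gridDim.x)

  // Pre-patch index update: `i += step` past INT32_MAX is signed overflow and in
  // practice wraps negative, so the grid-stride loop keeps running and touches
  // out-of-bounds addresses. The saturating update pins at INT32_MAX and the loop
  // exits; no valid index is skipped, because a skipped index would already have
  // exceeded `size`.
  int32_t i = size - 100;
  i = saturate_add_sketch(i, step);
  std::cout << "saturated index: " << i << "\n";  // prints 2147483647

  // Block-count arithmetic: divup_old(size, 1024) would overflow here; the new
  // form stays in range.
  std::cout << "divup_new(size, 1024) = " << divup_new(size, 1024) << "\n";
  return 0;
}

Inside the kernel itself the patch gets the same clamping behavior for int32_t from the
add.sat.s32 PTX instruction, avoiding the extra comparisons of the generic template.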