diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 9a45bc01f..279be346e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,39 +553,11 @@ class TensorExecutor {
 };
 
 #if defined(EIGEN_GPUCC)
-// Returns lhs + rhs, saturating to the highest/lowest representable value on
-// overflow/underflow respectively.
-template <typename Index>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
-  const Index highest = NumTraits<Index>::highest();
-  const Index lowest = NumTraits<Index>::lowest();
-  if (lhs > 0 && rhs > 0) {
-    return (lhs > highest - rhs) ? highest : lhs + rhs;
-  } else if (lhs < 0 && rhs < 0) {
-    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
-  } else {
-    return lhs + rhs;
-  }
-}
-
-#if !defined(EIGEN_USE_HIP)
-// Specialization for int32 using PTX intrinsic.
-template <>
-__device__ EIGEN_ALWAYS_INLINE int32_t saturate_add<int32_t>(int32_t lhs,
-                                                             int32_t rhs) {
-  // add.sat is only supported for s32.
-  int32_t result;
-  asm("add.sat.s32 %0, %1, %2;" : "=r"(result) : "r"(lhs), "r"(rhs));
-  return result;
-}
-#endif
-
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx;
-         i = saturate_add(i, step_size)) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
@@ -601,11 +573,10 @@ struct EigenMetaKernelEval {
 
     // Use the vector path
     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i = saturate_add(i, vectorized_step_size)) {
+         i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
-         i = saturate_add(i, step_size)) {
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
@@ -632,11 +603,8 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
   if (needs_assign) {
 
     const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks =
-        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
-                                  device.maxGpuThreadsPerMultiProcessor(),
-                              NumTraits<StorageIndex>::highest()) /
-        block_size;
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index cf891ebd3..b3f4a1ce9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,15 +30,13 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 83b150d9c..0a37c0293 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,47 +66,6 @@ void test_gpu_nullary() {
   gpuFree(d_in2);
 }
 
-// Tests that there are no indexing overflows when computing tensors with the
-// max representable size.
-template <typename IndexType,
-          IndexType N = (std::numeric_limits<IndexType>::max)()>
-void test_gpu_nullary_max_size()
-{
-  typedef int8_t DataType;
-  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
-  typedef Eigen::array<IndexType, 1> ArrayType;
-
-  const IndexType n = N;
-  TensorType in1((ArrayType(n)));
-  in1.setZero();
-
-  std::size_t in1_bytes = in1.size() * sizeof(DataType);
-
-  DataType* d_in1;
-  gpuMalloc((void**)(&d_in1), in1_bytes);
-
-  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
-
-  Eigen::GpuStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
-
-  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
-
-  TensorType new1((ArrayType(n)));
-
-  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
-                        gpu_device.stream()) == gpuSuccess);
-  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
-
-  for (IndexType i = 0; i < n; ++i) {
-    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
-  }
-
-  gpuFree(d_in1);
-}
-
 void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1565,10 +1524,6 @@ void test_gpu_gamma_sample_der_alpha()
 
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu) {
   CALL_SUBTEST_1(test_gpu_nullary());
-  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
-  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
-  CALL_SUBTEST_1((test_gpu_nullary_max_size<
-      int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
  CALL_SUBTEST_1(test_gpu_elementwise_small());
  CALL_SUBTEST_1(test_gpu_elementwise());
  CALL_SUBTEST_1(test_gpu_props());
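
A note on the divup change above (not part of the patch itself): the form being removed, static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1), never forms the intermediate sum x + y and therefore cannot overflow, which is what the deleted comment refers to; the restored (x + y - 1) / y can wrap when x is close to the maximum of its type. The short host-side sketch below illustrates the difference. The function names and test values are illustrative only and are not Eigen API; unsigned arithmetic is used so that the wrap-around is well defined rather than undefined behaviour.

#include <cstdint>
#include <iostream>
#include <limits>

// Ceiling division without forming x + y: the form removed by the patch above.
template <typename T>
T divup_no_overflow(T x, T y) {
  return x == 0 ? 0 : (x - 1) / y + 1;
}

// Ceiling division via (x + y - 1) / y: the form restored by the patch above.
// The sum x + y - 1 wraps for unsigned T when x is near its maximum.
template <typename T>
T divup_simple(T x, T y) {
  return (x + y - 1) / y;
}

int main() {
  const std::uint32_t x = std::numeric_limits<std::uint32_t>::max() - 1;  // 4294967294
  const std::uint32_t y = 16;
  std::cout << divup_no_overflow(x, y) << "\n";  // 268435456, the correct ceiling
  std::cout << divup_simple(x, y) << "\n";       // 0, because x + y - 1 wrapped to 13
  return 0;
}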