Mirror of https://gitlab.com/libeigen/eigen.git (synced 2024-12-27 07:29:52 +08:00)
Revert "Avoid integer overflow in EigenMetaKernel indexing"
This reverts commit 100d7caf92
This commit is contained in:
parent
68e0d023c0
commit
185ad0e610
@@ -553,39 +553,11 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 };
 
 #if defined(EIGEN_GPUCC)
-// Returns lhs + rhs, saturating to the highest/lowest representable value on
-// overflow/underflow respectively.
-template <typename Index>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
-  const Index highest = NumTraits<Index>::highest();
-  const Index lowest = NumTraits<Index>::lowest();
-  if (lhs > 0 && rhs > 0) {
-    return (lhs > highest - rhs) ? highest : lhs + rhs;
-  } else if (lhs < 0 && rhs < 0) {
-    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
-  } else {
-    return lhs + rhs;
-  }
-}
-
-#if !defined(EIGEN_USE_HIP)
-// Specialization for int32 using PTX intrinsic.
-template <>
-__device__ EIGEN_ALWAYS_INLINE int32_t saturate_add<int32_t>(int32_t lhs,
-                                                             int32_t rhs) {
-  // add.sat is only supported for s32.
-  int32_t result;
-  asm("add.sat.s32 %0, %1, %2;" : "=r"(result) : "r"(lhs), "r"(rhs));
-  return result;
-}
-#endif
-
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx;
-         i = saturate_add(i, step_size)) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
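For reference, the saturating add this revert deletes can be exercised on the host. Below is a minimal standalone C++ sketch of the same logic, substituting std::numeric_limits for Eigen's NumTraits; the main() and its test values are illustrative, not part of Eigen:

#include <cassert>
#include <cstdint>
#include <limits>

// Host-side sketch of the reverted saturate_add: clamp lhs + rhs to the
// representable range instead of wrapping around on overflow/underflow.
template <typename Index>
Index saturate_add(Index lhs, Index rhs) {
  const Index highest = std::numeric_limits<Index>::max();
  const Index lowest = std::numeric_limits<Index>::min();
  if (lhs > 0 && rhs > 0) {
    // Both positive: lhs + rhs overflows iff lhs > highest - rhs.
    return (lhs > highest - rhs) ? highest : lhs + rhs;
  } else if (lhs < 0 && rhs < 0) {
    // Both negative: lhs + rhs underflows iff lhs < lowest - rhs.
    return (lhs < lowest - rhs) ? lowest : lhs + rhs;
  } else {
    // Mixed signs can never leave the representable range.
    return lhs + rhs;
  }
}

int main() {
  assert(saturate_add<int32_t>(2000000000, 2000000000) == INT32_MAX);   // clamps high
  assert(saturate_add<int32_t>(-2000000000, -2000000000) == INT32_MIN); // clamps low
  assert(saturate_add<int32_t>(40, 2) == 42);  // ordinary addition is unchanged
  return 0;
}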
@@ -601,11 +573,10 @@ struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
 
    // Use the vector path
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i = saturate_add(i, vectorized_step_size)) {
+         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
-    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
-         i = saturate_add(i, step_size)) {
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
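What the saturating stride guarded against: with the plain `i += step_size` restored above, an index near the top of StorageIndex's range can wrap past lastIdx instead of stopping. A small host-side sketch using a hypothetical 8-bit index makes the boundary easy to hit; the values are illustrative:

#include <cstdint>
#include <cstdio>

// Saturating step for the demo; the promotion to int keeps the check well-defined.
static int8_t sat_add8(int8_t a, int8_t b) {
  const int s = int(a) + int(b);
  if (s > INT8_MAX) return INT8_MAX;
  if (s < INT8_MIN) return INT8_MIN;
  return int8_t(s);
}

int main() {
  const int8_t lastIdx = 120, step = 32;
  // Saturating stride: visits 0, 32, 64, 96; then sat_add8(96, 32) == 127,
  // and 127 >= 120, so the loop stops at the type's ceiling.
  for (int8_t i = 0; i < lastIdx; i = sat_add8(i, step)) {
    std::printf("visit %d\n", int(i));
  }
  // With a plain `i += step`, 96 + 32 == 128 does not fit in int8_t: the value
  // wraps to -128 (implementation-defined here via integer promotion; for a
  // full-width signed Index the overflow is undefined behavior), stays below
  // lastIdx, and the loop revisits indices instead of terminating.
  return 0;
}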
@@ -632,11 +603,8 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
  if (needs_assign) {
 
    const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks =
-        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
-                                  device.maxGpuThreadsPerMultiProcessor(),
-                              NumTraits<StorageIndex>::highest()) /
-        block_size;
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
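The removed numext::mini clamp bounded the multiprocessor-threads product by NumTraits<StorageIndex>::highest() before dividing by block_size, so a narrow index type could not be exceeded downstream. A sketch of the arithmetic difference, using made-up device numbers and std::min standing in for numext::mini:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical device properties, for illustration only.
  const int64_t num_sms = 128;
  const int64_t threads_per_sm = 2048;
  const int block_size = 1024;

  // Restored form: the plain product divided by block_size.
  const int max_blocks_plain = int(num_sms * threads_per_sm / block_size);  // 256

  // Reverted form: clamp the product to the index type's maximum first.
  // INT16_MAX stands in for NumTraits<StorageIndex>::highest() with a
  // hypothetical 16-bit StorageIndex.
  const int64_t highest = INT16_MAX;
  const int max_blocks_clamped =
      int(std::min<int64_t>(num_sms * threads_per_sm, highest) / block_size);  // 32767/1024 == 31

  std::printf("plain=%d clamped=%d\n", max_blocks_plain, max_blocks_clamped);
  return 0;
}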
@@ -30,15 +30,13 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
+  return static_cast<T>((x + y - 1) / y);
 }
 
 template <size_t n> struct max_n_1 {
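The deleted comment names the trade-off: the restored `(x + y - 1) / y` forms the intermediate `x + y - 1`, which can exceed the type's range when x is near its maximum, whereas `x == 0 ? 0 : (x - 1) / y + 1` never produces a value larger than x. A host-side sketch with uint8_t, where unsigned wraparound makes the failure observable without undefined behavior (the values are illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t x = 250, y = 16;  // ceil(250 / 16) == 16

  // Restored form: x + y - 1 == 265 wraps to 9 in uint8_t, so 9 / 16 == 0.
  const uint8_t naive = uint8_t(uint8_t(x + y - 1) / y);

  // Reverted, overflow-safe form: (250 - 1) / 16 + 1 == 15 + 1 == 16.
  const uint8_t safe = uint8_t(x == 0 ? 0 : (x - 1) / y + 1);

  std::printf("naive=%u safe=%u\n", unsigned(naive), unsigned(safe));  // naive=0 safe=16
  return 0;
}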
@@ -66,47 +66,6 @@ void test_gpu_nullary() {
  gpuFree(d_in2);
 }
 
-// Tests that there are no indexing overflows when computing tensors with the
-// max representable size.
-template <typename IndexType,
-          IndexType N = (std::numeric_limits<IndexType>::max)()>
-void test_gpu_nullary_max_size()
-{
-  typedef int8_t DataType;
-  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
-  typedef Eigen::array<IndexType, 1> ArrayType;
-
-  const IndexType n = N;
-  TensorType in1((ArrayType(n)));
-  in1.setZero();
-
-  std::size_t in1_bytes = in1.size() * sizeof(DataType);
-
-  DataType* d_in1;
-  gpuMalloc((void**)(&d_in1), in1_bytes);
-
-  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
-
-  Eigen::GpuStreamDevice stream;
-  Eigen::GpuDevice gpu_device(&stream);
-
-  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
-
-  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
-
-  TensorType new1((ArrayType(n)));
-
-  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
-                        gpu_device.stream()) == gpuSuccess);
-  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
-
-  for (IndexType i = 0; i < n; ++i) {
-    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
-  }
-
-  gpuFree(d_in1);
-}
-
 void test_gpu_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1565,10 +1524,6 @@ void test_gpu_gamma_sample_der_alpha()
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
  CALL_SUBTEST_1(test_gpu_nullary());
-  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
-  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
-  CALL_SUBTEST_1((test_gpu_nullary_max_size<
-      int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
  CALL_SUBTEST_1(test_gpu_elementwise_small());
  CALL_SUBTEST_1(test_gpu_elementwise());
  CALL_SUBTEST_1(test_gpu_props());