From c6b0de2c21c921e1c1cca4978d4e9d6e18559a15 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 22 Jul 2016 17:18:20 -0700 Subject: [PATCH] Improved partial reductions in more cases --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 22 ++++++++++---- unsupported/test/cxx11_tensor_reduction.cpp | 29 +++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index f950f00933..0e15769548 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -492,7 +492,7 @@ struct TensorEvaluator, Device> } // Attempt to use an optimized reduction. - else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -505,8 +505,12 @@ struct TensorEvaluator, Device> (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve) || (m_result != NULL); } bool preserving_inner_dims = true; @@ -521,8 +525,12 @@ struct TensorEvaluator, Device> preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve) || (m_result != NULL); } } return true; @@ -537,8 +545,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (RunningFullReduction && m_result) { - return *m_result; + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); } Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { @@ -560,6 +568,10 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index ca483257b5..1490ec3da5 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -239,6 +239,33 @@ static void test_simple_reductions() { } } + +template +static void test_reductions_in_expr() { + Tensor tensor(2, 3, 5, 7); + tensor.setRandom(); + array reduction_axis2; + reduction_axis2[0] = 1; + reduction_axis2[1] = 3; + + Tensor result(2, 5); + result = result.constant(1.0f) - tensor.sum(reduction_axis2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), 1.0f - sum); + } + } +} + + template static void test_full_reductions() { Tensor tensor(2, 3); @@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_trivial_reductions()); CALL_SUBTEST(test_simple_reductions()); CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_reductions_in_expr()); + CALL_SUBTEST(test_reductions_in_expr()); CALL_SUBTEST(test_full_reductions()); CALL_SUBTEST(test_full_reductions()); CALL_SUBTEST(test_user_defined_reductions());