diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index cf1e821a9..24a7dd0ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -89,19 +89,22 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef internal::TensorIntDivisor IndexDivisor; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + : m_device(device), m_generator(op.generator()) #ifdef EIGEN_USE_SYCL , m_argImpl(op.expression(), device) #endif @@ -154,7 +157,85 @@ struct TensorEvaluator, Device> return rslt; } - // TODO(ezhulenev): Add tiled evaluation support. 
+ // Appends a resource requirement asking for kSkewedInnerDims shaped blocks + // whose target size is one first-level cache worth of Scalar coefficients + // (firstLevelCacheSize() / sizeof(Scalar)), clamped to at least 1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + Eigen::Index block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, block_total_size_max)); + } + + // Per-dimension iteration state used by block(). `size` and `stride` are + // copied from the output block, `span` is stride * (size - 1), and `count` + // is the number of steps already taken along the dimension. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + // Fills `output_block` by invoking the generator on the coordinates of + // every block element, walking the block one inner-most run at a time and + // then stepping the outer dimensions (with carry) to the next run. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) return; + + static const bool is_col_major = + static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(output_block->first_coeff_index(), coords); + array initial_coords = coords; + + CoeffReturnType* data = output_block->data(); + Index offset = 0; + + // Initialize output block iterator state. Dimensions in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (Index i = 0; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = output_block->block_sizes()[dim]; + it[i].stride = output_block->block_strides()[dim]; + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Generate data for the inner-most dimension. + for (Index i = 0; i < it[0].size; ++i) { + *(data + offset + i) = m_generator(coords); + coords[is_col_major ? 0 : NumDims - 1]++; + } + coords[is_col_major ? 0 : NumDims - 1] = + initial_coords[is_col_major ? 0 : NumDims - 1]; + + // For a 1d tensor there are no outer dimensions to advance: the single + // inner-most run above already covers the whole block, and without this + // break `it[0].count` would never change and the loop would not terminate. + if (NumDims == 1) break; + + // Update offset. + for (Index i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? 
i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { @@ -191,6 +272,7 @@ struct TensorEvaluator, Device> } } + const Device& m_device; Dimensions m_dimensions; array m_strides; array m_fast_strides; diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 608306613..162dab7b8 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -484,6 +484,49 @@ static void test_execute_broadcasting_of_forced_eval(Device d) } } +template +struct DummyGenerator { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T operator()(const array & dims) const { + T result = static_cast(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast((i + 1) * dims[i]); + } + return result; + } +}; + +template +static void test_execute_generator_op(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(20, 30); + Tensor src(dims); + src.setRandom(); + + const auto expr = src.generate(DummyGenerator()); + + // We assume that generator on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the generation using configured tensor executor. 
+ Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART @@ -565,8 +608,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); + // Force CMake to split this test. - // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12 + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13 } #undef CALL_SUBTEST_COMBINATIONS diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp index ee5e29b77..6dcf676bb 100644 --- a/unsupported/test/cxx11_tensor_generator.cpp +++ b/unsupported/test/cxx11_tensor_generator.cpp @@ -42,11 +42,11 @@ struct Generator2D { template static void test_2D() { - Tensor matrix(5, 7); + Tensor matrix(512, 512); Tensor result = matrix.generate(Generator2D()); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 512; ++i) { + for (int j = 0; j < 512; ++j) { VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); } }