mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
751 lines
30 KiB
C++
751 lines
30 KiB
C++
// This file is part of Eigen, a lightweight C++ template library
|
|
// for linear algebra.
|
|
//
|
|
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla
|
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
#define EIGEN_USE_THREADS
|
|
|
|
#include "main.h"
|
|
|
|
#include <Eigen/CXX11/Tensor>
|
|
|
|
using Eigen::Tensor;
|
|
using Eigen::RowMajor;
|
|
using Eigen::ColMajor;
|
|
|
|
// A set of tests to verify that different TensorExecutor strategies yields the
|
|
// same results for all the ops, supporting tiled evaluation.
|
|
|
|
template <int NumDims>
|
|
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
|
|
array<Index, NumDims> dims;
|
|
for (int i = 0; i < NumDims; ++i) {
|
|
dims[i] = internal::random<int>(min_dim, max_dim);
|
|
}
|
|
return dims;
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_unary_expr(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
// Pick a large enough tensor size to bypass small tensor block evaluation
|
|
// optimization.
|
|
auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
|
|
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
Tensor<T, NumDims, Options, Index> dst(dims);
|
|
|
|
src.setRandom();
|
|
const auto expr = src.square();
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
T square = src.coeff(i) * src.coeff(i);
|
|
VERIFY_IS_EQUAL(square, dst.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_binary_expr(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
// Pick a large enough tensor size to bypass small tensor block evaluation
|
|
// optimization.
|
|
auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
|
|
|
|
Tensor<T, NumDims, Options, Index> lhs(dims);
|
|
Tensor<T, NumDims, Options, Index> rhs(dims);
|
|
Tensor<T, NumDims, Options, Index> dst(dims);
|
|
|
|
lhs.setRandom();
|
|
rhs.setRandom();
|
|
|
|
const auto expr = lhs + rhs;
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
T sum = lhs.coeff(i) + rhs.coeff(i);
|
|
VERIFY_IS_EQUAL(sum, dst.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_broadcasting(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(1, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
const auto broadcasts = RandomDims<NumDims>(1, 7);
|
|
const auto expr = src.broadcast(broadcasts);
|
|
|
|
// We assume that broadcasting on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor<T, NumDims, Options, Index> golden;
|
|
golden = expr;
|
|
|
|
// Now do the broadcasting using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_chipping_rvalue(Device d)
|
|
{
|
|
auto dims = RandomDims<NumDims>(1, 10);
|
|
Tensor<T, NumDims, Layout, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
#define TEST_CHIPPING(CHIP_DIM) \
|
|
if (NumDims > (CHIP_DIM)) { \
|
|
const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
|
|
const auto expr = src.template chip<(CHIP_DIM)>(offset); \
|
|
\
|
|
Tensor<T, NumDims - 1, Layout, Index> golden; \
|
|
golden = expr; \
|
|
\
|
|
Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
|
|
\
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
|
|
using Executor = internal::TensorExecutor<const Assign, Device, \
|
|
Vectorizable, Tileable>; \
|
|
\
|
|
Executor::run(Assign(dst, expr), d); \
|
|
\
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
|
|
} \
|
|
}
|
|
|
|
TEST_CHIPPING(0)
|
|
TEST_CHIPPING(1)
|
|
TEST_CHIPPING(2)
|
|
TEST_CHIPPING(3)
|
|
TEST_CHIPPING(4)
|
|
TEST_CHIPPING(5)
|
|
|
|
#undef TEST_CHIPPING
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_chipping_lvalue(Device d)
|
|
{
|
|
auto dims = RandomDims<NumDims>(1, 10);
|
|
|
|
#define TEST_CHIPPING(CHIP_DIM) \
|
|
if (NumDims > (CHIP_DIM)) { \
|
|
/* Generate random data that we'll assign to the chipped tensor dim. */ \
|
|
array<Index, NumDims - 1> src_dims; \
|
|
for (int i = 0; i < NumDims - 1; ++i) { \
|
|
int dim = i < (CHIP_DIM) ? i : i + 1; \
|
|
src_dims[i] = dims[dim]; \
|
|
} \
|
|
\
|
|
Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
|
|
src.setRandom(); \
|
|
\
|
|
const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
|
|
\
|
|
/* Generate random data to fill non-chipped dimensions*/ \
|
|
Tensor<T, NumDims, Layout, Index> random(dims); \
|
|
random.setRandom(); \
|
|
\
|
|
Tensor<T, NumDims, Layout, Index> golden(dims); \
|
|
golden = random; \
|
|
golden.template chip<(CHIP_DIM)>(offset) = src; \
|
|
\
|
|
Tensor<T, NumDims, Layout, Index> dst(dims); \
|
|
dst = random; \
|
|
auto expr = dst.template chip<(CHIP_DIM)>(offset); \
|
|
\
|
|
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
|
|
using Executor = internal::TensorExecutor<const Assign, Device, \
|
|
Vectorizable, Tileable>; \
|
|
\
|
|
Executor::run(Assign(expr, src), d); \
|
|
\
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
|
|
} \
|
|
}
|
|
|
|
TEST_CHIPPING(0)
|
|
TEST_CHIPPING(1)
|
|
TEST_CHIPPING(2)
|
|
TEST_CHIPPING(3)
|
|
TEST_CHIPPING(4)
|
|
TEST_CHIPPING(5)
|
|
|
|
#undef TEST_CHIPPING
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_shuffle_rvalue(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(1, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Create a random dimension re-ordering/shuffle.
|
|
std::vector<Index> shuffle;
|
|
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
|
|
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
|
|
|
|
const auto expr = src.shuffle(shuffle);
|
|
|
|
// We assume that shuffling on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor<T, NumDims, Options, Index> golden;
|
|
golden = expr;
|
|
|
|
// Now do the shuffling using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_shuffle_lvalue(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(5, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Create a random dimension re-ordering/shuffle.
|
|
std::vector<Index> shuffle;
|
|
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
|
|
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
|
|
|
|
array<Index, NumDims> shuffled_dims;
|
|
for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
|
|
|
|
// We assume that shuffling on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
|
|
golden.shuffle(shuffle) = src;
|
|
|
|
// Now do the shuffling using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
|
|
|
|
auto expr = dst.shuffle(shuffle);
|
|
|
|
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(expr, src), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_reduction(Device d)
|
|
{
|
|
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
|
|
|
static constexpr int ReducedDims = NumDims - 2;
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(5, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Pick two random and unique reduction dimensions.
|
|
int reduction0 = internal::random<int>(0, NumDims - 1);
|
|
int reduction1 = internal::random<int>(0, NumDims - 1);
|
|
while (reduction0 == reduction1) {
|
|
reduction1 = internal::random<int>(0, NumDims - 1);
|
|
}
|
|
|
|
DSizes<Index, 2> reduction_axis;
|
|
reduction_axis[0] = reduction0;
|
|
reduction_axis[1] = reduction1;
|
|
|
|
Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);
|
|
|
|
// Now do the reduction using configured tensor executor.
|
|
Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());
|
|
|
|
auto expr = src.sum(reduction_axis);
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_reshape(Device d)
|
|
{
|
|
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
|
|
|
static constexpr int ReshapedDims = NumDims - 1;
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(5, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Multiple 0th dimension and then shuffle.
|
|
std::vector<Index> shuffle;
|
|
for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
|
|
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
|
|
|
|
DSizes<Index, ReshapedDims> reshaped_dims;
|
|
reshaped_dims[shuffle[0]] = dims[0] * dims[1];
|
|
for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
|
|
|
|
Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
|
|
|
|
// Now reshape using configured tensor executor.
|
|
Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
|
|
|
|
auto expr = src.reshape(reshaped_dims);
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_slice_rvalue(Device d)
|
|
{
|
|
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(5, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Pick a random slice of src tensor.
|
|
auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
|
|
auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
|
|
|
|
// Make sure that slice start + size do not overflow tensor dims.
|
|
for (int i = 0; i < NumDims; ++i) {
|
|
slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
|
|
slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
|
|
}
|
|
|
|
Tensor<T, NumDims, Options, Index> golden =
|
|
src.slice(slice_start, slice_size);
|
|
|
|
// Now reshape using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
auto expr = src.slice(slice_start, slice_size);
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_slice_lvalue(Device d)
|
|
{
|
|
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(5, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Pick a random slice of src tensor.
|
|
auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
|
|
auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
|
|
|
|
// Make sure that slice start + size do not overflow tensor dims.
|
|
for (int i = 0; i < NumDims; ++i) {
|
|
slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
|
|
slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
|
|
}
|
|
|
|
Tensor<T, NumDims, Options, Index> slice(slice_size);
|
|
slice.setRandom();
|
|
|
|
// Assign a slice using default executor.
|
|
Tensor<T, NumDims, Options, Index> golden = src;
|
|
golden.slice(slice_start, slice_size) = slice;
|
|
|
|
// And using configured execution strategy.
|
|
Tensor<T, NumDims, Options, Index> dst = src;
|
|
auto expr = dst.slice(slice_start, slice_size);
|
|
|
|
using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(expr, slice), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_broadcasting_of_forced_eval(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(1, 10);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
const auto broadcasts = RandomDims<NumDims>(1, 7);
|
|
const auto expr = src.square().eval().broadcast(broadcasts);
|
|
|
|
// We assume that broadcasting on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor<T, NumDims, Options, Index> golden;
|
|
golden = expr;
|
|
|
|
// Now do the broadcasting using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template<typename T, int NumDims>
|
|
struct DummyGenerator {
|
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
|
T operator()(const array <Index, NumDims>& dims) const {
|
|
T result = static_cast<T>(0);
|
|
for (int i = 0; i < NumDims; ++i) {
|
|
result += static_cast<T>((i + 1) * dims[i]);
|
|
}
|
|
return result;
|
|
}
|
|
};
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_generator_op(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(20, 30);
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
const auto expr = src.generate(DummyGenerator<T, NumDims>());
|
|
|
|
// We assume that generator on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor<T, NumDims, Options, Index> golden;
|
|
golden = expr;
|
|
|
|
// Now do the broadcasting using configured tensor executor.
|
|
Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_execute_reverse_rvalue(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
|
|
Tensor <T, NumDims, Options, Index> src(dims);
|
|
src.setRandom();
|
|
|
|
// Reverse half of the dimensions.
|
|
Eigen::array<bool, NumDims> reverse;
|
|
for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0);
|
|
|
|
const auto expr = src.reverse(reverse);
|
|
|
|
// We assume that reversing on a default device is tested and correct, so
|
|
// we can rely on it to verify correctness of tensor executor and tiling.
|
|
Tensor <T, NumDims, Options, Index> golden;
|
|
golden = expr;
|
|
|
|
// Now do the reversing using configured tensor executor.
|
|
Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor =
|
|
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
|
|
|
|
Executor::run(Assign(dst, expr), d);
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_async_execute_unary_expr(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
// Pick a large enough tensor size to bypass small tensor block evaluation
|
|
// optimization.
|
|
auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
|
|
|
|
Tensor<T, NumDims, Options, Index> src(dims);
|
|
Tensor<T, NumDims, Options, Index> dst(dims);
|
|
|
|
src.setRandom();
|
|
const auto expr = src.square();
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor = internal::TensorAsyncExecutor<const Assign, Device,
|
|
Vectorizable, Tileable>;
|
|
Eigen::Barrier done(1);
|
|
Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); });
|
|
done.Wait();
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
T square = src.coeff(i) * src.coeff(i);
|
|
VERIFY_IS_EQUAL(square, dst.coeff(i));
|
|
}
|
|
}
|
|
|
|
template <typename T, int NumDims, typename Device, bool Vectorizable,
|
|
bool Tileable, int Layout>
|
|
static void test_async_execute_binary_expr(Device d)
|
|
{
|
|
static constexpr int Options = 0 | Layout;
|
|
|
|
// Pick a large enough tensor size to bypass small tensor block evaluation
|
|
// optimization.
|
|
auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
|
|
|
|
Tensor<T, NumDims, Options, Index> lhs(dims);
|
|
Tensor<T, NumDims, Options, Index> rhs(dims);
|
|
Tensor<T, NumDims, Options, Index> dst(dims);
|
|
|
|
lhs.setRandom();
|
|
rhs.setRandom();
|
|
|
|
const auto expr = lhs + rhs;
|
|
|
|
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
|
|
using Executor = internal::TensorAsyncExecutor<const Assign, Device,
|
|
Vectorizable, Tileable>;
|
|
|
|
Eigen::Barrier done(1);
|
|
Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); });
|
|
done.Wait();
|
|
|
|
for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
|
|
T sum = lhs.coeff(i) + rhs.coeff(i);
|
|
VERIFY_IS_EQUAL(sum, dst.coeff(i));
|
|
}
|
|
}
|
|
|
|
#ifdef EIGEN_DONT_VECTORIZE
|
|
#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
|
|
#else
|
|
#define VECTORIZABLE(VAL) VAL
|
|
#endif
|
|
|
|
#define CALL_SUBTEST_PART(PART) \
|
|
CALL_SUBTEST_##PART
|
|
|
|
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
|
|
|
|
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
|
|
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
|
|
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
|
|
|
|
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
|
|
Eigen::DefaultDevice default_device;
|
|
// Default device is unused in ASYNC tests.
|
|
EIGEN_UNUSED_VARIABLE(default_device);
|
|
|
|
const auto num_threads = internal::random<int>(20, 24);
|
|
Eigen::ThreadPool tp(num_threads);
|
|
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
|
|
|
|
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
|
|
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
|
|
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
|
|
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
|
|
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
|
|
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
|
|
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
|
|
CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
|
|
|
|
// Force CMake to split this test.
|
|
// EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
|
|
}
|
|
|
|
#undef CALL_SUBTEST_COMBINATIONS
|