Mirror of https://gitlab.com/libeigen/eigen.git

Merged in ezhulenev/eigen/tiling_3 (pull request PR-438)
Tiled tensor executor

Commit 678a0dcb12
@@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorGlobalFunctions.h"

#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"

#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"

@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;

static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}

@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;

static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
static const int NumDims = XprType::NumDims;

enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};

typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;

EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(
(static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
}

EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>* resources) const {
m_leftImpl.getResourceRequirements(resources);
m_rightImpl.getResourceRequirements(resources);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != nullptr) {
TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
block->tensor_strides(), block->tensor_strides(),
m_leftImpl.data() + block->first_coeff_index());
m_rightImpl.block(&left_block);
} else {
m_rightImpl.block(block);
m_leftImpl.writeBlock(*block);
}
}

/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor

@@ -14,6 +14,32 @@
namespace Eigen {
namespace internal {

namespace {

// Helper template to choose between ColMajor and RowMajor values.
template <int Layout>
struct cond;

template <>
struct cond<ColMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& col,
const T& /*row*/) const {
return col;
}
};

template <>
struct cond<RowMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
const T& row) const {
return row;
}
};

} // namespace

/**
* \class TensorBlockShapeType
* \ingroup CXX11_Tensor_Module
@@ -39,6 +65,40 @@ enum class TensorBlockShapeType {
kSkewedInnerDims,
};

struct TensorOpResourceRequirements {
TensorBlockShapeType block_shape;
Index block_total_size;
// TODO(andydavis) Add 'target_num_threads' to support communication of
// thread-resource requirements. This will allow ops deep in the
// expression tree (like reductions) to communicate resource
// requirements based on local state (like the total number of reductions
// to be computed).
TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
const Index size)
: block_shape(shape), block_total_size(size) {}
};

// Tries to merge multiple resource requirements.
EIGEN_STRONG_INLINE void MergeResourceRequirements(
const std::vector<TensorOpResourceRequirements>& resources,
TensorBlockShapeType* block_shape, Index* block_total_size) {
if (resources.empty()) {
return;
}
// TODO(andydavis) Implement different policies (i.e. revert to a default
// policy if block shapes/sizes conflict).
*block_shape = resources[0].block_shape;
*block_total_size = resources[0].block_total_size;
for (int i = 1; i < resources.size(); ++i) {
if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
*block_shape != TensorBlockShapeType::kSkewedInnerDims) {
*block_shape = TensorBlockShapeType::kSkewedInnerDims;
}
*block_total_size =
numext::maxi(*block_total_size, resources[i].block_total_size);
}
}

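As an aside, a minimal usage sketch of the merging policy above (illustrative only, not part of the patch; the requested sizes are invented):

    std::vector<Eigen::internal::TensorOpResourceRequirements> resources;
    resources.emplace_back(
        Eigen::internal::TensorBlockShapeType::kSkewedInnerDims, /*size=*/1024);
    resources.emplace_back(
        Eigen::internal::TensorBlockShapeType::kSkewedInnerDims, /*size=*/4096);

    Eigen::internal::TensorBlockShapeType block_shape;
    Eigen::Index block_total_size = 0;
    Eigen::internal::MergeResourceRequirements(resources, &block_shape,
                                               &block_total_size);
    // block_shape == kSkewedInnerDims, block_total_size == 4096 (the max request).
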
/**
* \class TensorBlock
* \ingroup CXX11_Tensor_Module
@@ -48,12 +108,12 @@ enum class TensorBlockShapeType {
* This class represents a tensor block specified by the index of the
* first block coefficient, and the size of the block in each dimension.
*/
template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlock {
public:
typedef DSizes<Index, NumDims> Dimensions;
typedef DSizes<StorageIndex, NumDims> Dimensions;

TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes,
TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
const Dimensions& block_strides, const Dimensions& tensor_strides,
Scalar* data)
: m_first_coeff_index(first_coeff_index),
@@ -62,7 +122,7 @@ class TensorBlock {
m_tensor_strides(tensor_strides),
m_data(data) {}

Index first_coeff_index() const { return m_first_coeff_index; }
StorageIndex first_coeff_index() const { return m_first_coeff_index; }

const Dimensions& block_sizes() const { return m_block_sizes; }

@@ -75,13 +135,487 @@ class TensorBlock {
const Scalar* data() const { return m_data; }

private:
Index m_first_coeff_index;
StorageIndex m_first_coeff_index;
Dimensions m_block_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
Scalar* m_data; // Not owned.
};

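For orientation, a hypothetical sketch of what a TensorBlock describes (the sizes and strides below are invented for illustration): it only records the block's position, shape and strides over a caller-owned buffer, and never owns the data.

    // A 2x3 block of a column-major 8x6 float tensor, starting at linear
    // offset 4. `buffer` is the caller-owned destination storage.
    float buffer[2 * 3];
    Eigen::DSizes<Eigen::Index, 2> block_sizes(2, 3);
    Eigen::DSizes<Eigen::Index, 2> block_strides(1, 2);   // strides inside the block
    Eigen::DSizes<Eigen::Index, 2> tensor_strides(1, 8);  // strides of the full tensor
    Eigen::internal::TensorBlock<float, Eigen::Index, 2, Eigen::ColMajor> block(
        /*first_coeff_index=*/4, block_sizes, block_strides, tensor_strides,
        buffer);
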
|
||||
template <typename Scalar, typename StorageIndex>
|
||||
struct TensorBlockCopyOp {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
|
||||
const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
|
||||
const StorageIndex src_index, const StorageIndex src_stride,
|
||||
const Scalar* EIGEN_RESTRICT src_data) {
|
||||
const Scalar* src_base = &src_data[src_index];
|
||||
Scalar* dst_base = &dst_data[dst_index];
|
||||
|
||||
using Src = const Eigen::Array<Scalar, Dynamic, 1>;
|
||||
using Dst = Eigen::Array<Scalar, Dynamic, 1>;
|
||||
|
||||
using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
|
||||
using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
|
||||
|
||||
const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
|
||||
DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
|
||||
|
||||
dst = src;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \class TensorBlockIO
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor block IO class.
|
||||
*
|
||||
* This class is responsible for copying data between a tensor and a tensor
|
||||
* block.
|
||||
*/
|
||||
template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
|
||||
bool BlockRead>
|
||||
class TensorBlockIO {
|
||||
public:
|
||||
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
|
||||
TensorBlockCopyOp;
|
||||
|
||||
protected:
|
||||
struct BlockIteratorState {
|
||||
StorageIndex input_stride;
|
||||
StorageIndex output_stride;
|
||||
StorageIndex input_span;
|
||||
StorageIndex output_span;
|
||||
StorageIndex size;
|
||||
StorageIndex count;
|
||||
};
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
|
||||
const TensorBlock& block, StorageIndex first_coeff_index,
|
||||
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
|
||||
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
|
||||
Scalar* dst_data) {
|
||||
// Find the innermost tensor dimension whose size is not 1. This is the
|
||||
// effective inner dim. If all dimensions are of size 1, then fallback to
|
||||
// using the actual innermost dim to avoid out-of-bound access.
|
||||
StorageIndex num_size_one_inner_dims = 0;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
|
||||
num_size_one_inner_dims = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Calculate strides and dimensions.
|
||||
const StorageIndex tensor_stride1_dim = cond<Layout>()(
|
||||
num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
|
||||
const StorageIndex block_dim_for_tensor_stride1_dim =
|
||||
NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
|
||||
size_t block_inner_dim_size =
|
||||
NumDims == 0 ? 1
|
||||
: block.block_sizes()[block_dim_for_tensor_stride1_dim];
|
||||
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
const StorageIndex block_stride =
|
||||
block.block_strides()[tensor_to_block_dim_map[dim]];
|
||||
if (block_inner_dim_size == block_stride &&
|
||||
block_stride == tensor_strides[dim]) {
|
||||
block_inner_dim_size *=
|
||||
block.block_sizes()[tensor_to_block_dim_map[dim]];
|
||||
++num_size_one_inner_dims;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
StorageIndex inputIndex;
|
||||
StorageIndex outputIndex;
|
||||
StorageIndex input_stride;
|
||||
StorageIndex output_stride;
|
||||
|
||||
// Setup strides to read/write along the tensor's stride1 dimension.
|
||||
if (BlockRead) {
|
||||
inputIndex = first_coeff_index;
|
||||
outputIndex = 0;
|
||||
input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
|
||||
output_stride =
|
||||
NumDims == 0
|
||||
? 1
|
||||
: block.block_strides()[block_dim_for_tensor_stride1_dim];
|
||||
} else {
|
||||
inputIndex = 0;
|
||||
outputIndex = first_coeff_index;
|
||||
input_stride =
|
||||
NumDims == 0
|
||||
? 1
|
||||
: block.block_strides()[block_dim_for_tensor_stride1_dim];
|
||||
output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
|
||||
}
|
||||
|
||||
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
|
||||
array<BlockIteratorState, at_least_1_dim> block_iter_state;
|
||||
|
||||
// Initialize block iterator state. Squeeze away any dimension of size 1.
|
||||
int num_squeezed_dims = 0;
|
||||
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
|
||||
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
|
||||
const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
|
||||
if (size == 1) {
|
||||
continue;
|
||||
}
|
||||
block_iter_state[num_squeezed_dims].size = size;
|
||||
if (BlockRead) {
|
||||
block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
|
||||
block_iter_state[num_squeezed_dims].output_stride =
|
||||
block.block_strides()[tensor_to_block_dim_map[dim]];
|
||||
} else {
|
||||
block_iter_state[num_squeezed_dims].input_stride =
|
||||
block.block_strides()[tensor_to_block_dim_map[dim]];
|
||||
block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
|
||||
}
|
||||
block_iter_state[num_squeezed_dims].input_span =
|
||||
block_iter_state[num_squeezed_dims].input_stride *
|
||||
(block_iter_state[num_squeezed_dims].size - 1);
|
||||
block_iter_state[num_squeezed_dims].output_span =
|
||||
block_iter_state[num_squeezed_dims].output_stride *
|
||||
(block_iter_state[num_squeezed_dims].size - 1);
|
||||
block_iter_state[num_squeezed_dims].count = 0;
|
||||
++num_squeezed_dims;
|
||||
}
|
||||
|
||||
// Iterate copying data from src to dst.
|
||||
const StorageIndex block_total_size =
|
||||
NumDims == 0 ? 1 : block.block_sizes().TotalSize();
|
||||
for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
|
||||
TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
|
||||
dst_data, inputIndex, input_stride, src_data);
|
||||
// Update index.
|
||||
for (int j = 0; j < num_squeezed_dims; ++j) {
|
||||
if (++block_iter_state[j].count < block_iter_state[j].size) {
|
||||
inputIndex += block_iter_state[j].input_stride;
|
||||
outputIndex += block_iter_state[j].output_stride;
|
||||
break;
|
||||
}
|
||||
block_iter_state[j].count = 0;
|
||||
inputIndex -= block_iter_state[j].input_span;
|
||||
outputIndex -= block_iter_state[j].output_span;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \class TensorBlockReader
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor block reader class.
|
||||
*
|
||||
* This class is responsible for reading a tensor block.
|
||||
*
|
||||
*/
|
||||
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
|
||||
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
|
||||
Layout, /*BlockRead=*/true> {
|
||||
public:
|
||||
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
|
||||
Base;
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
TensorBlock* block, const Scalar* src_data) {
|
||||
array<StorageIndex, NumDims> tensor_to_block_dim_map;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
tensor_to_block_dim_map[i] = i;
|
||||
}
|
||||
Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
|
||||
block->tensor_strides(), src_data, block->data());
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
TensorBlock* block, StorageIndex first_coeff_index,
|
||||
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
|
||||
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
|
||||
Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
|
||||
tensor_strides, src_data, block->data());
|
||||
}
|
||||
};
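To make the reader's job concrete, here is a small hypothetical sketch (not from the patch; values invented) of gathering a 2x2 block out of a column-major 4x4 tensor with TensorBlockReader:

    typedef Eigen::internal::TensorBlock<float, Eigen::Index, 2, Eigen::ColMajor>
        Block;
    typedef Eigen::internal::TensorBlockReader<float, Eigen::Index, 2,
                                               Eigen::ColMajor> Reader;

    float tensor[16];  // the full 4x4 tensor, column-major
    for (int i = 0; i < 16; ++i) tensor[i] = static_cast<float>(i);

    float block_storage[4];  // contiguous destination for the 2x2 block
    Eigen::DSizes<Eigen::Index, 2> sizes(2, 2);
    Eigen::DSizes<Eigen::Index, 2> block_strides(1, 2);
    Eigen::DSizes<Eigen::Index, 2> tensor_strides(1, 4);
    Block block(/*first_coeff_index=*/0, sizes, block_strides, tensor_strides,
                block_storage);
    Reader::Run(&block, tensor);
    // block_storage == {0, 1, 4, 5}
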
|
||||
|
||||
/**
|
||||
* \class TensorBlockWriter
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor block writer class.
|
||||
*
|
||||
* This class is responsible for writing a tensor block.
|
||||
*
|
||||
*/
|
||||
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
|
||||
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
|
||||
Layout, /*BlockRead=*/false> {
|
||||
public:
|
||||
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
|
||||
Base;
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
const TensorBlock& block, Scalar* dst_data) {
|
||||
array<StorageIndex, NumDims> tensor_to_block_dim_map;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
tensor_to_block_dim_map[i] = i;
|
||||
}
|
||||
Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
|
||||
block.tensor_strides(), block.data(), dst_data);
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
const TensorBlock& block, StorageIndex first_coeff_index,
|
||||
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
|
||||
const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
|
||||
Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
|
||||
tensor_strides, block.data(), dst_data);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \class TensorBlockCwiseBinaryOp
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Carries out a cwise binary op on a number of coefficients.
|
||||
*
|
||||
* This class reads strided inputs from left and right operands, and writes the
|
||||
* result of the cwise binary op to the strided output array.
|
||||
*
|
||||
*/
|
||||
struct TensorBlockCwiseBinaryOp {
|
||||
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
|
||||
typename LeftScalar, typename RightScalar>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
const BinaryFunctor& functor, const StorageIndex num_coeff,
|
||||
const StorageIndex output_index, const StorageIndex output_stride,
|
||||
OutputScalar* output_data, const StorageIndex left_index,
|
||||
const StorageIndex left_stride, const LeftScalar* left_data,
|
||||
const StorageIndex right_index, const StorageIndex right_stride,
|
||||
const RightScalar* right_data) {
|
||||
using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
|
||||
using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
|
||||
using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
|
||||
|
||||
using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
|
||||
using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
|
||||
using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
|
||||
|
||||
const LeftScalar* lhs_base = &left_data[left_index];
|
||||
const RightScalar* rhs_base = &right_data[right_index];
|
||||
OutputScalar* out_base = &output_data[output_index];
|
||||
|
||||
const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
|
||||
const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
|
||||
OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
|
||||
|
||||
out =
|
||||
Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
|
||||
}
|
||||
};
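As a concrete illustration (invented values, not part of the patch), a single Run call applies the functor to strided left/right inputs and writes a strided output:

    float lhs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float rhs[4] = {10, 20, 30, 40};
    float out[4];
    Eigen::internal::scalar_sum_op<float> sum;

    // Read 4 coefficients from lhs with stride 2, add the contiguous rhs,
    // and write the contiguous result: out == {10, 22, 34, 46}.
    Eigen::internal::TensorBlockCwiseBinaryOp::Run(
        sum, /*num_coeff=*/4,
        /*output_index=*/0, /*output_stride=*/1, out,
        /*left_index=*/0, /*left_stride=*/2, lhs,
        /*right_index=*/0, /*right_stride=*/1, rhs);
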
|
||||
|
||||
/**
|
||||
* \class TensorBlockCwiseBinaryIO
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor block IO class for carrying out cwise binary ops.
|
||||
*
|
||||
* This class carries out the binary op on given blocks.
|
||||
*
|
||||
*/
|
||||
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
|
||||
int NumDims, int Layout>
|
||||
struct TensorBlockCwiseBinaryIO {
|
||||
typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
|
||||
Layout>::Dimensions Dimensions;
|
||||
|
||||
struct BlockIteratorState {
|
||||
StorageIndex output_stride, output_span;
|
||||
StorageIndex left_stride, left_span;
|
||||
StorageIndex right_stride, right_span;
|
||||
StorageIndex size, count;
|
||||
};
|
||||
|
||||
template <typename LeftScalar, typename RightScalar>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||
const BinaryFunctor& functor, const Dimensions& block_sizes,
|
||||
const Dimensions& block_strides, OutputScalar* output_data,
|
||||
const array<StorageIndex, NumDims>& left_strides,
|
||||
const LeftScalar* left_data,
|
||||
const array<StorageIndex, NumDims>& right_strides,
|
||||
const RightScalar* right_data) {
|
||||
// Find the innermost dimension whose size is not 1. This is the effective
|
||||
// inner dim. If all dimensions are of size 1, fallback to using the actual
|
||||
// innermost dim to avoid out-of-bound access.
|
||||
int num_size_one_inner_dims = 0;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
if (block_sizes[dim] != 1) {
|
||||
num_size_one_inner_dims = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Calculate strides and dimensions.
|
||||
const int inner_dim =
|
||||
NumDims == 0 ? 1
|
||||
: cond<Layout>()(num_size_one_inner_dims,
|
||||
NumDims - num_size_one_inner_dims - 1);
|
||||
StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
|
||||
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
// Merge multiple inner dims into one for larger inner dim size (i.e.
|
||||
// fewer calls to TensorBlockCwiseBinaryOp::Run()).
|
||||
if (inner_dim_size == block_strides[dim] &&
|
||||
block_strides[dim] == left_strides[dim] &&
|
||||
block_strides[dim] == right_strides[dim]) {
|
||||
inner_dim_size *= block_sizes[dim];
|
||||
++num_size_one_inner_dims;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
StorageIndex output_index = 0, left_index = 0, right_index = 0;
|
||||
const StorageIndex output_stride =
|
||||
NumDims == 0 ? 1 : block_strides[inner_dim];
|
||||
const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
|
||||
const StorageIndex right_stride =
|
||||
NumDims == 0 ? 1 : right_strides[inner_dim];
|
||||
|
||||
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
|
||||
array<BlockIteratorState, at_least_1_dim> block_iter_state;
|
||||
|
||||
// Initialize block iterator state. Squeeze away any dimension of size 1.
|
||||
int num_squeezed_dims = 0;
|
||||
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
|
||||
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
|
||||
const StorageIndex size = block_sizes[dim];
|
||||
if (size == 1) {
|
||||
continue;
|
||||
}
|
||||
auto& state = block_iter_state[num_squeezed_dims];
|
||||
state.output_stride = block_strides[dim];
|
||||
state.left_stride = left_strides[dim];
|
||||
state.right_stride = right_strides[dim];
|
||||
state.size = size;
|
||||
state.output_span = state.output_stride * (size - 1);
|
||||
state.left_span = state.left_stride * (size - 1);
|
||||
state.right_span = state.right_stride * (size - 1);
|
||||
state.count = 0;
|
||||
++num_squeezed_dims;
|
||||
}
|
||||
|
||||
// Compute cwise binary op.
|
||||
const StorageIndex block_total_size =
|
||||
NumDims == 0 ? 1 : block_sizes.TotalSize();
|
||||
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
|
||||
TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
|
||||
output_stride, output_data, left_index,
|
||||
left_stride, left_data, right_index,
|
||||
right_stride, right_data);
|
||||
// Update index.
|
||||
for (int j = 0; j < num_squeezed_dims; ++j) {
|
||||
auto& state = block_iter_state[j];
|
||||
if (++state.count < state.size) {
|
||||
output_index += state.output_stride;
|
||||
left_index += state.left_stride;
|
||||
right_index += state.right_stride;
|
||||
break;
|
||||
}
|
||||
state.count = 0;
|
||||
output_index -= state.output_span;
|
||||
left_index -= state.left_span;
|
||||
right_index -= state.right_span;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \class TensorBlockView
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Read-only view into a block of data.
|
||||
*
|
||||
* This class provides read-only access to a block of data in impl. It may need
|
||||
* to allocate space for holding the intermediate result.
|
||||
*
|
||||
*/
|
||||
template <class ArgType, class Device>
|
||||
struct TensorBlockView {
|
||||
typedef TensorEvaluator<ArgType, Device> Impl;
|
||||
typedef typename Impl::Index StorageIndex;
|
||||
typedef typename remove_const<typename Impl::Scalar>::type Scalar;
|
||||
static const int NumDims = array_size<typename Impl::Dimensions>::value;
|
||||
typedef DSizes<StorageIndex, NumDims> Dimensions;
|
||||
|
||||
// Constructs a TensorBlockView for `impl`. `block` is only used for
// specifying the start offset, shape, and strides of the block.
|
||||
template <typename OtherTensorBlock>
|
||||
TensorBlockView(const Device& device,
|
||||
const TensorEvaluator<ArgType, Device>& impl,
|
||||
const OtherTensorBlock& block)
|
||||
: m_device(device),
|
||||
m_block_sizes(block.block_sizes()),
|
||||
m_data(NULL),
|
||||
m_allocated_data(NULL) {
|
||||
if (Impl::RawAccess && impl.data() != NULL) {
|
||||
m_data = impl.data() + block.first_coeff_index();
|
||||
m_block_strides = block.tensor_strides();
|
||||
} else {
|
||||
// Actually make a copy.
|
||||
|
||||
// TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
// Consider allowing ops to request additional temporary block memory in
// TensorOpResourceRequirements.
|
||||
m_allocated_data = static_cast<Scalar*>(
|
||||
m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
|
||||
m_data = m_allocated_data;
|
||||
if (NumDims > 0) {
|
||||
if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
|
||||
m_block_strides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
|
||||
}
|
||||
} else {
|
||||
m_block_strides[NumDims - 1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
|
||||
block.first_coeff_index(), m_block_sizes, m_block_strides,
|
||||
block.tensor_strides(), m_allocated_data);
|
||||
impl.block(&input_block);
|
||||
}
|
||||
}
|
||||
|
||||
~TensorBlockView() {
|
||||
if (m_allocated_data != NULL) {
|
||||
m_device.deallocate(m_allocated_data);
|
||||
}
|
||||
}
|
||||
|
||||
const Dimensions& block_sizes() const { return m_block_sizes; }
|
||||
const Dimensions& block_strides() const { return m_block_strides; }
|
||||
const Scalar* data() const { return m_data; }
|
||||
|
||||
private:
|
||||
const Device& m_device;
|
||||
Dimensions m_block_sizes, m_block_strides;
|
||||
const Scalar* m_data; // Not owned.
|
||||
Scalar* m_allocated_data; // Owned.
|
||||
};
|
||||
|
||||
/**
|
||||
* \class TensorBlockMapper
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
@ -90,21 +624,21 @@ class TensorBlock {
|
||||
*
|
||||
* This class is responsible for iterating over the blocks of a tensor.
|
||||
*/
|
||||
template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
|
||||
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
|
||||
class TensorBlockMapper {
|
||||
public:
|
||||
typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
|
||||
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef DSizes<StorageIndex, NumDims> Dimensions;
|
||||
|
||||
TensorBlockMapper(const Dimensions& dims,
|
||||
const TensorBlockShapeType block_shape,
|
||||
size_t min_target_size)
|
||||
Index min_target_size)
|
||||
: m_dimensions(dims),
|
||||
m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
|
||||
// Calculate block counts by dimension and total block count.
|
||||
DSizes<Index, NumDims> block_count;
|
||||
for (size_t i = 0; i < block_count.rank(); ++i) {
|
||||
DSizes<StorageIndex, NumDims> block_count;
|
||||
for (Index i = 0; i < block_count.rank(); ++i) {
|
||||
block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
|
||||
}
|
||||
m_total_block_count = array_prod(block_count);
|
||||
@ -130,15 +664,15 @@ class TensorBlockMapper {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
|
||||
GetBlockForIndex(Index block_index, Scalar* data) const {
|
||||
Index first_coeff_index = 0;
|
||||
DSizes<Index, NumDims> coords;
|
||||
DSizes<Index, NumDims> sizes;
|
||||
DSizes<Index, NumDims> strides;
|
||||
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
|
||||
StorageIndex first_coeff_index = 0;
|
||||
DSizes<StorageIndex, NumDims> coords;
|
||||
DSizes<StorageIndex, NumDims> sizes;
|
||||
DSizes<StorageIndex, NumDims> strides;
|
||||
if (NumDims > 0) {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = block_index / m_block_strides[i];
|
||||
const StorageIndex idx = block_index / m_block_strides[i];
|
||||
coords[i] = idx * m_block_dim_sizes[i];
|
||||
sizes[i] =
|
||||
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
|
||||
@ -156,7 +690,7 @@ class TensorBlockMapper {
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index idx = block_index / m_block_strides[i];
|
||||
const StorageIndex idx = block_index / m_block_strides[i];
|
||||
coords[i] = idx * m_block_dim_sizes[i];
|
||||
sizes[i] =
|
||||
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
|
||||
@ -181,23 +715,20 @@ class TensorBlockMapper {
|
||||
data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
|
||||
return m_total_block_count;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
|
||||
block_dims_total_size() const {
|
||||
return m_block_dim_sizes.TotalSize();
|
||||
}
|
||||
|
||||
private:
|
||||
static int InnerDimIndex(Index i) {
|
||||
return Layout == static_cast<int>(ColMajor) ? i : NumDims - i - 1;
|
||||
}
|
||||
|
||||
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
|
||||
const TensorBlockShapeType block_shape,
|
||||
size_t min_target_size) {
|
||||
min_target_size = numext::maxi<size_t>(1, min_target_size);
|
||||
Index min_target_size) {
|
||||
min_target_size = numext::maxi<Index>(1, min_target_size);
|
||||
|
||||
// If the tensor fully fits into the target size, we'll treat it as a single block.
|
||||
Dimensions block_dim_sizes = tensor_dims;
|
||||
@ -226,14 +757,14 @@ class TensorBlockMapper {
|
||||
dim_size_target, static_cast<size_t>(tensor_dims[i]));
|
||||
}
|
||||
// Add any un-allocated coefficients to inner dimension(s).
|
||||
Index total_size = block_dim_sizes.TotalSize();
|
||||
StorageIndex total_size = block_dim_sizes.TotalSize();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const int dim = InnerDimIndex(i);
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
if (block_dim_sizes[dim] < tensor_dims[dim]) {
|
||||
const Index total_size_other_dims =
|
||||
const StorageIndex total_size_other_dims =
|
||||
total_size / block_dim_sizes[dim];
|
||||
const Index alloc_avail =
|
||||
divup<Index>(min_target_size, total_size_other_dims);
|
||||
const StorageIndex alloc_avail =
|
||||
divup<StorageIndex>(min_target_size, total_size_other_dims);
|
||||
if (alloc_avail == block_dim_sizes[dim]) {
|
||||
// Insufficient excess coefficients to allocate.
|
||||
break;
|
||||
@ -243,14 +774,14 @@ class TensorBlockMapper {
|
||||
}
|
||||
}
|
||||
} else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
|
||||
Index coeff_to_allocate = min_target_size;
|
||||
StorageIndex coeff_to_allocate = min_target_size;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
const int dim = InnerDimIndex(i);
|
||||
const int dim = cond<Layout>()(i, NumDims - i - 1);
|
||||
block_dim_sizes[dim] =
|
||||
numext::mini(coeff_to_allocate, tensor_dims[dim]);
|
||||
coeff_to_allocate =
|
||||
divup(coeff_to_allocate,
|
||||
numext::maxi(static_cast<Index>(1), block_dim_sizes[dim]));
|
||||
coeff_to_allocate = divup(
|
||||
coeff_to_allocate,
|
||||
numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
|
||||
}
|
||||
eigen_assert(coeff_to_allocate == 1);
|
||||
} else {
|
||||
@ -269,7 +800,7 @@ class TensorBlockMapper {
|
||||
Dimensions m_block_dim_sizes;
|
||||
Dimensions m_block_strides;
|
||||
Dimensions m_tensor_strides;
|
||||
Index m_total_block_count;
|
||||
StorageIndex m_total_block_count;
|
||||
};
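A brief hypothetical sketch (illustrative only; the tensor size and target block size are invented) of how a TensorBlockMapper partitions a tensor and how its blocks can be visited:

    typedef Eigen::internal::TensorBlockMapper<float, Eigen::Index, 2,
                                               Eigen::ColMajor> BlockMapper;

    Eigen::DSizes<Eigen::Index, 2> dims(256, 256);
    BlockMapper mapper(dims,
                       Eigen::internal::TensorBlockShapeType::kSkewedInnerDims,
                       /*min_target_size=*/16 * 1024);

    std::vector<float> scratch(mapper.block_dims_total_size());
    for (Eigen::Index i = 0; i < mapper.total_block_count(); ++i) {
      auto block = mapper.GetBlockForIndex(i, scratch.data());
      // block.first_coeff_index(), block.block_sizes(), block.data(), ...
    }
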
|
||||
|
||||
/**
|
||||
@ -284,12 +815,12 @@ class TensorBlockMapper {
|
||||
* processed together.
|
||||
*
|
||||
*/
|
||||
template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
|
||||
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
|
||||
class TensorSliceBlockMapper {
|
||||
public:
|
||||
typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
|
||||
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef DSizes<StorageIndex, NumDims> Dimensions;
|
||||
|
||||
TensorSliceBlockMapper(const Dimensions& tensor_dims,
|
||||
const Dimensions& tensor_slice_offsets,
|
||||
@ -303,7 +834,7 @@ class TensorSliceBlockMapper {
|
||||
m_block_stride_order(block_stride_order),
|
||||
m_total_block_count(1) {
|
||||
// Calculate block counts by dimension and total block count.
|
||||
DSizes<Index, NumDims> block_count;
|
||||
DSizes<StorageIndex, NumDims> block_count;
|
||||
for (size_t i = 0; i < block_count.rank(); ++i) {
|
||||
block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
|
||||
}
|
||||
@ -330,11 +861,11 @@ class TensorSliceBlockMapper {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
|
||||
GetBlockForIndex(Index block_index, Scalar* data) const {
|
||||
Index first_coeff_index = 0;
|
||||
DSizes<Index, NumDims> coords;
|
||||
DSizes<Index, NumDims> sizes;
|
||||
DSizes<Index, NumDims> strides;
|
||||
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
|
||||
StorageIndex first_coeff_index = 0;
|
||||
DSizes<StorageIndex, NumDims> coords;
|
||||
DSizes<StorageIndex, NumDims> sizes;
|
||||
DSizes<StorageIndex, NumDims> strides;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx = block_index / m_block_strides[i];
|
||||
@ -352,16 +883,16 @@ class TensorSliceBlockMapper {
|
||||
m_block_dim_sizes[0]);
|
||||
first_coeff_index += coords[0] * m_tensor_strides[0];
|
||||
|
||||
Index prev_dim = m_block_stride_order[0];
|
||||
StorageIndex prev_dim = m_block_stride_order[0];
|
||||
strides[prev_dim] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
const Index curr_dim = m_block_stride_order[i];
|
||||
const StorageIndex curr_dim = m_block_stride_order[i];
|
||||
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
|
||||
prev_dim = curr_dim;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
|
||||
const Index idx = block_index / m_block_strides[i];
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const StorageIndex idx = block_index / m_block_strides[i];
|
||||
coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
|
||||
sizes[i] = numext::mini(
|
||||
m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
|
||||
@ -377,10 +908,10 @@ class TensorSliceBlockMapper {
|
||||
m_block_dim_sizes[NumDims - 1]);
|
||||
first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
|
||||
|
||||
Index prev_dim = m_block_stride_order[NumDims - 1];
|
||||
StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
|
||||
strides[prev_dim] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
const Index curr_dim = m_block_stride_order[i];
|
||||
const StorageIndex curr_dim = m_block_stride_order[i];
|
||||
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
|
||||
prev_dim = curr_dim;
|
||||
}
|
||||
@ -390,7 +921,7 @@ class TensorSliceBlockMapper {
|
||||
data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
|
||||
return m_total_block_count;
|
||||
}
|
||||
|
||||
@ -402,7 +933,7 @@ class TensorSliceBlockMapper {
|
||||
Dimensions m_block_dim_sizes;
|
||||
Dimensions m_block_stride_order;
|
||||
Dimensions m_block_strides;
|
||||
Index m_total_block_count;
|
||||
StorageIndex m_total_block_count;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
// slice offsets.
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
|
@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = true
|
||||
|
@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = true,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
};
|
||||
|
@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D(
|
||||
const float* __restrict kernel, const int numPlanes, const int numX,
|
||||
const int maxX, const int kernelSize, float* buffer) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
#else
|
||||
extern __shared__ float s[];
|
||||
#endif
|
||||
|
||||
|
||||
const int first_x = blockIdx.x * maxX;
|
||||
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
|
||||
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
|
||||
@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D(
|
||||
const int maxX, const int numY, const int maxY, const int kernelSizeX,
|
||||
const int kernelSizeY, float* buffer) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
#else
|
||||
extern __shared__ float s[];
|
||||
#endif
|
||||
@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D(
|
||||
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
|
||||
const size_t kernelSizeZ, float* buffer) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
#else
|
||||
extern __shared__ float s[];
|
||||
#endif
|
||||
@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> {
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::ptrdiff_t... Indices>
|
||||
EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
|
||||
for (int i = 0 ; i < NumDims; ++i) {
|
||||
(*this)[i] = a[i];
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||
EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
|
||||
for (int i = 0 ; i < NumDims; ++i) {
|
||||
(*this)[i] = a[i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
|
||||
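The new DSizes(const Sizes<...>&) constructor added in this hunk lets a compile-time Sizes be converted to a dynamic DSizes of the same rank; a tiny illustrative sketch:

    Eigen::Sizes<3, 4> static_sizes;
    Eigen::DSizes<Eigen::DenseIndex, 2> dynamic_sizes(static_sizes);
    // dynamic_sizes[0] == 3, dynamic_sizes[1] == 4
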
|
@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = true
|
||||
|
@ -41,11 +41,22 @@ struct TensorEvaluator
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
typedef typename internal::TensorBlockWriter<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
|
||||
{ }
|
||||
@ -113,6 +124,20 @@ struct TensorEvaluator
|
||||
internal::unpacket_traits<PacketReturnType>::size);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
|
||||
const TensorBlock& block) {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockWriter::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
|
||||
|
||||
/// required by sycl in order to construct sycl buffer from raw pointer
|
||||
@ -167,11 +192,19 @@ struct TensorEvaluator<const Derived, Device>
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
Layout = Derived::Layout,
|
||||
CoordAccess = NumCoords > 0,
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
||||
const Derived& derived() const { return m_impl; }
|
||||
|
||||
@ -219,6 +252,14 @@ struct TensorEvaluator<const Derived, Device>
|
||||
internal::unpacket_traits<PacketReturnType>::size);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>* resources) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
|
||||
|
||||
/// added for sycl in order to construct the buffer from the sycl device
|
||||
@ -244,6 +285,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -308,7 +350,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -375,16 +419,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
|
||||
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
|
||||
TensorEvaluator<RightArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<BinaryOp>::PacketAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccess,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_functor(op.functor()),
|
||||
: m_device(device),
|
||||
m_functor(op.functor()),
|
||||
m_leftImpl(op.lhsExpression(), device),
|
||||
m_rightImpl(op.rhsExpression(), device)
|
||||
{
|
||||
@ -399,6 +448,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
static const int NumDims = internal::array_size<
|
||||
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
|
||||
|
||||
typedef internal::TensorBlock<
|
||||
typename internal::remove_const<Scalar>::type, Index, NumDims,
|
||||
TensorEvaluator<LeftArgType, Device>::Layout>
|
||||
TensorBlock;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// TODO: use right impl instead if right impl dimensions are known at compile time.
|
||||
@ -433,6 +490,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>* resources) const {
|
||||
m_leftImpl.getResourceRequirements(resources);
|
||||
m_rightImpl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) {
|
||||
output_block->data()[0] = coeff(0);
|
||||
return;
|
||||
}
|
||||
internal::TensorBlockView<LeftArgType, Device> left_block(
|
||||
m_device, m_leftImpl, *output_block);
|
||||
internal::TensorBlockView<RightArgType, Device> right_block(
|
||||
m_device, m_rightImpl, *output_block);
|
||||
internal::TensorBlockCwiseBinaryIO<
|
||||
BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
|
||||
Layout>::Run(m_functor, output_block->block_sizes(),
|
||||
output_block->block_strides(), output_block->data(),
|
||||
left_block.block_strides(), left_block.data(),
|
||||
right_block.block_strides(), right_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
|
||||
@ -442,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
BinaryOp functor() const { return m_functor; }
|
||||
|
||||
private:
|
||||
const Device& m_device;
|
||||
const BinaryOp m_functor;
|
||||
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
||||
@ -458,6 +540,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
|
||||
internal::functor_traits<TernaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -562,6 +645,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
||||
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
|
||||
internal::packet_traits<Scalar>::HasBlend,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<IfArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@@ -12,31 +12,40 @@

namespace Eigen {

/** \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*/
/**
* \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tileable can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
|
||||
|
||||
// Default strategy: the expression is evaluated with a single cpu thread.
|
||||
template<typename Expression, typename Device, bool Vectorizable>
|
||||
class TensorExecutor
|
||||
{
|
||||
/**
|
||||
* Default strategy: the expression is evaluated sequentially with a single cpu
|
||||
* thread, without vectorization and block evaluation.
|
||||
*/
|
||||
template <typename Expression, typename Device, bool Vectorizable,
|
||||
bool Tileable>
|
||||
class TensorExecutor {
|
||||
public:
|
||||
typedef typename Expression::Index Index;
|
||||
using StorageIndex = typename Expression::Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline void run(const Expression& expr, const Device& device = Device())
|
||||
{
|
||||
static inline void run(const Expression& expr,
|
||||
const Device& device = Device()) {
|
||||
TensorEvaluator<Expression, Device> evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign)
|
||||
{
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
for (Index i = 0; i < size; ++i) {
|
||||
if (needs_assign) {
|
||||
const StorageIndex size = array_prod(evaluator.dimensions());
|
||||
for (StorageIndex i = 0; i < size; ++i) {
|
||||
evaluator.evalScalar(i);
|
||||
}
|
||||
}
|
||||
@ -44,35 +53,40 @@ class TensorExecutor
|
||||
}
|
||||
};
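
A minimal usage sketch (not part of the patch, assuming only the public Tensor API of this module): assigning a tensor expression is what instantiates TensorExecutor for the chosen device and calls run() behind the scenes.

// Illustrative only: expression assignment dispatches through the executor.
#include <unsupported/Eigen/CXX11/Tensor>
int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  // Evaluated via internal::TensorExecutor<...> on the default single-thread device.
  b = a * 2.0f;
  return 0;
}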


template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
/**
 * Process all the data with a single cpu thread, using vectorized instructions.
 */
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
                     /*Tileable*/ false> {
 public:
  typedef typename Expression::Index Index;
  using StorageIndex = typename Expression::Index;

  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
  static inline void run(const Expression& expr,
                         const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give compiler a strong possibility to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive compiler should not
      // unroll the loop at the expense of inlining.
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
@ -80,41 +94,107 @@ class TensorExecutor<Expression, DefaultDevice, true>
  }
};
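
A short worked example (not part of the patch) of how the vectorized path splits the index range into an unrolled region, a packet-wise remainder, and a scalar tail, using the same formulas as above with a hypothetical size of 1000 and PacketSize of 8.

// Illustrative only: the three-way range split computed with plain integers.
#include <cstdio>
int main() {
  const long size = 1000, PacketSize = 8;
  const long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;  // 992
  const long VectorizedSize = (size / PacketSize) * PacketSize;          // 1000
  std::printf("unrolled: [0, %ld), vectorized: [%ld, %ld), scalar: [%ld, %ld)\n",
              UnrolledSize, UnrolledSize, VectorizedSize, VectorizedSize, size);
  return 0;
}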

/**
 * Process all the data with a single cpu thread, using blocks of data. By
 * sizing a block to fit L1 cache we get better cache performance.
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tileable*/ true> {
 public:
  using Scalar = typename traits<Expression>::Scalar;
  using ScalarNoConst = typename remove_const<Scalar>::type;

  using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
  using StorageIndex = typename traits<Expression>::Index;

// Multicore strategy: the index space is partitioned and each partition is executed on a single core
  static const int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr,
                         const DefaultDevice& device = DefaultDevice()) {
    using TensorBlock =
        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
    using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
                                                NumDims, Evaluator::Layout>;

    Evaluator evaluator(expr, device);
    Index total_size = array_prod(evaluator.dimensions());
    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);

    if (total_size < cache_size) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
                               /*Tileable*/ false>::run(expr, device);
      return;
    }

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Size tensor blocks to fit in cache (or requested target block size).
      Index block_total_size = numext::mini(cache_size, total_size);
      TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
      // Query expression tree for desired block size/shape.
      std::vector<TensorOpResourceRequirements> resources;
      evaluator.getResourceRequirements(&resources);
      MergeResourceRequirements(resources, &block_shape, &block_total_size);

      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
                                     block_total_size);
      block_total_size = block_mapper.block_dims_total_size();

      Scalar* data = static_cast<Scalar*>(
          device.allocate(block_total_size * sizeof(Scalar)));

      const StorageIndex total_block_count = block_mapper.total_block_count();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
        evaluator.evalBlock(&block);
      }
      device.deallocate(data);
    }
    evaluator.cleanup();
  }
};
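
A sketch of the small-tensor fallback check above (not part of the patch; the 32KB L1 size is a hypothetical stand-in for device.firstLevelCacheSize()): tensors with fewer coefficients than fit in L1 skip the tiled path entirely.

// Illustrative only: when the whole tensor fits in L1, tiling is skipped.
#include <cstddef>
#include <cstdio>
int main() {
  const std::size_t l1_bytes = 32 * 1024;                  // hypothetical L1 size
  const std::size_t cache_size = l1_bytes / sizeof(float); // 8192 coefficients
  const std::size_t total_size = 64 * 64;                  // 4096 coefficients
  std::printf("use tiling: %s\n",
              total_size < cache_size ? "no (fall back to non-tiled executor)" : "yes");
  return 0;
}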

/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.
 */
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
  static void run(Evaluator* evaluator_in, const StorageIndex first,
                  const StorageIndex last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
    for (StorageIndex i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
  static void run(Evaluator* evaluator_in, const StorageIndex first,
                  const StorageIndex last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    StorageIndex i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      StorageIndex last_chunk_offset = last - 4 * PacketSize;
      // Give compiler a strong possibility to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive compiler should not
      // unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
@ -128,7 +208,7 @@ struct EvalRange<Evaluator, Index, true> {
    }
  }

  static Index alignBlockSize(Index size) {
  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@ -138,106 +218,165 @@ struct EvalRange<Evaluator, Index, true> {
  }
};
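
A worked example (not part of the patch) of the rounding that alignBlockSize performs above, for a hypothetical PacketSize of 8: sizes at or above 16 packets are rounded up to the next multiple of 4 * PacketSize so the unrolled loop never overruns its range.

// Illustrative only: the alignment arithmetic for PacketSize = 8, size = 1000.
#include <cstdio>
int main() {
  const long PacketSize = 8, size = 1000;                               // 1000 >= 16 * 8
  const long aligned = (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
  std::printf("%ld -> %ld (a multiple of %ld)\n", size, aligned, 4 * PacketSize);  // 1000 -> 1024
  return 0;
}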

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;
  using StorageIndex = typename Expression::Index;

        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
  static inline void run(const Expression& expr,
                         const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex PacketSize =
          Vectorizable
              ? unpacket_traits<typename Evaluator::PacketReturnType>::size
              : 1;
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex first, StorageIndex last) {
                           EvalRange::run(&evaluator, first, last);
                         });
    }
    evaluator.cleanup();
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
 public:
  using Scalar = typename traits<Expression>::Scalar;
  using ScalarNoConst = typename remove_const<Scalar>::type;

  using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
  using StorageIndex = typename traits<Expression>::Index;

  static const int NumDims = traits<Expression>::NumDimensions;

  static inline void run(const Expression& expr,
                         const ThreadPoolDevice& device) {
    using TensorBlock =
        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
    using TensorBlockMapper =
        TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;

    Evaluator evaluator(expr, device);
    StorageIndex total_size = array_prod(evaluator.dimensions());
    StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
    if (total_size < cache_size) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                               false>::run(expr, device);
      evaluator.cleanup();
      return;
    }

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
      Index block_total_size = 0;
      // Query expression tree for desired block size/shape.
      std::vector<internal::TensorOpResourceRequirements> resources;
      evaluator.getResourceRequirements(&resources);
      MergeResourceRequirements(resources, &block_shape, &block_total_size);
      int num_threads = device.numThreads();

      // Estimate minimum block size based on cost.
      TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
      double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
      size_t block_size = static_cast<size_t>(1.0 / taskSize);
      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
                                     block_size);
      block_size = block_mapper.block_dims_total_size();
      const size_t aligned_blocksize =
          EIGEN_MAX_ALIGN_BYTES *
          divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
      void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
      device.parallelFor(
          block_mapper.total_block_count(), cost * block_size,
          [=, &device, &evaluator, &block_mapper](StorageIndex first,
                                                  StorageIndex last) {
            // currentThreadId() returns -1 if called from a thread not in the
            // thread pool, such as the main thread dispatching Eigen
            // expressions.
            const int thread_idx = device.currentThreadId();
            eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
            Scalar* thread_buf = reinterpret_cast<Scalar*>(
                static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
            for (StorageIndex i = first; i < last; ++i) {
              auto block = block_mapper.GetBlockForIndex(i, thread_buf);
              evaluator.evalBlock(&block);
            }
          });
      device.deallocate(buf);
    }
    evaluator.cleanup();
  }
};
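
A sketch (not part of the patch; the 4096-byte block size is hypothetical) of how each worker picks its scratch region inside the single (num_threads + 1) * aligned_blocksize allocation above: slot 0 belongs to callers outside the pool (thread_idx == -1), slots 1..num_threads to the pool threads.

// Illustrative only: per-thread scratch-buffer offsets in the shared allocation.
#include <cstddef>
#include <cstdio>
#include <initializer_list>
int main() {
  const std::size_t aligned_blocksize = 4096;  // hypothetical, after EIGEN_MAX_ALIGN_BYTES rounding
  for (int thread_idx : {-1, 0, 1, 2}) {
    std::printf("thread %2d -> byte offset %zu\n", thread_idx,
                aligned_blocksize * static_cast<std::size_t>(thread_idx + 1));
  }
  return 0;
}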

#endif  // EIGEN_USE_THREADS


// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
 public:
  typedef typename Expression::Index Index;
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};


#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename Index, bool Vectorizable>
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
    for (StorageIndex i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;
  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (Index i = first * PacketSize; i < vectorized_size;
    for (StorageIndex i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
    for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
template <typename Expression, bool Vectorizable, bool Tileable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@ -246,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
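
A worked example (not part of the patch; the device numbers are hypothetical) of the launch-configuration arithmetic used above: the grid is capped by the device's occupancy bound and never drops below one block.

// Illustrative only: num_blocks for a hypothetical device and tensor size.
#include <algorithm>
#include <cstdio>
int main() {
  const int block_size = 1024;                    // stands in for maxGpuThreadsPerBlock()
  const int max_blocks = 20 * 2048 / block_size;  // 40: multiprocessors * threads per MP / block_size
  const long size = 100000;
  const int wanted = static_cast<int>((size + block_size - 1) / block_size);  // divup -> 98
  const int num_blocks = std::max(std::min(max_blocks, wanted), 1);           // 40
  std::printf("num_blocks = %d\n", num_blocks);
  return 0;
}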
@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,

    enum {
      IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
      BlockAccess = false,
      Layout = Options_ & RowMajor ? RowMajor : ColMajor,
      CoordAccess = true,
      RawAccess = true

@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
  enum {
    IsAligned = true,
    PacketAccess = (PacketSize > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = true
  };

@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
      TensorEvaluator<Expression, GpuDevice>::IsAligned;
};

template <typename Device, typename Expression>
struct IsTileable {
  static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
};

template <typename Expression, typename Device,
          bool Vectorizable = IsVectorizable<Device, Expression>::value>
          bool Vectorizable = IsVectorizable<Device, Expression>::value,
          bool Tileable = IsTileable<Device, Expression>::value>
class TensorExecutor;

} // end namespace internal
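
A stand-alone sketch (not Eigen's real traits; the names below are simplified stand-ins) of the dispatch mechanism this declaration sets up: a defaulted bool template parameter computed from a trait selects either the coefficient-wise or the tiled executor specialization.

// Illustrative only: trait-defaulted template parameter dispatch.
#include <cstdio>

template <typename Expr>
struct IsTileableSketch { static const bool value = Expr::BlockAccess; };

template <typename Expr, bool Tileable = IsTileableSketch<Expr>::value>
struct ExecutorSketch {
  static void run() { std::puts("coefficient-wise path"); }
};

template <typename Expr>
struct ExecutorSketch<Expr, /*Tileable*/ true> {
  static void run() { std::puts("tiled (block) path"); }
};

struct PlainExpr { static const bool BlockAccess = false; };
struct BlockFriendlyExpr { static const bool BlockAccess = true; };

int main() {
  ExecutorSketch<PlainExpr>::run();          // coefficient-wise path
  ExecutorSketch<BlockFriendlyExpr>::run();  // tiled (block) path
  return 0;
}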
@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
|
@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
CoordAccess = false // to be implemented
|
||||
};
|
||||
|
@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
|
||||
@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
// slice offsets and sizes.
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
|
||||
@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
static const int NumDims = internal::array_size<Strides>::value;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
@ -858,7 +861,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
return numext::maxi(min, numext::mini(max,value));
|
||||
@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
|
@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = true,
|
||||
RawAccess = false
|
||||
|
@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
|
@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = PlainObjectType::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
Layout = TensorRef<Derived>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
|
@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
|
@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
|
@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false,
|
||||
RawAccess = false
|
||||
@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
m_reduced[i] = false;
|
||||
}
|
||||
|
||||
|
||||
const Dims& op_dims = op.dims();
|
||||
for (int i = 0; i < NumReducedDims; ++i) {
|
||||
eigen_assert(op_dims[i] >= 0);
|
||||
@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
|
||||
eigen_assert(num_distinct_reduce_dims == NumReducedDims);
|
||||
|
||||
// Compute the dimensions of the result.
|
||||
// Compute the dimensions of the result.
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
|
||||
int output_index = 0;
|
||||
@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
result += m_impl.coeff(cur_index);
|
||||
cur_index += index_stride;
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11)
|
||||
ei_add_test(cxx11_tensor_striding)
|
||||
ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||
ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
|
||||
ei_add_test(cxx11_tensor_ref)
|
||||
ei_add_test(cxx11_tensor_random)
|
||||
ei_add_test(cxx11_tensor_generator)
|
||||
|
@ -19,11 +19,75 @@ using Eigen::Index;
|
||||
using Eigen::RowMajor;
|
||||
using Eigen::ColMajor;
|
||||
|
||||
using internal::TensorBlockShapeType;
|
||||
|
||||
template<typename T>
|
||||
static const T& choose(int layout, const T& col, const T& row) {
|
||||
return layout == ColMajor ? col : row;
|
||||
}
|
||||
|
||||
static const TensorBlockShapeType RandomShape() {
|
||||
return internal::random<bool>()
|
||||
? internal::TensorBlockShapeType::kUniformAllDims
|
||||
: internal::TensorBlockShapeType::kSkewedInnerDims;
|
||||
}
|
||||
|
||||
template <int NumDims>
|
||||
static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
|
||||
return internal::random<int>(1, dims.TotalSize());
|
||||
}
|
||||
|
||||
template <int NumDims>
|
||||
static DSizes<Index, NumDims> RandomDims() {
|
||||
array<Index, NumDims> dims;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
dims[i] = internal::random<int>(1, 20);
|
||||
}
|
||||
return DSizes<Index, NumDims>(dims);
|
||||
};
|
||||
|
||||
/** Dummy data type to test TensorBlock copy ops. */
|
||||
struct Data {
|
||||
Data() : Data(0) {}
|
||||
explicit Data(int v) { value = v; }
|
||||
int value;
|
||||
};
|
||||
|
||||
bool operator==(const Data& lhs, const Data& rhs) {
|
||||
return lhs.value == rhs.value;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const Data& d) {
|
||||
os << "Data: value=" << d.value;
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static T* GenerateRandomData(const Index& size) {
|
||||
T* data = new T[size];
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = internal::random<T>();
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
template <>
|
||||
Data* GenerateRandomData(const Index& size) {
|
||||
Data* data = new Data[size];
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = Data(internal::random<int>(1, 100));
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
template <int NumDims>
|
||||
static void Debug(DSizes<Index, NumDims> dims) {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
std::cout << dims[i] << "; ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_mapper_sanity()
|
||||
{
|
||||
@ -74,10 +138,8 @@ static void test_block_mapper_sanity()
|
||||
// index in the visited set. Verify that every coeff accessed only once.
|
||||
template <typename T, int Layout, int NumDims>
|
||||
static void UpdateCoeffSet(
|
||||
const internal::TensorBlock<T, Index, 4, Layout>& block,
|
||||
Index first_coeff_index,
|
||||
int dim_index,
|
||||
std::set<Index>* visited_coeffs) {
|
||||
const internal::TensorBlock<T, Index, NumDims, Layout>& block,
|
||||
Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
|
||||
const DSizes<Index, NumDims> block_sizes = block.block_sizes();
|
||||
const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
|
||||
|
||||
@ -94,89 +156,840 @@ static void UpdateCoeffSet(
|
||||
}
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_mapper_maps_every_element()
|
||||
{
|
||||
using T = int;
|
||||
using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
|
||||
using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 4, Layout>;
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_block_mapper_maps_every_element() {
|
||||
using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
|
||||
using TensorBlockMapper =
|
||||
internal::TensorBlockMapper<T, Index, NumDims, Layout>;
|
||||
|
||||
DSizes<Index, 4> dims(5, 7, 11, 17);
|
||||
|
||||
auto total_coeffs = static_cast<int>(dims.TotalSize());
|
||||
DSizes<Index, NumDims> dims = RandomDims<NumDims>();
|
||||
|
||||
// Keep track of elements indices available via block access.
|
||||
std::set<Index> coeff_set;
|
||||
|
||||
// Try different combinations of block types and sizes.
|
||||
auto block_shape_type =
|
||||
internal::random<bool>()
|
||||
? internal::TensorBlockShapeType::kUniformAllDims
|
||||
: internal::TensorBlockShapeType::kSkewedInnerDims;
|
||||
auto block_target_size = internal::random<int>(1, total_coeffs);
|
||||
TensorBlockMapper block_mapper(dims, block_shape_type, block_target_size);
|
||||
TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims));
|
||||
|
||||
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
|
||||
UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
|
||||
choose(Layout, 3, 0), &coeff_set);
|
||||
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
|
||||
choose(Layout, NumDims - 1, 0),
|
||||
&coeff_set);
|
||||
}
|
||||
|
||||
// Verify that every coefficient in the original Tensor is accessible through
|
||||
// TensorBlock only once.
|
||||
Index total_coeffs = dims.TotalSize();
|
||||
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
|
||||
VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0));
|
||||
VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1));
|
||||
VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
|
||||
VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_slice_block_mapper_maps_every_element()
|
||||
{
|
||||
using T = int;
|
||||
using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_slice_block_mapper_maps_every_element() {
|
||||
using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
|
||||
using TensorSliceBlockMapper =
|
||||
internal::TensorSliceBlockMapper<T, Index, 4, Layout>;
|
||||
internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;
|
||||
|
||||
DSizes<Index, 4> tensor_dims(5,7,11,17);
|
||||
DSizes<Index, 4> tensor_slice_offsets(1,3,5,7);
|
||||
DSizes<Index, 4> tensor_slice_extents(3,2,4,5);
|
||||
DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
|
||||
DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
|
||||
DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();
|
||||
|
||||
// Make sure that tensor offsets + extents do not overflow.
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
tensor_slice_offsets[i] =
|
||||
numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
|
||||
tensor_slice_extents[i] = numext::mini(
|
||||
tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
|
||||
}
|
||||
|
||||
// Keep track of elements indices available via block access.
|
||||
std::set<Index> coeff_set;
|
||||
|
||||
auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
|
||||
|
||||
// Try different combinations of block types and sizes.
|
||||
auto block_shape_type =
|
||||
internal::random<bool>()
|
||||
? internal::TensorBlockShapeType::kUniformAllDims
|
||||
: internal::TensorBlockShapeType::kSkewedInnerDims;
|
||||
auto block_target_size = internal::random<int>(1, total_coeffs);
|
||||
|
||||
// Pick a random dimension sizes for the tensor blocks.
|
||||
DSizes<Index, 4> block_sizes;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
DSizes<Index, NumDims> block_sizes;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
|
||||
}
|
||||
|
||||
TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
|
||||
tensor_slice_extents, block_sizes,
|
||||
DimensionList<Index, 4>());
|
||||
DimensionList<Index, NumDims>());
|
||||
|
||||
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
|
||||
UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
|
||||
choose(Layout, 3, 0), &coeff_set);
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
|
||||
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
|
||||
choose(Layout, NumDims - 1, 0),
|
||||
&coeff_set);
|
||||
}
|
||||
|
||||
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(cxx11_tensor_assign) {
|
||||
CALL_SUBTEST(test_block_mapper_sanity<ColMajor>());
|
||||
CALL_SUBTEST(test_block_mapper_sanity<RowMajor>());
|
||||
CALL_SUBTEST(test_block_mapper_maps_every_element<ColMajor>());
|
||||
CALL_SUBTEST(test_block_mapper_maps_every_element<RowMajor>());
|
||||
CALL_SUBTEST(test_slice_block_mapper_maps_every_element<ColMajor>());
|
||||
CALL_SUBTEST(test_slice_block_mapper_maps_every_element<RowMajor>());
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_block_io_copy_data_from_source_to_target() {
|
||||
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
|
||||
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
|
||||
TensorBlockMapper;
|
||||
|
||||
typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
|
||||
TensorBlockReader;
|
||||
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
|
||||
const auto input_tensor_size = input_tensor_dims.TotalSize();
|
||||
|
||||
T* input_data = GenerateRandomData<T>(input_tensor_size);
|
||||
T* output_data = new T[input_tensor_size];
|
||||
|
||||
TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
|
||||
RandomTargetSize(input_tensor_dims));
|
||||
T* block_data = new T[block_mapper.block_dims_total_size()];
|
||||
|
||||
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
|
||||
TensorBlockReader::Run(&block, input_data);
|
||||
TensorBlockWriter::Run(block, output_data);
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_tensor_size; ++i) {
|
||||
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
|
||||
}
|
||||
|
||||
delete[] input_data;
|
||||
delete[] output_data;
|
||||
delete[] block_data;
|
||||
}
|
||||
|
||||
template <int Layout, int NumDims>
|
||||
static int GetInputIndex(Index output_index,
|
||||
const array<Index, NumDims>& output_to_input_dim_map,
|
||||
const array<Index, NumDims>& input_strides,
|
||||
const array<Index, NumDims>& output_strides) {
|
||||
int input_index = 0;
|
||||
if (Layout == ColMajor) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const int idx = output_index / output_strides[i];
|
||||
input_index += idx * input_strides[output_to_input_dim_map[i]];
|
||||
output_index -= idx * output_strides[i];
|
||||
}
|
||||
return input_index +
|
||||
output_index * input_strides[output_to_input_dim_map[0]];
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const int idx = output_index / output_strides[i];
|
||||
input_index += idx * input_strides[output_to_input_dim_map[i]];
|
||||
output_index -= idx * output_strides[i];
|
||||
}
|
||||
return input_index +
|
||||
output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
|
||||
}
|
||||
}
|
||||
|
||||
template <int Layout, int NumDims>
|
||||
static array<Index, NumDims> ComputeStrides(
|
||||
const array<Index, NumDims>& sizes) {
|
||||
array<Index, NumDims> strides;
|
||||
if (Layout == ColMajor) {
|
||||
strides[0] = 1;
|
||||
for (int i = 1; i < NumDims; ++i) {
|
||||
strides[i] = strides[i - 1] * sizes[i - 1];
|
||||
}
|
||||
} else {
|
||||
strides[NumDims - 1] = 1;
|
||||
for (int i = NumDims - 2; i >= 0; --i) {
|
||||
strides[i] = strides[i + 1] * sizes[i + 1];
|
||||
}
|
||||
}
|
||||
return strides;
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_block_io_copy_using_reordered_dimensions() {
|
||||
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
|
||||
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
|
||||
TensorBlockMapper;
|
||||
|
||||
typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
|
||||
TensorBlockReader;
|
||||
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
|
||||
const auto input_tensor_size = input_tensor_dims.TotalSize();
|
||||
|
||||
// Create a random input tensor.
|
||||
T* input_data = GenerateRandomData<T>(input_tensor_size);
|
||||
|
||||
// Create a random dimension re-ordering/shuffle.
|
||||
std::vector<Index> shuffle;
|
||||
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
|
||||
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
|
||||
|
||||
DSizes<Index, NumDims> output_tensor_dims;
|
||||
array<Index, NumDims> input_to_output_dim_map;
|
||||
array<Index, NumDims> output_to_input_dim_map;
|
||||
for (Index i = 0; i < NumDims; ++i) {
|
||||
output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
|
||||
input_to_output_dim_map[i] = shuffle[i];
|
||||
output_to_input_dim_map[shuffle[i]] = i;
|
||||
}
|
||||
|
||||
// Random block shape and size.
|
||||
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
|
||||
RandomTargetSize(input_tensor_dims));
|
||||
|
||||
auto* block_data = new T[block_mapper.block_dims_total_size()];
|
||||
auto* output_data = new T[input_tensor_size];
|
||||
|
||||
array<Index, NumDims> input_tensor_strides =
|
||||
ComputeStrides<Layout, NumDims>(input_tensor_dims);
|
||||
array<Index, NumDims> output_tensor_strides =
|
||||
ComputeStrides<Layout, NumDims>(output_tensor_dims);
|
||||
|
||||
for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
|
||||
const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
|
||||
block.first_coeff_index(), output_to_input_dim_map,
|
||||
input_tensor_strides, output_tensor_strides);
|
||||
TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
|
||||
input_tensor_strides, input_data);
|
||||
TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
|
||||
input_tensor_strides, output_data);
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_tensor_size; ++i) {
|
||||
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
|
||||
}
|
||||
|
||||
delete[] input_data;
|
||||
delete[] block_data;
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_io_zero_stride()
|
||||
{
|
||||
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
|
||||
typedef internal::TensorBlockReader<float, Index, 5, Layout>
|
||||
TensorBlockReader;
|
||||
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
DSizes<Index, 5> rnd_dims = RandomDims<5>();
|
||||
|
||||
DSizes<Index, 5> input_tensor_dims = rnd_dims;
|
||||
input_tensor_dims[0] = 1;
|
||||
input_tensor_dims[2] = 1;
|
||||
input_tensor_dims[4] = 1;
|
||||
const auto input_tensor_size = input_tensor_dims.TotalSize();
|
||||
auto* input_data = GenerateRandomData<float>(input_tensor_size);
|
||||
|
||||
DSizes<Index, 5> output_tensor_dims = rnd_dims;
|
||||
|
||||
DSizes<Index, 5> input_tensor_strides(
|
||||
ComputeStrides<Layout, 5>(input_tensor_dims));
|
||||
DSizes<Index, 5> output_tensor_strides(
|
||||
ComputeStrides<Layout, 5>(output_tensor_dims));
|
||||
|
||||
DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
|
||||
input_tensor_strides_with_zeros[0] = 0;
|
||||
input_tensor_strides_with_zeros[2] = 0;
|
||||
input_tensor_strides_with_zeros[4] = 0;
|
||||
|
||||
// Verify that data was correctly read/written from/into the block.
|
||||
const auto verify_is_equal = [&](const float* output_data) {
|
||||
for (int i = 0; i < output_tensor_dims[0]; ++i) {
|
||||
for (int j = 0; j < output_tensor_dims[1]; ++j) {
|
||||
for (int k = 0; k < output_tensor_dims[2]; ++k) {
|
||||
for (int l = 0; l < output_tensor_dims[3]; ++l) {
|
||||
for (int m = 0; m < output_tensor_dims[4]; ++m) {
|
||||
const Index output_offset =
|
||||
i * output_tensor_strides[0] + j * output_tensor_strides[1] +
|
||||
k * output_tensor_strides[2] + l * output_tensor_strides[3] +
|
||||
m * output_tensor_strides[4];
|
||||
const Index input_offset =
|
||||
i % input_tensor_dims[0] * input_tensor_strides[0] +
|
||||
j % input_tensor_dims[1] * input_tensor_strides[1] +
|
||||
k % input_tensor_dims[2] * input_tensor_strides[2] +
|
||||
l % input_tensor_dims[3] * input_tensor_strides[3] +
|
||||
m % input_tensor_dims[4] * input_tensor_strides[4];
|
||||
VERIFY_IS_EQUAL(output_data[output_offset],
|
||||
input_data[input_offset]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
auto* output_data = new float[output_tensor_dims.TotalSize()];
|
||||
TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
|
||||
input_tensor_strides_with_zeros, output_data);
|
||||
TensorBlockReader::Run(&read_block, input_data);
|
||||
verify_is_equal(output_data);
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
{
|
||||
auto* output_data = new float[output_tensor_dims.TotalSize()];
|
||||
TensorBlock write_block(0, output_tensor_dims,
|
||||
input_tensor_strides_with_zeros,
|
||||
output_tensor_strides, input_data);
|
||||
TensorBlockWriter::Run(write_block, output_data);
|
||||
verify_is_equal(output_data);
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
delete[] input_data;
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_io_squeeze_ones() {
|
||||
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
|
||||
typedef internal::TensorBlockReader<float, Index, 5, Layout>
|
||||
TensorBlockReader;
|
||||
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
// Total size > 1.
|
||||
{
|
||||
DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
|
||||
const auto total_size = block_sizes.TotalSize();
|
||||
|
||||
// Create a random input tensor.
|
||||
auto* input_data = GenerateRandomData<float>(total_size);
|
||||
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
|
||||
|
||||
{
|
||||
auto* output_data = new float[block_sizes.TotalSize()];
|
||||
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
|
||||
TensorBlockReader::Run(&read_block, input_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
|
||||
}
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
{
|
||||
auto* output_data = new float[block_sizes.TotalSize()];
|
||||
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
|
||||
TensorBlockWriter::Run(write_block, output_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
|
||||
}
|
||||
delete[] output_data;
|
||||
}
|
||||
}
|
||||
|
||||
// Total size == 1.
|
||||
{
|
||||
DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
|
||||
const auto total_size = block_sizes.TotalSize();
|
||||
|
||||
// Create a random input tensor.
|
||||
auto* input_data = GenerateRandomData<float>(total_size);
|
||||
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
|
||||
|
||||
{
|
||||
auto* output_data = new float[block_sizes.TotalSize()];
|
||||
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
|
||||
TensorBlockReader::Run(&read_block, input_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
|
||||
}
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
{
|
||||
auto* output_data = new float[block_sizes.TotalSize()];
|
||||
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
|
||||
TensorBlockWriter::Run(write_block, output_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
|
||||
}
|
||||
delete[] output_data;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_block_cwise_binary_io_basic() {
|
||||
typedef internal::scalar_sum_op<T> BinaryFunctor;
|
||||
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
|
||||
Layout>
|
||||
TensorBlockCwiseBinaryIO;
|
||||
|
||||
DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
|
||||
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
|
||||
|
||||
const auto total_size = block_sizes.TotalSize();
|
||||
|
||||
// Create a random input tensors.
|
||||
T* left_data = GenerateRandomData<T>(total_size);
|
||||
T* right_data = GenerateRandomData<T>(total_size);
|
||||
|
||||
T* output_data = new T[total_size];
|
||||
BinaryFunctor functor;
|
||||
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
|
||||
strides, left_data, strides, right_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
|
||||
}
|
||||
|
||||
delete[] left_data;
|
||||
delete[] right_data;
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_cwise_binary_io_squeeze_ones() {
|
||||
typedef internal::scalar_sum_op<float> BinaryFunctor;
|
||||
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
|
||||
Layout>
|
||||
TensorBlockCwiseBinaryIO;
|
||||
|
||||
DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
|
||||
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
|
||||
|
||||
const auto total_size = block_sizes.TotalSize();
|
||||
|
||||
// Create a random input tensors.
|
||||
auto* left_data = GenerateRandomData<float>(total_size);
|
||||
auto* right_data = GenerateRandomData<float>(total_size);
|
||||
|
||||
auto* output_data = new float[total_size];
|
||||
BinaryFunctor functor;
|
||||
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
|
||||
strides, left_data, strides, right_data);
|
||||
for (int i = 0; i < total_size; ++i) {
|
||||
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
|
||||
}
|
||||
|
||||
delete[] left_data;
|
||||
delete[] right_data;
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static void test_block_cwise_binary_io_zero_strides() {
|
||||
typedef internal::scalar_sum_op<float> BinaryFunctor;
|
||||
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
|
||||
Layout>
|
||||
TensorBlockCwiseBinaryIO;
|
||||
|
||||
DSizes<Index, 5> rnd_dims = RandomDims<5>();
|
||||
|
||||
DSizes<Index, 5> left_sizes = rnd_dims;
|
||||
left_sizes[0] = 1;
|
||||
left_sizes[2] = 1;
|
||||
left_sizes[4] = 1;
|
||||
|
||||
DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
|
||||
left_strides[0] = 0;
|
||||
left_strides[2] = 0;
|
||||
left_strides[4] = 0;
|
||||
|
||||
DSizes<Index, 5> right_sizes = rnd_dims;
|
||||
right_sizes[1] = 0;
|
||||
right_sizes[3] = 0;
|
||||
|
||||
DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
|
||||
right_strides[1] = 0;
|
||||
right_strides[3] = 0;
|
||||
|
||||
// Generate random data.
|
||||
auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
|
||||
auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
|
||||
|
||||
DSizes<Index, 5> output_sizes = rnd_dims;
|
||||
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
|
||||
|
||||
const auto output_total_size = output_sizes.TotalSize();
|
||||
auto* output_data = new float[output_total_size];
|
||||
|
||||
BinaryFunctor functor;
|
||||
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
|
||||
output_data, left_strides, left_data,
|
||||
right_strides, right_data);
|
||||
for (int i = 0; i < rnd_dims[0]; ++i) {
|
||||
for (int j = 0; j < rnd_dims[1]; ++j) {
|
||||
for (int k = 0; k < rnd_dims[2]; ++k) {
|
||||
for (int l = 0; l < rnd_dims[3]; ++l) {
|
||||
for (int m = 0; m < rnd_dims[4]; ++m) {
|
||||
Index output_index = i * output_strides[0] + j * output_strides[1] +
|
||||
k * output_strides[2] + l * output_strides[3] +
|
||||
m * output_strides[4];
|
||||
Index left_index = i * left_strides[0] + j * left_strides[1] +
|
||||
k * left_strides[2] + l * left_strides[3] +
|
||||
m * left_strides[4];
|
||||
Index right_index = i * right_strides[0] + j * right_strides[1] +
|
||||
k * right_strides[2] + l * right_strides[3] +
|
||||
m * right_strides[4];
|
||||
VERIFY_IS_EQUAL(
|
||||
output_data[output_index],
|
||||
functor(left_data[left_index], right_data[right_index]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete[] left_data;
|
||||
delete[] right_data;
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
template <int Layout>
static void test_uniform_block_shape()
{
  using T = int;
  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;

  {
    // Test shape 'UniformAllDims' with uniform 'max_coeff_count'.
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    for (int i = 0; i < 5; ++i) {
      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
  // partially into the first inner-most dimension.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
    for (int i = 1; i < 5; ++i) {
      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
    for (int i = 3; i >= 0; --i) {
      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
  // fully into the first inner-most dimension.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    for (int i = 1; i < 5; ++i) {
      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    for (int i = 3; i >= 0; --i) {
      VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'UniformAllDims' with larger 'max_coeff_count' which spills
  // fully into the first few inner-most dimensions.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
    const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(7, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[4]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
    const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[0]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'UniformAllDims' with full allocation to all dims.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
    const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
    const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY_IS_EQUAL(9, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }
}

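// Descriptive note (added): with the 'kSkewedInnerDims' shape the block mapper
// spends the whole 'max_coeff_count' budget on the inner-most dimension first
// and only then grows the outer dimensions one at a time, as the expected
// block sizes below demonstrate.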
template <int Layout>
static void test_skewed_inner_dim_block_shape()
{
  using T = int;
  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;

  // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(10, block.block_sizes()[0]);
    for (int i = 1; i < 5; ++i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
    for (int i = 3; i >= 0; --i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    for (int i = 1; i < 5; ++i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    for (int i = 3; i >= 0; --i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
  // and partial allocation to second inner-dim.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    VERIFY_IS_EQUAL(3, block.block_sizes()[1]);
    for (int i = 2; i < 5; ++i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY_IS_EQUAL(15, block.block_sizes()[3]);
    for (int i = 2; i >= 0; --i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
  // and partial allocation to third inner-dim.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
    for (int i = 3; i < 5; ++i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
    for (int i = 1; i >= 0; --i) {
      VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
    }
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }

  // Test shape 'SkewedInnerDims' with full allocation to all dims.
  if (Layout == ColMajor) {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  } else {
    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
    const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
    TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
                                   max_coeff_count);
    TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
    VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
    VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
    VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
    VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
    VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
    VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
  }
}

template <int Layout>
static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
{
  using T = int;

  // Test blocking of tensors with zero dimensions:
  //  - we must not crash on asserts and divisions by zero
  //  - we must not return a block with zero dimensions
  //    (recipe for overflows/underflows, divisions by zero and NaNs later)
  //  - total block count must be zero
  {
    typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper;
    DSizes<Index, 1> dims(0);
    for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
      TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
      VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
      VERIFY(block_mapper.block_dims_total_size() >= 1);
    }
  }

  {
    typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper;
    for (int dim1 = 0; dim1 < 3; ++dim1) {
      for (int dim2 = 0; dim2 < 3; ++dim2) {
        DSizes<Index, 2> dims(dim1, dim2);
        for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
          TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
          if (dim1 * dim2 == 0) {
            VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
          }
          VERIFY(block_mapper.block_dims_total_size() >= 1);
        }
      }
    }
  }
}

#define TEST_LAYOUTS(NAME)      \
  CALL_SUBTEST(NAME<ColMajor>()); \
  CALL_SUBTEST(NAME<RowMajor>())

#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME)    \
  CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
  CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))

#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
  CALL_SUBTEST(NAME<ColMajor>(ARG));     \
  CALL_SUBTEST(NAME<RowMajor>(ARG))

EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
  TEST_LAYOUTS(test_block_mapper_sanity);
  TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
  TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
  TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
  TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
  TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
  TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
  TEST_LAYOUTS(test_block_io_zero_stride);
  TEST_LAYOUTS(test_block_io_squeeze_ones);
  TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
  TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
  TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
  TEST_LAYOUTS(test_uniform_block_shape);
  TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
}

#undef TEST_LAYOUTS
#undef TEST_LAYOUTS_WITH_ARG
@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() {
}


void test_cxx11_tensor_complex_cwise_ops()
EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
{
  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());

@ -177,7 +177,7 @@ static void test_cuda_product_reductions() {
}


void test_cxx11_tensor_complex()
EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());

87 unsupported/test/cxx11_tensor_executor.cpp Normal file
@ -0,0 +1,87 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_USE_THREADS

#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;

// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops that support tiled execution.

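// Descriptive note (added): the 'Vectorizable' and 'Tileable' template
// parameters select the packet-based and the block (tiled) evaluation paths of
// TensorExecutor respectively; every combination must match the plain scalar
// evaluation of the same expression.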
template <typename Device, bool Vectorizable, bool Tileable, int Layout>
static void test_execute_binary_expr(Device d) {
  // Pick a large enough tensor size to bypass small tensor block evaluation
  // optimization.
  int d0 = internal::random<int>(100, 200);
  int d1 = internal::random<int>(100, 200);
  int d2 = internal::random<int>(100, 200);

  static constexpr int Options = 0;
  using IndexType = int;

  Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
  Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
  Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);

  lhs.setRandom();
  rhs.setRandom();

  const auto expr = lhs + rhs;

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor =
      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;

  Executor::run(Assign(dst, expr), d);

  for (int i = 0; i < d0; ++i) {
    for (int j = 0; j < d1; ++j) {
      for (int k = 0; k < d2; ++k) {
        float sum = lhs(i, j, k) + rhs(i, j, k);
        VERIFY_IS_EQUAL(sum, dst(i, j, k));
      }
    }
  }
}

#define CALL_SUBTEST_COMBINATIONS(NAME)                                        \
  CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
  CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device)));  \
  CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device)));  \
  CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device)));   \
  CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
  CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device)));  \
  CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device)));  \
  CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device)));   \
  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device)));   \
  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device)));    \
  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device)));    \
  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device)));     \
  CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device)));   \
  CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device)));    \
  CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device)));    \
  CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))

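// Descriptive note (added): the macro above enumerates the full cross product
// {DefaultDevice, ThreadPoolDevice} x {scalar, vectorized} x {linear, tiled}
// x {ColMajor, RowMajor} for the test it is given.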
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;

  const auto num_threads = internal::random<int>(1, 24);
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
}

#undef CALL_SUBTEST_COMBINATIONS