From 60ae24ee1a6c16114de456d77fcfba6f5a1160ca Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 2 Oct 2019 12:44:06 -0700 Subject: [PATCH 1/2] Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect --- .../Eigen/CXX11/src/Tensor/TensorBlockV2.h | 275 +++++++++++++++--- .../CXX11/src/Tensor/TensorBroadcasting.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 55 +++- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 138 +++++---- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 11 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 95 +++++- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 254 +++++++++++++++- unsupported/test/cxx11_tensor_block_eval.cpp | 178 +++++++++++- unsupported/test/cxx11_tensor_executor.cpp | 15 +- 9 files changed, 862 insertions(+), 167 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index 25047b8e50..4d2145bf3e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -11,6 +11,11 @@ namespace Eigen { namespace internal { +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIOV2; + // -------------------------------------------------------------------------- // // Helper function to compute strides for densely stored buffer of given // dimensions. @@ -18,7 +23,7 @@ namespace internal { // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use // this function instead everywhere. template -EIGEN_STRONG_INLINE DSizes strides( +EIGEN_ALWAYS_INLINE DSizes strides( const DSizes& dimensions) { DSizes strides; if (NumDims == 0) return strides; @@ -40,6 +45,14 @@ EIGEN_STRONG_INLINE DSizes strides( return strides; } +#if EIGEN_HAS_CXX11 +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} +#endif + // -------------------------------------------------------------------------- // // TensorBlockDescriptor specifies a block offset within a tensor and the block // sizes along each of the tensor dimensions. @@ -155,6 +168,14 @@ class TensorBlockDescriptor { DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes); } + template + void AddDestinationBuffer( + Scalar* dst_base, const DSizes& dst_strides, + size_t total_dst_bytes) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes); + } + TensorBlockDescriptor& DropDestinationBuffer() { m_destination.m_data = NULL; return *this; @@ -333,10 +354,11 @@ class TensorMaterializedBlock { typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; #endif public: + typedef DSizes Dimensions; typedef TensorMap > XprType; TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const DSizes& dimensions) + const Dimensions& dimensions) : m_kind(kind), m_data(data), m_dimensions(dimensions), @@ -352,18 +374,84 @@ class TensorMaterializedBlock { // properly for TensorMap. const XprType& expr() const { return m_expr; } const Scalar* data() const { return m_data; } - void cleanup() {} + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // Creates a materialized block for the given descriptor from a memory buffer. 
+ template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( + const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(TensorBlockKind::kView, block_start, + desc.dimensions()); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + Scalar* block_buffer = static_cast(mem); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), + data, desc.offset()); + TensorBlockIODst dst(desc.dimensions(), + internal::strides(desc.dimensions()), + block_buffer); + + TensorBlockIO::Copy(dst, src); + + return TensorMaterializedBlock(TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); + } + } + private: TensorBlockKind m_kind; const Scalar* m_data; - DSizes m_dimensions; + Dimensions m_dimensions; XprType m_expr; }; // -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression that applies UnaryOp +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp // functor to the blocks produced by the underlying Tensor expression. template @@ -398,7 +486,7 @@ class TensorCwiseUnaryBlock { }; // -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression that applies BinaryOp +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp // functor to the blocks produced by the underlying Tensor expression. template @@ -446,6 +534,96 @@ class TensorCwiseBinaryBlock { BinaryOp m_functor; }; +// -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 
+ +template +class TensorUnaryExprBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + // -------------------------------------------------------------------------- // // StridedLinearBufferCopy provides a method to copy data between two linear // buffers with different strides, with optimized paths for scatter/gather. @@ -547,7 +725,13 @@ class StridedLinearBufferCopy { } else if (kind == FillLinear) { // Fill `dst` with value at `*src`. eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } for (; i <= vectorized_size; i += PacketSize) { pstoreu(dst + i, p); } @@ -809,15 +993,15 @@ class TensorBlockIOV2 { // -------------------------------------------------------------------------- // // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to -// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address. 
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`. // // Currently there is no way to write from a Tensor expression to a block of // memory, if dimensions are reordered. If you need to do that, you should // materialize a Tensor block expression into a memory buffer, and then use // TensorBlockIO to copy data between two memory buffers with a custom -// `dst->src` dimension map (see definition above). +// `target->src` dimension map (see definition above). // -// Also currently the innermost dimension of `dst` must have a stride '1' +// Also currently the innermost dimension of `target` must have a stride '1' // (contiguous in memory). This restriction could be lifted with a `pscatter`, // but in practice it's never needed, and there is a similar TensorBlockIO // workaround for that. @@ -842,18 +1026,18 @@ class TensorBlockAssignment { template struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { for (IndexType i = 0; i < count; ++i) { - dst[i] = eval.coeff(eval_offset + i); + target[i] = eval.coeff(eval_offset + i); } } }; template struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { typedef typename packet_traits::type Packet; @@ -866,26 +1050,29 @@ class TensorBlockAssignment { for (int j = 0; j < 4; ++j) { const IndexType idx = eval_offset + i + j * PacketSize; Packet p = eval.template packet(idx); - pstoreu(dst + i + j * PacketSize, p); + pstoreu(target + i + j * PacketSize, p); } } for (; i <= vectorized_size; i += PacketSize) { Packet p = eval.template packet(eval_offset + i); - pstoreu(dst + i, p); + pstoreu(target + i, p); } for (; i < count; ++i) { - dst[i] = eval.coeff(eval_offset + i); + target[i] = eval.coeff(eval_offset + i); } } }; public: - struct Dst { - Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, - IndexType dst_offset = 0) - : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, + Scalar* target_data, IndexType target_offset = 0) + : dims(target_dims), + strides(target_strides), + data(target_data), + offset(target_offset) {} Dimensions dims; Dimensions strides; @@ -893,34 +1080,50 @@ class TensorBlockAssignment { IndexType offset; }; + static Target target(const Dimensions& target_dims, + const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target( + const DSizes& target_dims, + const DSizes& target_strides, + Scalar* target_data, IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), + target_data, target_offset); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Dst& dst, const TensorBlockExpr& expr) { + const Target& target, const TensorBlockExpr& expr) { // Prepare evaluator for block expression. DefaultDevice default_device; TensorBlockEvaluator eval(expr, default_device); // Tensor block expression dimension should match destination dimensions. 
- eigen_assert(dimensions_match(dst.dims, eval.dimensions())); + eigen_assert(dimensions_match(target.dims, eval.dimensions())); static const int Layout = TensorBlockEvaluator::Layout; static const bool is_col_major = Layout == ColMajor; // Initialize output inner dimension size based on a layout. - const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; - IndexType output_inner_dim_size = dst.dims[inner_dim_idx]; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; - // Dst inner dimension stride must be '1'. - eigen_assert(dst.strides[inner_dim_idx] == 1); + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); - // Squeeze multiple inner dims into one if they are contiguous in `dst`. + // Squeeze multiple inner dims into one if they are contiguous in `target`. IndexType num_squeezed_dims = 0; for (Index i = 1; i < NumDims; ++i) { const Index dim = is_col_major ? i : NumDims - i - 1; - const IndexType dst_stride = dst.strides[dim]; + const IndexType target_stride = target.strides[dim]; - if (output_inner_dim_size == dst_stride) { - output_inner_dim_size *= dst.dims[dim]; + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; num_squeezed_dims++; } else { break; @@ -936,22 +1139,22 @@ class TensorBlockAssignment { const Index dim = is_col_major ? i + 1 : NumDims - i - 2; it[idx].count = 0; - it[idx].size = dst.dims[dim]; - it[idx].output_stride = dst.strides[dim]; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; it[idx].output_span = it[i].output_stride * (it[i].size - 1); idx++; } // We read block expression from the beginning, and start writing data to - // `dst` at given offset. + // `target` at given offset. IndexType input_offset = 0; - IndexType output_offset = dst.offset; + IndexType output_offset = target.offset; - // Iterate copying data from `eval` to `dst`. + // Iterate copying data from `eval` to `target`. for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { - // Assign to `dst` at current offset. + // Assign to `target` at current offset. 
InnerDimAssign::Run(dst.data + output_offset, + TensorBlockEvaluator>::Run(target.data + output_offset, output_inner_dim_size, eval, input_offset); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9e4fae99af..dc9551d323 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1247,10 +1247,10 @@ struct TensorEvaluator, Device> ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> TensorBlockAssignment; - typename TensorBlockAssignment::Dst assignment_dst( - input_block_sizes, input_block_strides, *materialized_input); - - TensorBlockAssignment::Run(assignment_dst, input_block.expr()); + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, + *materialized_input), + input_block.expr()); input_buffer = *materialized_input; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a8160e17ec..cc3e676775 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -294,23 +294,45 @@ struct TensorEvaluator, Device> typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = + IsAligned = false, + PacketAccess = #ifndef EIGEN_USE_SYCL - true, + true, #else - TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast, + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, #endif - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false + BlockAccess = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + static const int NumDims = internal::array_size::value; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -376,6 +398,17 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + return TensorBlockV2(m_impl.blockV2(desc, scratch), + TensorConversionOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index c87075a720..b1d6687449 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -176,11 +176,12 @@ struct TensorEvaluator typedef internal::TensorBlockAssignment TensorBlockAssign; - typename TensorBlockAssign::Dst dst(desc.dimensions(), - internal::strides(m_dims), - m_data, desc.offset()); - TensorBlockAssign::Run(dst, block.expr()); + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(m_dims), m_data, + desc.offset()), + block.expr()); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } @@ -349,62 +350,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { assert(m_data != NULL); - - // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval. - - // If a tensor block descriptor covers a contiguous block of the underlying - // memory, we can skip block buffer memory allocation, and construct a block - // from existing `m_data` memory buffer. - // - // Example: (RowMajor layout) - // m_dims: [11, 12, 13, 14] - // desc.dimensions(): [1, 1, 3, 14] - // - // In this case we can construct a TensorBlock starting at - // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes. - - static const bool - is_col_major = static_cast(Layout) == static_cast(ColMajor); - - // Find out how many inner dimensions have a matching size. - int num_matching_inner_dims = 0; - for (int i = 0; i < NumCoords; ++i) { - int dim = is_col_major ? i : NumCoords - i - 1; - if (m_dims[dim] != desc.dimensions()[dim]) break; - ++num_matching_inner_dims; - } - - // All the outer dimensions must be of size `1`, except a single dimension - // before the matching inner dimension (`3` in the example above). - bool can_use_direct_access = true; - for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) { - int dim = is_col_major ? 
i : NumCoords - i - 1; - if (desc.dimension(dim) != 1) { - can_use_direct_access = false; - break; - } - } - - if (can_use_direct_access) { - EvaluatorPointerType block_start = m_data + desc.offset(); - return TensorBlockV2(internal::TensorBlockKind::kView, block_start, - desc.dimensions()); - - } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - ScalarNoConst* block_buffer = static_cast(mem); - - TensorBlockIOSrc src(internal::strides(m_dims), m_data, - desc.offset()); - TensorBlockIODst dst(desc.dimensions(), - internal::strides(desc.dimensions()), - block_buffer); - - TensorBlockIO::Copy(dst, src); - - return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, - block_buffer, desc.dimensions()); - } + return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } @@ -923,15 +869,21 @@ struct TensorEvaluator typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - PacketType::HasBlend, - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + PacketType::HasBlend, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2 && + TensorEvaluator::BlockAccessV2 && + TensorEvaluator::BlockAccessV2, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -953,8 +905,36 @@ struct TensorEvaluator typedef StorageMemory Storage; typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size::value; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlockV2 + ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlockV2 + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr( + const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -1000,6 +980,24 @@ struct TensorEvaluator .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_condImpl.getResourceRequirements(resources); + m_thenImpl.getResourceRequirements(resources); + m_elseImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlockV2( + m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch), + m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index be8f3a734b..2a3398d670 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -324,6 +324,17 @@ struct IndexList : internal::IndexTuple { } }; +template +std::ostream& operator<<(std::ostream& os, + const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} template constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index c8333e488d..5d4b0f061d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -113,6 +113,25 @@ struct TensorEvaluator, Device> static const int NumOutputDims = internal::array_size::value; static const int NumInputDims = internal::array_size::Dimensions>::value; + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = +#if defined(EIGEN_HAS_INDEX_LIST) + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; +#else + Runtime; +#endif + // clang-format on + enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, @@ -121,8 +140,12 @@ struct TensorEvaluator, Device> BlockAccess = TensorEvaluator::BlockAccess && TensorEvaluator::RawAccess && NumInputDims > 0 && NumOutputDims > 0, - BlockAccessV2 = false, - PreferBlockAccess = true, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? 
+ BlockAccessV2 = TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -139,7 +162,13 @@ struct TensorEvaluator, Device> OutputTensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef + typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -199,8 +228,9 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( - std::vector* resources) const { - m_impl.getResourceRequirements(resources); + std::vector*) const { + // TODO(ezhulenev): If we'll ever support block evaluation without raw + // access we'll need to get requirements from `m_impl`. } // required in block(OutputTensorBlock* output_block) const @@ -334,6 +364,26 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || + (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlockV2(internal::TensorBlockKind::kView, + m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. 
+ return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc, + scratch); + } + } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } @@ -365,14 +415,14 @@ template typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - BlockAccessV2 = false, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -385,18 +435,37 @@ template typedef typename PacketType::type PacketReturnType; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor + TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(index); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment< + Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7b9ad73745..be2449ebd5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -96,22 +96,29 @@ struct TensorEvaluator, Device typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false }; + typedef typename internal::remove_const::type ScalarNoConst; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), 
device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -212,6 +219,214 @@ struct TensorEvaluator, Device return cost; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + Eigen::Index block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, block_total_size_max)); + + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + eigen_assert(m_impl.data() != NULL); + + // Check if we can reuse `desc` destination, or allocate new scratch buffer. + ScalarNoConst* materialized_output = + desc.template destination(); + + bool materialized_in_output; + if (materialized_output != NULL) { + desc.DropDestinationBuffer(); + materialized_in_output = true; + + } else { + const size_t materialized_output_size = desc.size() * sizeof(Scalar); + void* output_scratch_mem = scratch.allocate(materialized_output_size); + materialized_output = static_cast(output_scratch_mem); + materialized_in_output = false; + } + + static const bool IsColMajor = Layout == static_cast(ColMajor); + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[IsColMajor ? 0 : NumDims - 1] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). + Index output_offset = 0; + const DSizes output_strides = + internal::strides(desc.dimensions()); + + // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks. Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). 
+ array it; + for (Index i = 0; i < NumDims - 1; ++i) { + const Index dim = IsColMajor ? i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), + output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + (static_cast(m_impl.dimensions()[inner_dim_idx]) - + numext::maxi(input_offsets[inner_dim_idx], Index(0)))); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - + output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Iterate copying data from `m_impl.data()` to the output buffer. + for (Index size = 0; size < output_size; size += output_inner_dim_size) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill with padding value. + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + } else { + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. 
+ const Index out = output_offset + output_inner_pad_before_size + + output_inner_copy_size; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return TensorBlockV2(materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + materialized_output, + desc.dimensions()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL @@ -222,6 +437,23 @@ struct TensorEvaluator, Device #endif private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), + size(0), + input_stride(0), + input_span(0), + output_stride(0), + output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { #if defined(EIGEN_HAS_INDEX_LIST) @@ -410,6 +642,8 @@ struct TensorEvaluator, Device PaddingDimensions m_padding; Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; }; diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index e85b81141e..ff98e170dd 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -104,6 +104,17 @@ static TensorBlockParams FixedSizeBlock(DSizes dims) { return {offsets, dims, TensorBlockDescriptor(0, dims)}; } +inline Eigen::IndexList> NByOne(int n) { + Eigen::IndexList> ret; + ret.set(0, n); + return ret; +} +inline Eigen::IndexList, int> OneByM(int m) { + Eigen::IndexList, int> ret; + ret.set(1, m); + return ret; +} + // -------------------------------------------------------------------------- // // Verify that block expression evaluation produces the same result as a // TensorSliceOp (reading a tensor block is same to taking a tensor slice). @@ -174,7 +185,7 @@ static void test_eval_tensor_block() { // Identity tensor expression transformation. 
VerifyBlockEvaluator( - input, [&dims]() { return RandomBlock(dims, 10, 20); }); + input, [&dims]() { return RandomBlock(dims, 1, 10); }); } template @@ -184,7 +195,7 @@ static void test_eval_tensor_unary_expr_block() { input.setRandom(); VerifyBlockEvaluator( - input.square(), [&dims]() { return RandomBlock(dims, 10, 20); }); + input.square(), [&dims]() { return RandomBlock(dims, 1, 10); }); } template @@ -195,7 +206,7 @@ static void test_eval_tensor_binary_expr_block() { rhs.setRandom(); VerifyBlockEvaluator( - lhs + rhs, [&dims]() { return RandomBlock(dims, 10, 20); }); + lhs + rhs, [&dims]() { return RandomBlock(dims, 1, 10); }); } template @@ -207,7 +218,7 @@ static void test_eval_tensor_binary_with_unary_expr_block() { VerifyBlockEvaluator( (lhs.square() + rhs.square()).sqrt(), - [&dims]() { return RandomBlock(dims, 10, 20); }); + [&dims]() { return RandomBlock(dims, 1, 10); }); } template @@ -236,6 +247,114 @@ static void test_eval_tensor_broadcast() { [&bcasted_dims]() { return SkewedInnerBlock(bcasted_dims); }); } +template +static void test_eval_tensor_reshape() { + DSizes dims = RandomDims(1, 10); + + DSizes shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + Tensor input(dims); + input.setRandom(); + + VerifyBlockEvaluator( + input.reshape(shuffled), + [&shuffled]() { return RandomBlock(shuffled, 1, 10); }); + + VerifyBlockEvaluator( + input.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock(shuffled); }); +} + +template +static void test_eval_tensor_reshape_with_bcast() { + Index dim = internal::random(1, 100); + + Tensor lhs(1, dim); + Tensor rhs(dim, 1); + lhs.setRandom(); + rhs.setRandom(); + + auto reshapeLhs = NByOne(dim); + auto reshapeRhs = OneByM(dim); + + auto bcastLhs = OneByM(dim); + auto bcastRhs = NByOne(dim); + + DSizes dims(dim, dim); + + VerifyBlockEvaluator( + lhs.reshape(reshapeLhs).broadcast(bcastLhs) + + rhs.reshape(reshapeRhs).broadcast(bcastRhs), + [dims]() { return SkewedInnerBlock(dims); }); +} + +template +static void test_eval_tensor_cast() { + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + VerifyBlockEvaluator( + input.template cast().template cast(), + [&dims]() { return RandomBlock(dims, 1, 10); }); +} + +template +static void test_eval_tensor_select() { + DSizes dims = RandomDims(10, 20); + Tensor lhs(dims); + Tensor rhs(dims); + Tensor cond(dims); + lhs.setRandom(); + rhs.setRandom(); + cond.setRandom(); + + VerifyBlockEvaluator(cond.select(lhs, rhs), [&dims]() { + return RandomBlock(dims, 1, 20); + }); +} + +template +static void test_eval_tensor_padding() { + const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; + + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + DSizes pad_before = RandomDims(0, 4); + DSizes pad_after = RandomDims(0, 4); + array, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = std::make_pair(pad_before[i], pad_after[i]); + } + + // Test squeezing reads from inner dim. 
+ if (internal::random()) { + pad_before[inner_dim] = 0; + pad_after[inner_dim] = 0; + paddings[inner_dim] = std::make_pair(0, 0); + } + + DSizes padded_dims; + for (int i = 0; i < NumDims; ++i) { + padded_dims[i] = dims[i] + pad_before[i] + pad_after[i]; + } + + VerifyBlockEvaluator( + input.pad(paddings), + [&padded_dims]() { return FixedSizeBlock(padded_dims); }); + + VerifyBlockEvaluator( + input.pad(paddings), + [&padded_dims]() { return RandomBlock(padded_dims, 1, 10); }); + + VerifyBlockEvaluator( + input.pad(paddings), + [&padded_dims]() { return SkewedInnerBlock(padded_dims); }); +} + // -------------------------------------------------------------------------- // // Verify that assigning block to a Tensor expression produces the same result // as an assignment to TensorSliceOp (writing a block is is identical to @@ -300,7 +419,7 @@ static void VerifyBlockAssignment(Tensor& tensor, // -------------------------------------------------------------------------- // template -static void test_assign_tensor_block() { +static void test_assign_to_tensor() { DSizes dims = RandomDims(10, 20); Tensor tensor(dims); @@ -312,11 +431,32 @@ static void test_assign_tensor_block() { tensor, map, [&dims]() { return FixedSizeBlock(dims); }); } +template +static void test_assign_to_tensor_reshape() { + DSizes dims = RandomDims(10, 20); + Tensor tensor(dims); + + TensorMap> map(tensor.data(), dims); + + DSizes shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + VerifyBlockAssignment( + tensor, map.reshape(shuffled), + [&shuffled]() { return RandomBlock(shuffled, 1, 10); }); + + VerifyBlockAssignment( + tensor, map.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock(shuffled); }); + + VerifyBlockAssignment( + tensor, map.reshape(shuffled), + [&shuffled]() { return FixedSizeBlock(shuffled); }); +} + // -------------------------------------------------------------------------- // -//#define CALL_SUBTESTS(NAME) CALL_SUBTEST((NAME())) - -#define CALL_SUBTESTS(NAME) \ +#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \ CALL_SUBTEST((NAME())); \ CALL_SUBTEST((NAME())); \ CALL_SUBTEST((NAME())); \ @@ -326,14 +466,24 @@ static void test_assign_tensor_block() { CALL_SUBTEST((NAME())); \ CALL_SUBTEST((NAME())) +#define CALL_SUBTESTS_LAYOUTS(NAME) \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())) + EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) { // clang-format off - CALL_SUBTESTS(test_eval_tensor_block); - CALL_SUBTESTS(test_eval_tensor_unary_expr_block); - CALL_SUBTESTS(test_eval_tensor_binary_expr_block); - CALL_SUBTESTS(test_eval_tensor_binary_with_unary_expr_block); - CALL_SUBTESTS(test_eval_tensor_broadcast); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_block); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_unary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_with_unary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding); - CALL_SUBTESTS(test_assign_tensor_block); + CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast); + + CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor); + CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape); // clang-format on } diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index c233fe30f0..9094b65076 100644 --- 
a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -582,11 +582,10 @@ static void test_async_execute_unary_expr(Device d) Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; - static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; + Vectorizable, Tiling>; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait(); @@ -619,11 +618,10 @@ static void test_async_execute_binary_expr(Device d) Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; - static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; + Vectorizable, Tiling>; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait(); @@ -737,10 +735,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4); CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5); - CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 2); - CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 3); - CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 4); - CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 5); + CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5); CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2); CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3); @@ -779,4 +777,3 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { // Force CMake to split this test. 
// EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 } - From 98bdd7252e14441dc2d392c5146496c35d5e6062 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 4 Oct 2019 10:15:33 -0700 Subject: [PATCH 2/2] Fix compilation warnings and errors with clang in TensorBlockV2 code and tests --- unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 4 ++-- unsupported/test/cxx11_tensor_block_eval.cpp | 8 ++++---- unsupported/test/cxx11_tensor_executor.cpp | 6 ++++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index 4d2145bf3e..3880e7ed30 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -418,7 +418,7 @@ class TensorMaterializedBlock { if (can_use_direct_access) { const Scalar* block_start = data + desc.offset(); - return TensorMaterializedBlock(TensorBlockKind::kView, block_start, + return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions()); } else { @@ -438,7 +438,7 @@ class TensorMaterializedBlock { TensorBlockIO::Copy(dst, src); - return TensorMaterializedBlock(TensorBlockKind::kMaterializedInScratch, + return TensorMaterializedBlock(internal::TensorBlockKind::kMaterializedInScratch, block_buffer, desc.dimensions()); } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index be2449ebd5..489b915ac5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -295,8 +295,8 @@ struct TensorEvaluator, Device // Initialize output block iterator state. Dimension in this array are // always in inner_most -> outer_most order (col major layout). array it; - for (Index i = 0; i < NumDims - 1; ++i) { - const Index dim = IsColMajor ? i + 1 : NumDims - i - 2; + for (int i = 0; i < NumDims - 1; ++i) { + const int dim = IsColMajor ? i + 1 : NumDims - i - 2; it[i].count = 0; it[i].size = desc.dimension(dim); diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index ff98e170dd..75252362c4 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -104,13 +104,13 @@ static TensorBlockParams FixedSizeBlock(DSizes dims) { return {offsets, dims, TensorBlockDescriptor(0, dims)}; } -inline Eigen::IndexList> NByOne(int n) { - Eigen::IndexList> ret; +inline Eigen::IndexList> NByOne(Index n) { + Eigen::IndexList> ret; ret.set(0, n); return ret; } -inline Eigen::IndexList, int> OneByM(int m) { - Eigen::IndexList, int> ret; +inline Eigen::IndexList, Index> OneByM(Index m) { + Eigen::IndexList, Index> ret; ret.set(1, m); return ret; } diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 9094b65076..efae819619 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -582,10 +582,11 @@ static void test_async_execute_unary_expr(Device d) Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; + static const bool TilingOn = Tiling == TiledEvaluation::Off ? 
false : true; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; + Vectorizable, TilingOn>; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait(); @@ -618,10 +619,11 @@ static void test_async_execute_binary_expr(Device d) Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; + static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; + Vectorizable, TilingOn>; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait();
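
The patches above touch only evaluator internals (TensorBlockV2 implementations for reshaping, casting, select and padding); user-facing code is unchanged. Below is a minimal sketch, assuming only the public unsupported Eigen Tensor API, of the four expression kinds that gain block evaluation here; whether the tiled path is actually taken still depends on the executor's tiling heuristics, and the file name and sizes are illustrative only.

#include <unsupported/Eigen/CXX11/Tensor>
#include <utility>

int main() {
  using Eigen::Index;

  Eigen::Tensor<float, 2> a(30, 40);
  Eigen::Tensor<bool, 2> cond(30, 40);
  a.setRandom();
  cond.setRandom();

  // TensorPaddingOp: pad each dimension with one value before and two after.
  Eigen::array<std::pair<Index, Index>, 2> pads;
  pads[0] = std::make_pair(1, 2);
  pads[1] = std::make_pair(1, 2);
  Eigen::Tensor<float, 2> padded = a.pad(pads);  // 33 x 43

  // TensorReshapingOp followed by TensorConversionOp (cast).
  Eigen::array<Index, 2> new_shape{{40, 30}};
  Eigen::Tensor<double, 2> reshaped = a.reshape(new_shape).cast<double>();

  // TensorSelectOp: pick from `a` or `-a` based on the boolean condition.
  Eigen::Tensor<float, 2> selected = cond.select(a, -a);

  return 0;
}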