Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-03-07 18:27:40 +08:00

Merged in ezhulenev/eigen-01 (pull request PR-726)

Block evaluation for TensorChipping + fixed bugs in TensorPadding and TensorSlicing

commit b03eb63d7c
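For context: the chipping operation that gains block (tiled) evaluation in this commit is the user-facing Tensor::chip, which pins one dimension at a fixed offset and drops it from the result. A minimal usage sketch, independent of this commit and assuming only the unsupported Tensor module header:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 3> input(8, 16, 4);
      input.setRandom();

      // Rvalue chipping: pin dimension 1 at offset 5, yielding an 8x4 tensor.
      Eigen::Tensor<float, 2> slice = input.chip(5, 1);

      // Lvalue chipping: writes go through to the underlying tensor.
      Eigen::Tensor<float, 2> plane(8, 4);
      plane.setConstant(1.0f);
      input.chip(5, 1) = plane;

      std::cout << slice.dimension(0) << "x" << slice.dimension(1) << "\n";
      return 0;
    }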
@@ -418,12 +418,22 @@ class TensorMaterializedBlock {
     if (can_use_direct_access) {
       const Scalar* block_start = data + desc.offset();
-      return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start,
-                                     desc.dimensions());
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
 
     } else {
-      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-      Scalar* block_buffer = static_cast<Scalar*>(mem);
+      // Try to reuse destination as an output block buffer.
+      Scalar* block_buffer = desc.template destination<Scalar, Layout>();
+      bool materialized_in_output;
+
+      if (block_buffer != NULL) {
+        materialized_in_output = true;
+
+      } else {
+        materialized_in_output = false;
+        void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+        block_buffer = static_cast<Scalar*>(mem);
+      }
 
       typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
           TensorBlockIO;
@@ -438,8 +448,11 @@ class TensorMaterializedBlock {
       TensorBlockIO::Copy(dst, src);
 
-      return TensorMaterializedBlock(internal::TensorBlockKind::kMaterializedInScratch,
-                                     block_buffer, desc.dimensions());
+      return TensorMaterializedBlock(
+          materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          block_buffer, desc.dimensions());
     }
   }
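The rewritten materialization path first asks the block descriptor for a caller-provided destination buffer and only falls back to scratch allocation when none is offered; the returned TensorBlockKind records who owns the memory. A standalone sketch of that decision, using hypothetical stand-in types rather than the real Eigen internals:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-ins for the Eigen block-kind enum and result type.
    enum class BlockKind { kMaterializedInOutput, kMaterializedInScratch };
    struct Materialized {
      BlockKind kind;
      float* buffer;
    };

    // Prefer the caller's destination buffer; otherwise allocate scratch.
    Materialized materialize_block(float* destination, std::size_t num_elems,
                                   std::vector<float>& scratch) {
      float* buffer = destination;
      const bool materialized_in_output = (buffer != nullptr);
      if (!materialized_in_output) {
        scratch.resize(num_elems);  // stands in for scratch.allocate(...)
        buffer = scratch.data();
      }
      // ... copy or evaluate the block expression into `buffer` here ...
      return {materialized_in_output ? BlockKind::kMaterializedInOutput
                                     : BlockKind::kMaterializedInScratch,
              buffer};
    }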
@@ -1141,7 +1154,7 @@ class TensorBlockAssignment {
       it[idx].count = 0;
       it[idx].size = target.dims[dim];
       it[idx].output_stride = target.strides[dim];
-      it[idx].output_span = it[i].output_stride * (it[i].size - 1);
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
       idx++;
     }
@@ -149,7 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    BlockAccessV2 = false,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
     // Chipping of outer-most dimension is a trivial operation, because we can
     // read and write directly from the underlying tensor using single offset.
     IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
@@ -171,7 +171,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       OutputTensorBlock;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+      ArgTensorBlockDesc;
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -357,6 +367,72 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     m_impl.block(&input_block);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    const Index chip_dim = m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+                          : i > chip_dim ? desc.dimension(i - 1)
+                                         : 1;
+    }
+
+    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+    // Try to reuse destination buffer for materializing argument block.
+    ScalarNoConst* destination_buffer =
+        desc.template destination<ScalarNoConst, Layout>();
+    if (destination_buffer != NULL) {
+      arg_desc.AddDestinationBuffer(
+          destination_buffer, internal::strides<Layout>(arg_desc.dimensions()),
+          (arg_desc.size() * sizeof(Scalar)));
+    }
+
+    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+
+    if (arg_block.data() != NULL) {
+      // Forward argument block buffer if possible.
+      return TensorBlockV2(arg_block.kind(), arg_block.data(),
+                           desc.dimensions());
+
+    } else {
+      // Assign argument block expression to a buffer.
+
+      // Try to reuse destination as an output buffer.
+      ScalarNoConst* output_buffer =
+          desc.template destination<ScalarNoConst, Layout>();
+      bool materialized_in_output;
+
+      if (output_buffer != NULL) {
+        materialized_in_output = true;
+
+      } else {
+        materialized_in_output = false;
+        const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+        void* output_scratch_mem = scratch.allocate(materialized_output_size);
+        output_buffer = static_cast<ScalarNoConst*>(output_scratch_mem);
+      }
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              arg_desc.dimensions(),
+              internal::strides<Layout>(arg_desc.dimensions()),
+              output_buffer),
+          arg_block.expr());
+
+      return TensorBlockV2(
+          materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          output_buffer, desc.dimensions());
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     typename Storage::Type result = constCast(m_impl.data());
     if (isOuterChipping() && result) {
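The dimension mapping inside blockV2 above goes from a block over the chipped (rank N-1) expression to an input block of rank N whose extent along the chipped dimension is 1. A standalone sketch of just that mapping, with illustrative values that are not taken from the commit:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int kInputRank = 3;
      const int chip_dim = 1;                               // dimension removed by chipping
      const std::array<long, kInputRank - 1> output_block_dims = {4, 5};

      // Reinsert the chipped dimension with extent 1.
      std::array<long, kInputRank> input_block_dims;
      for (int i = 0; i < kInputRank; ++i) {
        input_block_dims[i] = i < chip_dim   ? output_block_dims[i]
                              : i > chip_dim ? output_block_dims[i - 1]
                                             : 1;
      }

      // Prints: 4 1 5
      std::printf("%ld %ld %ld\n", input_block_dims[0], input_block_dims[1],
                  input_block_dims[2]);
      return 0;
    }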
@@ -434,11 +510,12 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
 
   typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
@@ -448,6 +525,10 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
       OutputTensorBlock;
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
@@ -539,6 +620,36 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
         input_block_strides, this->m_inputStrides,
         const_cast<ScalarNoConst*>(output_block.data())));
   }
 
+  template <typename TensorBlockV2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+      const TensorBlockDesc& desc, const TensorBlockV2& block) {
+    assert(this->m_impl.data() != NULL);
+
+    const Index chip_dim = this->m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+                          : i > chip_dim ? desc.dimension(i - 1)
+                                         : 1;
+    }
+
+    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+                              const typename TensorBlockV2::XprType>
+        TensorBlockExpr;
+
+    typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+                                            TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(
+            input_block_dims,
+            internal::strides<Layout>(this->m_impl.dimensions()),
+            this->m_impl.data(), this->srcCoeff(desc.offset())),
+        block.expr().reshape(input_block_dims));
+  }
 };
@@ -53,18 +53,22 @@ struct TensorEvaluator
     RawAccess = true
   };
 
-  typedef typename internal::TensorBlock<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
       TensorBlock;
-  typedef typename internal::TensorBlockReader<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+  typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
      TensorBlockReader;
-  typedef typename internal::TensorBlockWriter<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+  typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout>
      TensorBlockWriter;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
   typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -161,6 +165,12 @@ struct TensorEvaluator
     TensorBlockReader::Run(block, m_data);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    assert(m_data != NULL);
+    return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
       const TensorBlock& block) {
     assert(m_data != NULL);
@@ -269,11 +279,6 @@ struct TensorEvaluator<const Derived, Device>
   typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
 
-  typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
-      TensorBlockIO;
-  typedef typename TensorBlockIO::Dst TensorBlockIODst;
-  typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
                                                      Layout, Index>
       TensorBlockV2;
@@ -521,6 +521,19 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
   static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                       const ThreadPoolDevice& device) {
     Evaluator evaluator(expr, device);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+    // TODO(ezuhulenev): For small expressions cost of block mapping and
+    // resource requirements gathering dominates the cost of expression
+    // evaluatiuon.
+    if (total_size < cache_size &&
+        !ExpressionHasTensorBroadcastingOp<Expression>::value) {
+      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                               /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
+      evaluator.cleanup();
+      return;
+    }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
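The added early exit keeps tiled evaluation only where it pays off: if the whole expression fits in the first-level cache and contains no broadcasting, the plain (non-tiled) executor runs instead, since block mapping and resource gathering would dominate. A standalone sketch of that decision rule; the function name and boolean convention are illustrative, not the Eigen API:

    #include <cstddef>

    // Returns true when tiled (block) evaluation is worth its setup cost.
    // Mirrors the heuristic above: small, broadcast-free expressions fall
    // back to the plain executor.
    bool use_tiled_evaluation(std::size_t total_elements,
                              std::size_t l1_cache_bytes,
                              std::size_t element_size,
                              bool has_broadcasting) {
      const std::size_t cache_elements = l1_cache_bytes / element_size;
      return has_broadcasting || total_elements >= cache_elements;
    }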
@@ -97,21 +97,26 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     IsAligned = true,
     PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
-    BlockAccessV2 = false,
+    BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value,
     PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = true
   };
 
-  typedef typename internal::TensorBlock<
-      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
      TensorBlock;
-  typedef typename internal::TensorBlockReader<
-      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+  typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout>
      TensorBlockReader;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -170,6 +175,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     TensorBlockReader::Run(block, m_buffer);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    assert(m_buffer != NULL);
+    return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
@@ -644,6 +644,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       }
     }
 
+    // No strides for scalars.
+    if (NumDims == 0) return;
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -334,8 +334,12 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
             // Want to copy from input.
             (output_inner_dim_size - output_inner_pad_before_size),
             // Can copy from input.
-            (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
-             numext::maxi(input_offsets[inner_dim_idx], Index(0))));
+            numext::maxi(
+                static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+                    (input_offsets[inner_dim_idx] + output_inner_pad_before_size),
+                Index(0)));
+
+    eigen_assert(output_inner_copy_size >= 0);
 
     // How many values to fill with padding AFTER reading from the input inner
     // dimension.
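The padding fix changes what is clamped: the number of values that can still be copied from the input inner dimension must subtract both the block's input offset and the pre-padding already written, and only then be clamped at zero. A small worked example of the corrected formula, with purely illustrative numbers:

    #include <algorithm>
    #include <cassert>

    int main() {
      // Illustrative values, not taken from the commit.
      const long input_dim_size = 10;  // extent of the input inner dimension
      const long input_offset   = 8;   // where this block starts in the input
      const long pad_before     = 1;   // output_inner_pad_before_size
      const long want_to_copy   = 4;   // output_inner_dim_size - pad_before

      // Corrected clamp: how much input actually remains past offset + padding.
      const long can_copy =
          std::max(input_dim_size - (input_offset + pad_before), 0L);
      const long copy_size = std::min(want_to_copy, can_copy);

      assert(can_copy == 1 && copy_size == 1);
      return 0;
    }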
@@ -82,14 +82,14 @@ static TensorBlockParams<NumDims> SkewedInnerBlock(
       index -= idx * strides[i];
       offsets[i] = idx;
     }
-    offsets[0] = index;
+    if (NumDims > 0) offsets[0] = index;
   } else {
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / strides[i];
       index -= idx * strides[i];
       offsets[i] = idx;
     }
-    offsets[NumDims - 1] = index;
+    if (NumDims > 0) offsets[NumDims - 1] = index;
   }
 
   auto desc = TensorBlockDescriptor<NumDims>(block.first_coeff_index(), sizes);
@@ -333,6 +333,42 @@ static void test_eval_tensor_padding() {
       [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
 }
 
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  // Block buffer forwarding.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  // Block expression assignment.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.square().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.square().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
 template <typename T, int Layout>
 static void test_eval_tensor_reshape_with_bcast() {
   Index dim = internal::random<Index>(1, 100);
@@ -384,8 +420,8 @@ static void test_eval_tensor_forced_eval() {
 // as an assignment to TensorSliceOp (writing a block is is identical to
 // assigning one tensor to a slice of another tensor).
 
-template <typename T, int NumDims, int Layout, typename Expression,
-          typename GenBlockParams>
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
 static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
                                   Expression expr, GenBlockParams gen_block) {
   using Device = DefaultDevice;
|
|||||||
auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
|
auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
|
||||||
|
|
||||||
// Generate a random block, or choose a block that fits in full expression.
|
// Generate a random block, or choose a block that fits in full expression.
|
||||||
TensorBlockParams<NumDims> block_params = gen_block();
|
TensorBlockParams<NumExprDims> block_params = gen_block();
|
||||||
|
|
||||||
// Generate random data of the selected block size.
|
// Generate random data of the selected block size.
|
||||||
Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
|
Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
|
||||||
block.setRandom();
|
block.setRandom();
|
||||||
|
|
||||||
// ************************************************************************ //
|
// ************************************************************************ //
|
||||||
// (1) Assignment from a block.
|
// (1) Assignment from a block.
|
||||||
|
|
||||||
// Construct a materialize block from a random generated block tensor.
|
// Construct a materialize block from a random generated block tensor.
|
||||||
internal::TensorMaterializedBlock<T, NumDims, Layout> blk(
|
internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
|
||||||
internal::TensorBlockKind::kView, block.data(), block.dimensions());
|
internal::TensorBlockKind::kView, block.data(), block.dimensions());
|
||||||
|
|
||||||
// Reset all underlying tensor values to zero.
|
// Reset all underlying tensor values to zero.
|
||||||
@ -478,6 +514,37 @@ static void test_assign_to_tensor_reshape() {
|
|||||||
[&shuffled]() { return FixedSizeBlock(shuffled); });
|
[&shuffled]() { return FixedSizeBlock(shuffled); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T, int NumDims, int Layout>
|
||||||
|
static void test_assign_to_tensor_chipping() {
|
||||||
|
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
|
||||||
|
Tensor<T, NumDims, Layout> tensor(dims);
|
||||||
|
|
||||||
|
Index chip_dim = internal::random<int>(0, NumDims - 1);
|
||||||
|
Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
|
||||||
|
|
||||||
|
DSizes < Index, NumDims - 1 > chipped_dims;
|
||||||
|
for (Index i = 0; i < chip_dim; ++i) {
|
||||||
|
chipped_dims[i] = dims[i];
|
||||||
|
}
|
||||||
|
for (Index i = chip_dim + 1; i < NumDims; ++i) {
|
||||||
|
chipped_dims[i - 1] = dims[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
|
||||||
|
tensor, map.chip(chip_offset, chip_dim),
|
||||||
|
[&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
|
||||||
|
tensor, map.chip(chip_offset, chip_dim),
|
||||||
|
[&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
|
||||||
|
|
||||||
|
VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
|
||||||
|
tensor, map.chip(chip_offset, chip_dim),
|
||||||
|
[&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
|
||||||
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------- //
|
// -------------------------------------------------------------------------- //
|
||||||
|
|
||||||
#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \
|
#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \
|
||||||
@ -503,12 +570,15 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
|
|||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast);
|
||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape);
|
||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast);
|
||||||
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select);
|
||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
|
||||||
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping);
|
||||||
|
|
||||||
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
|
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
|
||||||
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval);
|
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval);
|
||||||
|
|
||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
|
||||||
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);
|
||||||
|
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_chipping);
|
||||||
// clang-format on
|
// clang-format on
|
||||||
}
|
}
|
||||||
|
@@ -180,9 +180,8 @@ static void test_execute_chipping_lvalue(Device d)
     \
     const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
     \
-    /* Generate random data to fill non-chipped dimensions*/ \
     Tensor<T, NumDims, Layout, Index> random(dims); \
-    random.setRandom(); \
+    random.setZero(); \
     \
     Tensor<T, NumDims, Layout, Index> golden(dims); \
     golden = random; \
|
|||||||
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
|
||||||
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
|
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
|
||||||
|
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 3);
|
CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 5);
|
CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5);
|
||||||
|
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 3);
|
CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 5);
|
CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);
|
||||||
|
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
|
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
|
||||||
@ -752,10 +751,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
|
|||||||
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
|
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
|
||||||
|
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 2);
|
CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 3);
|
CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 4);
|
CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 5);
|
CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5);
|
||||||
|
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
|
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
|
||||||
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);
|
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);
|
||||||