mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Cleanup Tensor block destination and materialized block storage allocation
This commit is contained in:
parent
02431cbe71
commit
0d2a14ce11
@ -235,11 +235,9 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
||||
m_leftImpl.data() != NULL) {
|
||||
// If destination has raw data access, we pass it as a potential
|
||||
// destination for a block descriptor evaluation.
|
||||
desc.AddDestinationBuffer(
|
||||
desc.template AddDestinationBuffer<Layout>(
|
||||
/*dst_base=*/m_leftImpl.data() + desc.offset(),
|
||||
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
|
||||
/*total_dst_bytes=*/
|
||||
(internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
|
||||
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
|
||||
}
|
||||
|
||||
RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
|
||||
|
@ -70,91 +70,89 @@ class TensorBlockDescriptor {
|
||||
|
||||
// If we evaluate a Tensor assignment, and expression on the left, already has
|
||||
// a memory buffer, then we might do performance optimization, and evaluate
|
||||
// the root expression directly into the memory, or maybe use it as temporary
|
||||
// storage for some of the subexpressions, to avoid dynamic memory allocation.
|
||||
// the root expression directly into the final output memory. Sometimes it's
|
||||
// possible to reuse it for materializing subexpressions inside an expression
|
||||
// tree, to avoid dynamic memory allocation.
|
||||
//
|
||||
// This is a type erased storage, because passing Scalar type through all the
|
||||
// expression evaluation layers it way too many templates. Also it should be
|
||||
// possible to use this destination as a temp buffer for materializing
|
||||
// expressions with type, not matching the final output.
|
||||
// The pointer type of the underlying storage is erased, because passing
|
||||
// Scalar type through all the expression evaluation layers is way too many
|
||||
// templates. In practice destination buffer type should always match the
|
||||
// evaluated expression scalar type.
|
||||
class DestinationBuffer {
|
||||
public:
|
||||
enum DestinationBufferKind {
|
||||
// Destination buffer is not defined (`m_data` == nullptr).
|
||||
kEmpty,
|
||||
|
||||
// Tensor block defined by an owning tensor block descriptor can fit
|
||||
// contiguously into the destination buffer. In this case it's safe to
|
||||
// materialize tensor block in the destination buffer, wrap it in a
|
||||
// TensorMap, and use to build Eigen expression on top of it.
|
||||
kContiguous,
|
||||
|
||||
// Destination buffer strides do not match strides of the contiguously
|
||||
// stored block, and it's impossible to define a TensorMap over this
|
||||
// buffer. However if we are evaluating a root of an expression tree, we
|
||||
// still can materialize an output into this destination, because we can
|
||||
// guarantee that no one will ever access it through block API.
|
||||
//
|
||||
// In theory it is possible to build valid TensorStriding<TensorMap>
|
||||
// expression on top of this destination buffer, however it has
|
||||
// inefficient coeff/packet access, and defeats the purpose of fast block
|
||||
// evaluation API.
|
||||
kStrided
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
Scalar* data() const {
|
||||
eigen_assert(m_data_type_size == sizeof(Scalar));
|
||||
return static_cast<Scalar*>(m_data);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
Dimensions dimensions() const {
|
||||
Dimensions dimensions;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
|
||||
dimensions[i] = m_dimensions[i] / sizeof(Scalar);
|
||||
}
|
||||
return dimensions;
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
Dimensions strides() const {
|
||||
Dimensions strides;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
|
||||
strides[i] = m_strides[i] / sizeof(Scalar);
|
||||
}
|
||||
return strides;
|
||||
}
|
||||
|
||||
// Returns true if the tensor block corresponding to `desc` fits into the
|
||||
// contiguous block of memory defined by `*this`.
|
||||
template <typename Scalar, int Layout>
|
||||
bool fitsContiguously(const TensorBlockDescriptor& desc) const {
|
||||
if (m_data == NULL) return false;
|
||||
|
||||
const Dimensions& desc_dims = desc.dimensions();
|
||||
const Dimensions& dst_dims = dimensions<Scalar>();
|
||||
|
||||
if (!dimensions_match(desc_dims, dst_dims)) return false;
|
||||
|
||||
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
|
||||
const Dimensions& dst_strides = strides<Scalar>();
|
||||
|
||||
// Compare strides ignoring dimensions of size `1`.
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (desc_dims[i] == 1) continue;
|
||||
if (desc_strides[i] != dst_strides[i]) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
const Dimensions& strides() const { return m_strides; }
|
||||
const DestinationBufferKind& kind() const { return m_kind; }
|
||||
|
||||
private:
|
||||
friend class TensorBlockDescriptor;
|
||||
|
||||
DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
|
||||
DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
|
||||
|
||||
template <typename Scalar>
|
||||
DestinationBuffer(Scalar* data, const Dimensions& dimensions,
|
||||
const Dimensions& strides, size_t total_dst_bytes)
|
||||
DestinationBuffer(Scalar* data, const Dimensions& strides,
|
||||
DestinationBufferKind kind)
|
||||
: m_data(static_cast<void*>(data)),
|
||||
m_dimensions(dimensions),
|
||||
m_data_type_size(sizeof(Scalar)),
|
||||
m_strides(strides),
|
||||
m_total_dst_bytes(total_dst_bytes) {
|
||||
// TODO(ezhulenev): Benchmark template meta-unroll for this loop.
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] *= sizeof(Scalar);
|
||||
m_strides[i] *= sizeof(Scalar);
|
||||
}
|
||||
m_kind(kind) {}
|
||||
|
||||
template <int Layout, typename Scalar>
|
||||
static DestinationBuffer make(const TensorBlockDescriptor& desc,
|
||||
Scalar* data, const Dimensions& strides) {
|
||||
return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
|
||||
}
|
||||
|
||||
template <int Layout>
|
||||
static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
|
||||
const Dimensions& strides) {
|
||||
const Dimensions& desc_dims = desc.dimensions();
|
||||
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (desc_dims[i] == 1) continue;
|
||||
if (desc_strides[i] != strides[i]) return kStrided;
|
||||
}
|
||||
return kContiguous;
|
||||
}
|
||||
|
||||
// Storage pointer is type erased, to reduce template bloat, but we still
|
||||
// keep the size of the underlying element type for error checking.
|
||||
void* m_data;
|
||||
Dimensions m_dimensions;
|
||||
size_t m_data_type_size;
|
||||
|
||||
// Destination buffer dimensions always match the dimensions of a tensor
|
||||
// block descriptor it belongs to, however strides might be different.
|
||||
Dimensions m_strides;
|
||||
|
||||
// Total size of the memory buffer at the destination (typically the total
|
||||
// size of the left hand side of an assignment expression). This can be the
|
||||
// same as `array_prod(m_dimensions)` if the assignment target has just a
|
||||
// single block, but typically it's a larger number.
|
||||
size_t m_total_dst_bytes;
|
||||
DestinationBufferKind m_kind;
|
||||
};
|
||||
|
||||
TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
|
||||
@ -173,40 +171,31 @@ class TensorBlockDescriptor {
|
||||
IndexType dimension(int index) const { return m_dimensions[index]; }
|
||||
IndexType size() const { return array_prod<IndexType>(m_dimensions); }
|
||||
|
||||
template <typename Scalar>
|
||||
void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
|
||||
size_t total_dst_bytes) {
|
||||
const DestinationBuffer& destination() const { return m_destination; }
|
||||
|
||||
template <int Layout, typename Scalar>
|
||||
void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
|
||||
eigen_assert(dst_base != NULL);
|
||||
m_destination =
|
||||
DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
|
||||
DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
|
||||
}
|
||||
|
||||
template <typename Scalar, typename DstStridesIndexType>
|
||||
template <int Layout, typename Scalar, typename DstStridesIndexType>
|
||||
void AddDestinationBuffer(
|
||||
Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
|
||||
size_t total_dst_bytes) {
|
||||
Scalar* dst_base,
|
||||
const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
|
||||
// DSizes constructor will do index type promotion if it's safe.
|
||||
AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
|
||||
// Forward to the `Dimensions` overload, which takes only (dst_base, dst_strides).
AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
|
||||
}
|
||||
|
||||
TensorBlockDescriptor& DropDestinationBuffer() {
|
||||
m_destination.m_data = NULL;
|
||||
m_destination.m_kind = DestinationBuffer::kEmpty;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }
|
||||
|
||||
const DestinationBuffer& GetDestinationBuffer() const {
|
||||
return m_destination;
|
||||
}
|
||||
|
||||
// Returns a non-nullptr pointer to a destination buffer memory if this
|
||||
// block has a contiguous destination buffer.
|
||||
template <typename Scalar, int Layout>
|
||||
Scalar* destination() const {
|
||||
if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
|
||||
return m_destination.template data<Scalar>();
|
||||
}
|
||||
return NULL;
|
||||
bool HasDestinationBuffer() const {
|
||||
return m_destination.kind() != DestinationBuffer::kEmpty;
|
||||
}
|
||||
|
||||
// Returns a copy of `*this` with updated offset.
|
||||
@ -404,6 +393,80 @@ class TensorMaterializedBlock {
|
||||
|
||||
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
|
||||
|
||||
// TensorMaterializedBlock can be backed by different types of storage:
|
||||
//
|
||||
// (1) Contiguous block of memory allocated with scratch allocator.
|
||||
// (2) Contiguous block of memory reused from tensor block descriptor
|
||||
// destination buffer.
|
||||
// (3) Strided block of memory reused from tensor block descriptor
|
||||
// destination buffer.
|
||||
//
|
||||
class Storage {
|
||||
public:
|
||||
Scalar* data() const { return m_data; }
|
||||
const Dimensions& dimensions() const { return m_dimensions; }
|
||||
const Dimensions& strides() const { return m_strides; }
|
||||
|
||||
TensorMaterializedBlock AsTensorMaterializedBlock() const {
|
||||
return TensorMaterializedBlock(
|
||||
m_materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
m_data, m_dimensions, !m_strided_storage);
|
||||
}
|
||||
|
||||
private:
|
||||
friend class TensorMaterializedBlock;
|
||||
|
||||
Storage(Scalar* data, const Dimensions& dimensions,
|
||||
const Dimensions& strides, bool materialized_in_output,
|
||||
bool strided_storage)
|
||||
: m_data(data),
|
||||
m_dimensions(dimensions),
|
||||
m_strides(strides),
|
||||
m_materialized_in_output(materialized_in_output),
|
||||
m_strided_storage(strided_storage) {}
|
||||
|
||||
Scalar* m_data;
|
||||
Dimensions m_dimensions;
|
||||
Dimensions m_strides;
|
||||
bool m_materialized_in_output;
|
||||
bool m_strided_storage;
|
||||
};
|
||||
|
||||
// Creates a storage for materialized block either from the block descriptor
|
||||
// destination buffer, or allocates a new buffer with scratch allocator.
|
||||
template <typename TensorBlockScratch>
|
||||
EIGEN_STRONG_INLINE static Storage prepareStorage(
|
||||
TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool allow_strided_storage = false) {
|
||||
// Try to reuse destination as an output block buffer.
|
||||
typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
|
||||
|
||||
if (desc.destination().kind() == DestinationBuffer::kContiguous) {
|
||||
Scalar* buffer = desc.destination().template data<Scalar>();
|
||||
desc.DropDestinationBuffer();
|
||||
return Storage(buffer, desc.dimensions(),
|
||||
internal::strides<Layout>(desc.dimensions()),
|
||||
/*materialized_in_output=*/true,
|
||||
/*strided_storage=*/false);
|
||||
|
||||
} else if (desc.destination().kind() == DestinationBuffer::kStrided &&
|
||||
allow_strided_storage) {
|
||||
Scalar* buffer = desc.destination().template data<Scalar>();
|
||||
desc.DropDestinationBuffer();
|
||||
return Storage(buffer, desc.dimensions(), desc.destination().strides(),
|
||||
/*materialized_in_output=*/true, /*strided_storage=*/true);
|
||||
|
||||
} else {
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
|
||||
return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
|
||||
internal::strides<Layout>(desc.dimensions()),
|
||||
/*materialized_in_output=*/false,
|
||||
/*strided_storage=*/false);
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a materialized block for the given descriptor from a memory buffer.
|
||||
template <typename DataDimensions, typename TensorBlockScratch>
|
||||
EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
|
||||
@ -448,19 +511,8 @@ class TensorMaterializedBlock {
|
||||
block_start, desc.dimensions());
|
||||
|
||||
} else {
|
||||
// Try to reuse destination as an output block buffer.
|
||||
Scalar* block_buffer = desc.template destination<Scalar, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (block_buffer != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
|
||||
block_buffer = static_cast<Scalar*>(mem);
|
||||
}
|
||||
// Reuse destination buffer or allocate new buffer with scratch allocator.
|
||||
const Storage storage = prepareStorage(desc, scratch);
|
||||
|
||||
typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
|
||||
TensorBlockIO;
|
||||
@ -469,17 +521,11 @@ class TensorMaterializedBlock {
|
||||
|
||||
TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
|
||||
data, desc.offset());
|
||||
TensorBlockIODst dst(desc.dimensions(),
|
||||
internal::strides<Layout>(desc.dimensions()),
|
||||
block_buffer);
|
||||
TensorBlockIODst dst(storage.dimensions(), storage.strides(),
|
||||
storage.data());
|
||||
|
||||
TensorBlockIO::Copy(dst, src);
|
||||
|
||||
return TensorMaterializedBlock(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
block_buffer, desc.dimensions());
|
||||
return storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -890,24 +890,14 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
return emptyBlock();
|
||||
}
|
||||
|
||||
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
|
||||
ScalarNoConst* materialized_output =
|
||||
desc.template destination<ScalarNoConst, Layout>();
|
||||
bool materialized_in_output;
|
||||
// Prepare storage for the materialized broadcasting result.
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(desc, scratch);
|
||||
ScalarNoConst* materialized_output = block_storage.data();
|
||||
|
||||
if (materialized_output != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
|
||||
void* output_scratch_mem = scratch.allocate(materialized_output_size);
|
||||
materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
|
||||
}
|
||||
|
||||
ScalarNoConst* materialized_input = NULL;
|
||||
// We potentially will need to materialize input blocks.
|
||||
size_t materialized_input_size = 0;
|
||||
ScalarNoConst* materialized_input = NULL;
|
||||
|
||||
// Initialize block broadcasting iterator state for outer dimensions (outer
|
||||
// with regard to bcast dimension). Dimension in this array are always in
|
||||
@ -951,11 +941,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
return TensorBlockV2(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
materialized_output, desc.dimensions());
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
@ -1019,7 +1005,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
Index output_span;
|
||||
};
|
||||
|
||||
BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
|
||||
blockBroadcastingParams(TensorBlockDesc& desc) const {
|
||||
BlockBroadcastingParams params;
|
||||
|
||||
params.input_dims = Dimensions(m_impl.dimensions());
|
||||
|
@ -369,28 +369,35 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
bool root_of_expr_ast = false) const {
|
||||
const Index chip_dim = m_dim.actualDim();
|
||||
|
||||
DSizes<Index, NumInputDims> input_block_dims;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
input_block_dims[i] = i < chip_dim ? desc.dimension(i)
|
||||
: i > chip_dim ? desc.dimension(i - 1)
|
||||
: 1;
|
||||
input_block_dims[i]
|
||||
= i < chip_dim ? desc.dimension(i)
|
||||
: i > chip_dim ? desc.dimension(i - 1)
|
||||
: 1;
|
||||
}
|
||||
|
||||
ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
|
||||
|
||||
// Try to reuse destination buffer for materializing argument block.
|
||||
ScalarNoConst* destination_buffer =
|
||||
desc.template destination<ScalarNoConst, Layout>();
|
||||
if (destination_buffer != NULL) {
|
||||
arg_desc.AddDestinationBuffer(
|
||||
destination_buffer, internal::strides<Layout>(arg_desc.dimensions()),
|
||||
(arg_desc.size() * sizeof(Scalar)));
|
||||
if (desc.HasDestinationBuffer()) {
|
||||
DSizes<Index, NumInputDims> arg_destination_strides;
|
||||
for (int i = 0; i < NumInputDims; ++i) {
|
||||
arg_destination_strides[i]
|
||||
= i < chip_dim ? desc.destination().strides()[i]
|
||||
: i > chip_dim ? desc.destination().strides()[i - 1]
|
||||
: 0; // for dimensions of size `1` stride should never be used.
|
||||
}
|
||||
|
||||
arg_desc.template AddDestinationBuffer<Layout>(
|
||||
desc.destination().template data<ScalarNoConst>(),
|
||||
arg_destination_strides);
|
||||
}
|
||||
|
||||
ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
|
||||
ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch, root_of_expr_ast);
|
||||
if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
|
||||
|
||||
if (arg_block.data() != NULL) {
|
||||
@ -401,21 +408,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
} else {
|
||||
// Assign argument block expression to a buffer.
|
||||
|
||||
// Try to reuse destination as an output buffer.
|
||||
ScalarNoConst* output_buffer =
|
||||
desc.template destination<ScalarNoConst, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (output_buffer != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
|
||||
void* output_scratch_mem = scratch.allocate(materialized_output_size);
|
||||
output_buffer = static_cast<ScalarNoConst*>(output_scratch_mem);
|
||||
}
|
||||
// Prepare storage for the materialized chipping result.
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(desc, scratch);
|
||||
|
||||
typedef internal::TensorBlockAssignment<
|
||||
ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
|
||||
@ -425,14 +420,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
TensorBlockAssignment::target(
|
||||
arg_desc.dimensions(),
|
||||
internal::strides<Layout>(arg_desc.dimensions()),
|
||||
output_buffer),
|
||||
block_storage.data()),
|
||||
arg_block.expr());
|
||||
|
||||
return TensorBlockV2(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
output_buffer, desc.dimensions());
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -173,12 +173,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
|
||||
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
|
||||
// Add `m_buffer` as destination buffer to the block descriptor.
|
||||
desc.AddDestinationBuffer(
|
||||
desc.template AddDestinationBuffer<Layout>(
|
||||
/*dst_base=*/m_buffer + desc.offset(),
|
||||
/*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()),
|
||||
/*total_dst_bytes=*/
|
||||
(internal::array_prod(m_impl.dimensions())
|
||||
* sizeof(Scalar)));
|
||||
/*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
|
||||
|
||||
ArgTensorBlock block = m_impl.blockV2(desc, scratch);
|
||||
|
||||
|
@ -248,21 +248,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
extract_coordinates(desc.offset(), coords);
|
||||
array<Index, NumDims> initial_coords = coords;
|
||||
|
||||
// Try to reuse destination as an output block buffer.
|
||||
CoeffReturnType* block_buffer =
|
||||
desc.template destination<CoeffReturnType, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (block_buffer != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
|
||||
block_buffer = static_cast<CoeffReturnType*>(mem);
|
||||
}
|
||||
|
||||
// Offset in the output block buffer.
|
||||
Index offset = 0;
|
||||
|
||||
@ -278,6 +263,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
}
|
||||
eigen_assert(it[0].stride == 1);
|
||||
|
||||
// Prepare storage for the materialized generator result.
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(desc, scratch);
|
||||
|
||||
CoeffReturnType* block_buffer = block_storage.data();
|
||||
|
||||
while (it[NumDims - 1].count < it[NumDims - 1].size) {
|
||||
// Generate data for the inner-most dimension.
|
||||
for (Index i = 0; i < it[0].size; ++i) {
|
||||
@ -304,11 +295,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
return TensorBlockV2(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
block_buffer, desc.dimensions());
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
|
@ -238,22 +238,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
desc.dimensions());
|
||||
}
|
||||
|
||||
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
|
||||
ScalarNoConst* materialized_output =
|
||||
desc.template destination<ScalarNoConst, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (materialized_output != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
|
||||
void* output_scratch_mem = scratch.allocate(materialized_output_size);
|
||||
materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
|
||||
materialized_in_output = false;
|
||||
}
|
||||
|
||||
static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
|
||||
|
||||
Index offset = desc.offset();
|
||||
@ -363,6 +347,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
|
||||
typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
|
||||
|
||||
// Prepare storage for the materialized padding result.
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(desc, scratch);
|
||||
|
||||
// Iterate copying data from `m_impl.data()` to the output buffer.
|
||||
for (Index size = 0; size < output_size; size += output_inner_dim_size) {
|
||||
// Detect if we are in the padded region (exclude innermost dimension).
|
||||
@ -376,7 +364,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
if (is_padded) {
|
||||
// Fill with padding value.
|
||||
LinCopy::template Run<LinCopy::Kind::FillLinear>(
|
||||
typename LinCopy::Dst(output_offset, 1, materialized_output),
|
||||
typename LinCopy::Dst(output_offset, 1, block_storage.data()),
|
||||
typename LinCopy::Src(0, 0, &m_paddingValue),
|
||||
output_inner_dim_size);
|
||||
|
||||
@ -385,7 +373,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
const Index out = output_offset;
|
||||
|
||||
LinCopy::template Run<LinCopy::Kind::FillLinear>(
|
||||
typename LinCopy::Dst(out, 1, materialized_output),
|
||||
typename LinCopy::Dst(out, 1, block_storage.data()),
|
||||
typename LinCopy::Src(0, 0, &m_paddingValue),
|
||||
output_inner_pad_before_size);
|
||||
}
|
||||
@ -397,7 +385,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
|
||||
|
||||
LinCopy::template Run<LinCopy::Kind::Linear>(
|
||||
typename LinCopy::Dst(out, 1, materialized_output),
|
||||
typename LinCopy::Dst(out, 1, block_storage.data()),
|
||||
typename LinCopy::Src(in, 1, m_impl.data()),
|
||||
output_inner_copy_size);
|
||||
}
|
||||
@ -407,7 +395,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
output_inner_copy_size;
|
||||
|
||||
LinCopy::template Run<LinCopy::Kind::FillLinear>(
|
||||
typename LinCopy::Dst(out, 1, materialized_output),
|
||||
typename LinCopy::Dst(out, 1, block_storage.data()),
|
||||
typename LinCopy::Src(0, 0, &m_paddingValue),
|
||||
output_inner_pad_after_size);
|
||||
}
|
||||
@ -431,11 +419,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
return TensorBlockV2(materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
materialized_output,
|
||||
desc.dimensions());
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
|
||||
|
@ -370,21 +370,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
|
||||
const bool inner_dim_reversed = m_reverse[inner_dim_idx];
|
||||
|
||||
// Try to reuse destination as an output block buffer.
|
||||
CoeffReturnType* block_buffer =
|
||||
desc.template destination<CoeffReturnType, Layout>();
|
||||
bool materialized_in_output;
|
||||
|
||||
if (block_buffer != NULL) {
|
||||
desc.DropDestinationBuffer();
|
||||
materialized_in_output = true;
|
||||
|
||||
} else {
|
||||
materialized_in_output = false;
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
|
||||
block_buffer = static_cast<CoeffReturnType*>(mem);
|
||||
}
|
||||
|
||||
// Offset in the output block.
|
||||
Index block_offset = 0;
|
||||
|
||||
@ -438,6 +423,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
|
||||
const Index inner_dim_size = it[effective_inner_dim].size;
|
||||
|
||||
// Prepare storage for the materialized reverse result.
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(desc, scratch);
|
||||
CoeffReturnType* block_buffer = block_storage.data();
|
||||
|
||||
while (it[NumDims - 1].count < it[NumDims - 1].size) {
|
||||
// Copy inner-most dimension data from reversed location in input.
|
||||
Index dst = block_offset;
|
||||
@ -475,11 +465,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
return TensorBlockV2(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
block_buffer, desc.dimensions());
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
|
@ -351,66 +351,20 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
typedef typename TensorBlockIO::Dst TensorBlockIODst;
|
||||
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
|
||||
|
||||
ScalarNoConst* block_buffer = NULL;
|
||||
typename TensorBlockIO::Dimensions block_strides;
|
||||
|
||||
bool materialized_in_output = false;
|
||||
bool has_valid_materialized_expr = true;
|
||||
|
||||
if (desc.HasDestinationBuffer()) {
|
||||
// Check if we can reuse destination buffer for block materialization.
|
||||
const typename TensorBlockDesc::DestinationBuffer& destination_buffer =
|
||||
desc.GetDestinationBuffer();
|
||||
|
||||
const bool dims_match = dimensions_match(
|
||||
desc.dimensions(), destination_buffer.template dimensions<Scalar>());
|
||||
|
||||
const bool strides_match =
|
||||
dimensions_match(internal::strides<Layout>(desc.dimensions()),
|
||||
destination_buffer.template strides<Scalar>());
|
||||
|
||||
if (dims_match && strides_match) {
|
||||
// Destination buffer fits the block contiguously.
|
||||
materialized_in_output = true;
|
||||
has_valid_materialized_expr = true;
|
||||
block_buffer = destination_buffer.template data<ScalarNoConst>();
|
||||
block_strides = internal::strides<Layout>(desc.dimensions());
|
||||
eigen_assert(block_buffer != NULL);
|
||||
|
||||
} else if (dims_match && root_of_expr_ast) {
|
||||
// Destination buffer has strides not matching the block strides, but
|
||||
// for the root of the expression tree it's safe to materialize anyway.
|
||||
materialized_in_output = true;
|
||||
has_valid_materialized_expr = false;
|
||||
block_buffer = destination_buffer.template data<ScalarNoConst>();
|
||||
block_strides = destination_buffer.template strides<ScalarNoConst>();
|
||||
eigen_assert(block_buffer != NULL);
|
||||
}
|
||||
|
||||
if (materialized_in_output) desc.DropDestinationBuffer();
|
||||
}
|
||||
|
||||
// If we were not able to reuse destination buffer, allocate temporary
|
||||
// buffer for block evaluation using scratch allocator.
|
||||
if (!materialized_in_output) {
|
||||
void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst));
|
||||
block_buffer = static_cast<ScalarNoConst*>(mem);
|
||||
block_strides = internal::strides<Layout>(desc.dimensions());
|
||||
}
|
||||
const typename TensorBlockV2::Storage block_storage =
|
||||
TensorBlockV2::prepareStorage(
|
||||
desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
|
||||
|
||||
typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
|
||||
TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
|
||||
|
||||
TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer);
|
||||
TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
|
||||
block_storage.data());
|
||||
|
||||
typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
|
||||
TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
|
||||
|
||||
return TensorBlockV2(
|
||||
materialized_in_output
|
||||
? internal::TensorBlockKind::kMaterializedInOutput
|
||||
: internal::TensorBlockKind::kMaterializedInScratch,
|
||||
block_buffer, desc.dimensions(), has_valid_materialized_expr);
|
||||
return block_storage.AsTensorMaterializedBlock();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
|
@ -154,9 +154,8 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
|
||||
Tensor<T, NumDims, Layout> dst(dst_dims);
|
||||
dst.setZero();
|
||||
if (internal::random<bool>()) {
|
||||
block_params.desc.template AddDestinationBuffer(
|
||||
dst.data(), internal::strides<Layout>(dst.dimensions()),
|
||||
dst.dimensions().TotalSize() * sizeof(T));
|
||||
block_params.desc.template AddDestinationBuffer<Layout>(
|
||||
dst.data(), internal::strides<Layout>(dst.dimensions()));
|
||||
}
|
||||
|
||||
const bool root_of_expr = internal::random<bool>();
|
||||
|
Loading…
Reference in New Issue
Block a user