Cleanup Tensor block destination and materialized block storage allocation

2024-12-21 07:19:46 +08:00 · 2019-10-16 17:14:37 -07:00 · 2019-10-16 17:14:37 -07:00 · 0d2a14ce11
commit 0d2a14ce11
parent 02431cbe71
10 changed files with 220 additions and 291 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@ -235,11 +235,9 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
        m_leftImpl.data() != NULL) {
      // If destination has raw data access, we pass it as a potential
      // destination for a block descriptor evaluation.
-      desc.AddDestinationBuffer(
+      desc.template AddDestinationBuffer<Layout>(
          /*dst_base=*/m_leftImpl.data() + desc.offset(),
-          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
-          /*total_dst_bytes=*/
-          (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
    }

    RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@ -70,91 +70,89 @@ class TensorBlockDescriptor {

  // If we evaluate a Tensor assignment, and expression on the left, already has
  // a memory buffer, then we might do performance optimization, and evaluate
-  // the root expression directly into the memory, or maybe use it as temporary
-  // storage for some of the subexpressions, to avoid dynamic memory allocation.
+  // the root expression directly into the final output memory. Sometimes it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to avoid dynamic memory allocation.
  //
-  // This is a type erased storage, because passing Scalar type through all the
-  // expression evaluation layers it way too many templates. Also it should be
-  // possible to use this destination as a temp buffer for materializing
-  // expressions with type, not matching the final output.
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice destination buffer type should always match the
+  // evaluated expression scalar type.
  class DestinationBuffer {
   public:
+    enum DestinationBufferKind {
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use to build Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However if we are evaluating a root of an expression tree, we
+      // still can materialize an output into this destination, because we can
+      // guarantee that no one will ever access it through block API.
+      //
+      // In theory it is possible to build valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of fast block
+      // evaluation API.
+      kStrided
+    };
+
    template <typename Scalar>
    Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
      return static_cast<Scalar*>(m_data);
    }

-    template <typename Scalar>
-    Dimensions dimensions() const {
-      Dimensions dimensions;
-      for (int i = 0; i < NumDims; ++i) {
-        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
-        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
-      }
-      return dimensions;
-    }
-
-    template <typename Scalar>
-    Dimensions strides() const {
-      Dimensions strides;
-      for (int i = 0; i < NumDims; ++i) {
-        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
-        strides[i] = m_strides[i] / sizeof(Scalar);
-      }
-      return strides;
-    }
-
-    // Returns true if the tensor block corresponding to `desc` fits into the
-    // contiguous block of memory defined by `*this`.
-    template <typename Scalar, int Layout>
-    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
-      if (m_data == NULL) return false;
-
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& dst_dims = dimensions<Scalar>();
-
-      if (!dimensions_match(desc_dims, dst_dims)) return false;
-
-      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
-      const Dimensions& dst_strides = strides<Scalar>();
-
-      // Compare strides ignoring dimensions of size `1`.
-      for (int i = 0; i < NumDims; ++i) {
-        if (desc_dims[i] == 1) continue;
-        if (desc_strides[i] != dst_strides[i]) return false;
-      }
-
-      return true;
-    }
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }

   private:
    friend class TensorBlockDescriptor;

-    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
-    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
-                      const Dimensions& strides, size_t total_dst_bytes)
+    DestinationBuffer(Scalar* data, const Dimensions& strides,
+                      DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)),
-          m_dimensions(dimensions),
+          m_data_type_size(sizeof(Scalar)),
          m_strides(strides),
-          m_total_dst_bytes(total_dst_bytes) {
-      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
-      for (int i = 0; i < NumDims; ++i) {
-        m_dimensions[i] *= sizeof(Scalar);
-        m_strides[i] *= sizeof(Scalar);
-      }
+          m_kind(kind) {}
+
+    template <int Layout, typename Scalar>
+    static DestinationBuffer make(const TensorBlockDescriptor& desc,
+                                  Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }

+    template <int Layout>
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+                                      const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
+      }
+      return kContiguous;
+    }
+
+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
    void* m_data;
-    Dimensions m_dimensions;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of a tensor
+    // block descriptor it belongs to, however strides might be different.
    Dimensions m_strides;

-    // Total size of the memory buffer at the destination (typically the total
-    // size of the left hand side of an assignment expression). This can be the
-    // same as `array_prod(m_dimensions)` if the assignment target has just a
-    // single block, but typically it's a larger number.
-    size_t m_total_dst_bytes;
+    DestinationBufferKind m_kind;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
@ -173,40 +171,31 @@ class TensorBlockDescriptor {
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

-  template <typename Scalar>
-  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
-                            size_t total_dst_bytes) {
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template <int Layout, typename Scalar>
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
    m_destination =
-        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
+        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

-  template <typename Scalar, typename DstStridesIndexType>
+  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
-      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
-      size_t total_dst_bytes) {
+      Scalar* dst_base,
+      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    // DSizes constructor will do index type promotion if it's safe.
-    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
+    AddDestinationBuffer<Layout>(*this, dst_base, Dimensions(dst_strides));
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

-  bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }
-
-  const DestinationBuffer& GetDestinationBuffer() const {
-    return m_destination;
-  }
-
-  // Returns a non-nullptr pointer to a destination buffer memory if this
-  // block has a contiguous destination buffer.
-  template <typename Scalar, int Layout>
-  Scalar* destination() const {
-    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
-      return m_destination.template data<Scalar>();
-    }
-    return NULL;
+  bool HasDestinationBuffer() const {
+    return m_destination.kind() != DestinationBuffer::kEmpty;
  }

  // Returns a copy of `*this` with updated offset.
@ -404,6 +393,80 @@ class TensorMaterializedBlock {

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(
+          m_materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock;
+
+    Storage(Scalar* data, const Dimensions& dimensions,
+            const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates a storage for materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with scratch allocator.
+  template <typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static Storage prepareStorage(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch,
+      bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
@ -448,19 +511,8 @@ class TensorMaterializedBlock {
                                     block_start, desc.dimensions());

    } else {
-      // Try to reuse destination as an output block buffer.
-      Scalar* block_buffer = desc.template destination<Scalar, Layout>();
-      bool materialized_in_output;
-
-      if (block_buffer != NULL) {
-        desc.DropDestinationBuffer();
-        materialized_in_output = true;
-
-      } else {
-        materialized_in_output = false;
-        void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-        block_buffer = static_cast<Scalar*>(mem);
-      }
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
@ -469,17 +521,11 @@ class TensorMaterializedBlock {

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
-      TensorBlockIODst dst(desc.dimensions(),
-                           internal::strides<Layout>(desc.dimensions()),
-                           block_buffer);
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());

      TensorBlockIO::Copy(dst, src);
-
-      return TensorMaterializedBlock(
-          materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          block_buffer, desc.dimensions());
+      return storage.AsTensorMaterializedBlock();
    }
  }

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@ -890,24 +890,14 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
      return emptyBlock();
    }

-    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
-    ScalarNoConst* materialized_output =
-        desc.template destination<ScalarNoConst, Layout>();
-    bool materialized_in_output;
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();

-    if (materialized_output != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-      void* output_scratch_mem = scratch.allocate(materialized_output_size);
-      materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
-    }
-
-    ScalarNoConst* materialized_input = NULL;
+    // We potentially will need to materialize input blocks.
    size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;

    // Initialize block broadcating iterator state for outer dimensions (outer
    // with regard to bcast dimension). Dimension in this array are always in
@ -951,11 +941,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
      }
    }

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        materialized_output, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
@ -1019,7 +1005,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    Index output_span;
  };

-  BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+  blockBroadcastingParams(TensorBlockDesc& desc) const {
    BlockBroadcastingParams params;

    params.input_dims = Dimensions(m_impl.dimensions());
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@ -369,12 +369,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
+          bool root_of_expr_ast = false) const {
    const Index chip_dim = m_dim.actualDim();

    DSizes<Index, NumInputDims> input_block_dims;
    for (int i = 0; i < NumInputDims; ++i) {
-      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+      input_block_dims[i]
+            = i < chip_dim ? desc.dimension(i)
            : i > chip_dim ? desc.dimension(i - 1)
            : 1;
    }
@ -382,15 +383,21 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);

    // Try to reuse destination buffer for materializing argument block.
-    ScalarNoConst* destination_buffer =
-        desc.template destination<ScalarNoConst, Layout>();
-    if (destination_buffer != NULL) {
-      arg_desc.AddDestinationBuffer(
-          destination_buffer, internal::strides<Layout>(arg_desc.dimensions()),
-          (arg_desc.size() * sizeof(Scalar)));
+    if (desc.HasDestinationBuffer()) {
+      DSizes<Index, NumInputDims> arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+      arg_destination_strides[i]
+            = i < chip_dim ? desc.destination().strides()[i]
+            : i > chip_dim ? desc.destination().strides()[i - 1]
+            : 0; // for dimensions of size `1` stride should never be used.
      }

-    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+      arg_desc.template AddDestinationBuffer<Layout>(
+          desc.destination().template data<ScalarNoConst>(),
+          arg_destination_strides);
+    }
+
+    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch, root_of_expr_ast);
    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();

    if (arg_block.data() != NULL) {
@ -401,21 +408,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    } else {
      // Assign argument block expression to a buffer.

-      // Try to reuse destination as an output buffer.
-      ScalarNoConst* output_buffer =
-          desc.template destination<ScalarNoConst, Layout>();
-      bool materialized_in_output;
-
-      if (output_buffer != NULL) {
-        desc.DropDestinationBuffer();
-        materialized_in_output = true;
-
-      } else {
-        materialized_in_output = false;
-        const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-        void* output_scratch_mem = scratch.allocate(materialized_output_size);
-        output_buffer = static_cast<ScalarNoConst*>(output_scratch_mem);
-      }
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlockV2::Storage block_storage =
+          TensorBlockV2::prepareStorage(desc, scratch);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
@ -425,14 +420,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
          TensorBlockAssignment::target(
              arg_desc.dimensions(),
              internal::strides<Layout>(arg_desc.dimensions()),
-              output_buffer),
+              block_storage.data()),
          arg_block.expr());

-      return TensorBlockV2(
-          materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          output_buffer, desc.dimensions());
+      return block_storage.AsTensorMaterializedBlock();
    }
  }

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@ -173,12 +173,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    // Add `m_buffer` as destination buffer to the block descriptor.
-    desc.AddDestinationBuffer(
+    desc.template AddDestinationBuffer<Layout>(
        /*dst_base=*/m_buffer + desc.offset(),
-        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()),
-        /*total_dst_bytes=*/
-                     (internal::array_prod(m_impl.dimensions())
-                         * sizeof(Scalar)));
+        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));

    ArgTensorBlock block = m_impl.blockV2(desc, scratch);

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@ -248,21 +248,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
    extract_coordinates(desc.offset(), coords);
    array<Index, NumDims> initial_coords = coords;

-    // Try to reuse destination as an output block buffer.
-    CoeffReturnType* block_buffer =
-        desc.template destination<CoeffReturnType, Layout>();
-    bool materialized_in_output;
-
-    if (block_buffer != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
-      block_buffer = static_cast<CoeffReturnType*>(mem);
-    }
-
    // Offset in the output block buffer.
    Index offset = 0;

@ -278,6 +263,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
    }
    eigen_assert(it[0].stride == 1);

+    // Prepare storage for the materialized generator result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
    while (it[NumDims - 1].count < it[NumDims - 1].size) {
      // Generate data for the inner-most dimension.
      for (Index i = 0; i < it[0].size; ++i) {
@ -304,11 +295,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
      }
    }

-    return TensorBlockV2(
-        materialized_in_output
-          ? internal::TensorBlockKind::kMaterializedInOutput
-          : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@ -238,22 +238,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
                           desc.dimensions());
    }

-    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
-    ScalarNoConst* materialized_output =
-        desc.template destination<ScalarNoConst, Layout>();
-    bool materialized_in_output;
-
-    if (materialized_output != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-      void* output_scratch_mem = scratch.allocate(materialized_output_size);
-      materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
-      materialized_in_output = false;
-    }
-
    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);

    Index offset = desc.offset();
@ -363,6 +347,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;

+    // Prepare storage for the materialized padding result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+
    // Iterate copying data from `m_impl.data()` to the output buffer.
    for (Index size = 0; size < output_size; size += output_inner_dim_size) {
      // Detect if we are in the padded region (exclude innermost dimension).
@ -376,7 +364,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
      if (is_padded) {
        // Fill with padding value.
        LinCopy::template Run<LinCopy::Kind::FillLinear>(
-            typename LinCopy::Dst(output_offset, 1, materialized_output),
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
            typename LinCopy::Src(0, 0, &m_paddingValue),
            output_inner_dim_size);

@ -385,7 +373,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
          const Index out = output_offset;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_before_size);
        }
@ -397,7 +385,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);

          LinCopy::template Run<LinCopy::Kind::Linear>(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(in, 1, m_impl.data()),
              output_inner_copy_size);
        }
@ -407,7 +395,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
                            output_inner_copy_size;

          LinCopy::template Run<LinCopy::Kind::FillLinear>(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
              typename LinCopy::Src(0, 0, &m_paddingValue),
              output_inner_pad_after_size);
        }
@ -431,11 +419,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
      }
    }

-    return TensorBlockV2(materialized_in_output
-                         ? internal::TensorBlockKind::kMaterializedInOutput
-                         : internal::TensorBlockKind::kMaterializedInScratch,
-                         materialized_output,
-                         desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@ -370,21 +370,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
    const bool inner_dim_reversed = m_reverse[inner_dim_idx];

-    // Try to reuse destination as an output block buffer.
-    CoeffReturnType* block_buffer =
-        desc.template destination<CoeffReturnType, Layout>();
-    bool materialized_in_output;
-
-    if (block_buffer != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
-      block_buffer = static_cast<CoeffReturnType*>(mem);
-    }
-
    // Offset in the output block.
    Index block_offset = 0;

@ -438,6 +423,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device

    const Index inner_dim_size = it[effective_inner_dim].size;

+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
    while (it[NumDims - 1].count < it[NumDims - 1].size) {
      // Copy inner-most dimension data from reversed location in input.
      Index dst = block_offset;
@ -475,11 +465,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
      }
    }

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@ -351,66 +351,20 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

-    ScalarNoConst* block_buffer = NULL;
-    typename TensorBlockIO::Dimensions block_strides;
-
-    bool materialized_in_output = false;
-    bool has_valid_materialized_expr = true;
-
-    if (desc.HasDestinationBuffer()) {
-      // Check if we can reuse destination buffer for block materialization.
-      const typename TensorBlockDesc::DestinationBuffer& destination_buffer =
-          desc.GetDestinationBuffer();
-
-      const bool dims_match = dimensions_match(
-          desc.dimensions(), destination_buffer.template dimensions<Scalar>());
-
-      const bool strides_match =
-          dimensions_match(internal::strides<Layout>(desc.dimensions()),
-                           destination_buffer.template strides<Scalar>());
-
-      if (dims_match && strides_match) {
-        // Destination buffer fits the block contiguously.
-        materialized_in_output = true;
-        has_valid_materialized_expr = true;
-        block_buffer = destination_buffer.template data<ScalarNoConst>();
-        block_strides = internal::strides<Layout>(desc.dimensions());
-        eigen_assert(block_buffer != NULL);
-
-      } else if (dims_match && root_of_expr_ast) {
-        // Destination buffer has strides not matching the block strides, but
-        // for the root of the expression tree it's safe to materialize anyway.
-        materialized_in_output = true;
-        has_valid_materialized_expr = false;
-        block_buffer = destination_buffer.template data<ScalarNoConst>();
-        block_strides = destination_buffer.template strides<ScalarNoConst>();
-        eigen_assert(block_buffer != NULL);
-      }
-
-      if (materialized_in_output) desc.DropDestinationBuffer();
-    }
-
-    // If we were not able to reuse destination buffer, allocate temporary
-    // buffer for block evaluation using scratch allocator.
-    if (!materialized_in_output) {
-      void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst));
-      block_buffer = static_cast<ScalarNoConst*>(mem);
-      block_strides = internal::strides<Layout>(desc.dimensions());
-    }
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(
+            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

-    TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer);
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions(), has_valid_materialized_expr);
+    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@ -154,9 +154,8 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
  Tensor<T, NumDims, Layout> dst(dst_dims);
  dst.setZero();
  if (internal::random<bool>()) {
-    block_params.desc.template AddDestinationBuffer(
-        dst.data(), internal::strides<Layout>(dst.dimensions()),
-        dst.dimensions().TotalSize() * sizeof(T));
+    block_params.desc.template AddDestinationBuffer<Layout>(
+        dst.data(), internal::strides<Layout>(dst.dimensions()));
  }

  const bool root_of_expr = internal::random<bool>();