Fix typo + get rid of redundant member variables for block sizes

Eugene Zhulenev 2018-08-01 12:35:19 -07:00
parent 385b3ff12f
commit 64abdf1d7e
6 changed files with 32 additions and 42 deletions
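The same refactoring is applied in each of the six evaluators: instead of computing the block-size cap once in the constructor and caching it in an m_block_total_size_max member, the evaluator keeps a const Device& reference and derives the cap on demand inside getResourceRequirements(), so the per-operator cache target (L1 for broadcasting and shuffling, last-level cache for chipping, image patch, slicing and reduction) stays at its single point of use. A minimal standalone sketch of the pattern, using simplified hypothetical Device / ResourceRequirements / Evaluator stand-ins rather than the actual Eigen classes, and sizeof(float) in place of sizeof(Scalar):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for the Eigen device / resource-requirement types.
struct Device {
  std::size_t lastLevelCacheSize() const { return 8 * 1024 * 1024; }  // 8 MB, for the example
};

struct ResourceRequirements {
  std::size_t block_total_size_max;  // max number of scalars per block
};

class Evaluator {
 public:
  explicit Evaluator(const Device& device) : m_device(device) {}

  // After the refactoring: the block-size cap is derived from the device on
  // demand, so no cached member has to be kept in sync with the device.
  void getResourceRequirements(std::vector<ResourceRequirements>* resources) const {
    const std::size_t block_total_size_max =
        std::max<std::size_t>(1, m_device.lastLevelCacheSize() / sizeof(float));
    resources->push_back({block_total_size_max});
  }

 private:
  const Device& m_device;  // replaces the removed m_block_total_size_max member
};

int main() {
  Device device;
  Evaluator eval(device);
  std::vector<ResourceRequirements> resources;
  eval.getResourceRequirements(&resources);
  std::cout << resources[0].block_total_size_max << " scalars per block\n";
}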


@@ -120,7 +120,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   // Block based access to the XprType (input) tensor.
   using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
   using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
-  // We do block based broadcasting using a a trick with 2x tensor rank and 0
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
   // strides. See block method implementation for details.
   using BroadcastDimensions = DSizes<Index, 2 * NumDims>;
   using BroadcastTensorBlock = internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>;

@@ -589,8 +589,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
     // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
     // tensors. But this might need further tuning.
-    Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar);
-    Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars);
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.firstLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::TensorBlockShapeType::kSkewedInnerDims,


@@ -202,9 +202,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
        }
      }
-      m_block_total_size_max =
-          numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
    }
  }

@@ -290,9 +287,11 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::TensorBlockShapeType::kSkewedInnerDims,
-        m_block_total_size_max));
+        block_total_size_max));
     m_impl.getResourceRequirements(resources);
   }

@@ -370,13 +369,14 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   {
     Index inputIndex;
     if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
-        (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+        (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       inputIndex = index * m_inputStride + m_inputOffset;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
                (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+      // m_stride is aways greater than index, so let's avoid the integer
+      // division.
       eigen_assert(m_stride > index);
       inputIndex = index + m_inputOffset;
     } else {

@@ -392,7 +392,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
-  Index m_block_total_size_max;
   DSizes<Index, NumInputDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;


@@ -259,7 +259,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 #else
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
 #endif
-      : m_impl(op.expression(), device)
+      : m_device(device), m_impl(op.expression(), device)
 #ifdef EIGEN_USE_SYCL
       , m_op(op)
 #endif

@@ -404,9 +404,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     } else {
       m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
     }
-    m_block_total_size_max =
-        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -551,9 +548,11 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::TensorBlockShapeType::kSkewedInnerDims,
-        m_block_total_size_max));
+        block_total_size_max));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(

@@ -743,8 +742,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   internal::TensorIntDivisor<Index> m_fastOutputDepth;
   Scalar m_paddingValue;
-  Index m_block_total_size_max;
+  const Device& m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 #ifdef EIGEN_USE_SYCL
   // Required for SYCL in order to construct the expression tree on the device


@@ -560,9 +560,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
-    m_block_total_size_max =
-        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -672,9 +669,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::TensorBlockShapeType::kSkewedInnerDims,
-        m_block_total_size_max));
+        block_total_size_max));
     m_impl.getResourceRequirements(resources);
   }

@@ -761,7 +760,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   Dimensions m_dimensions;
   bool m_is_identity;
   const StartIndices m_offsets;
-  Index m_block_total_size_max;
 };

@@ -1047,9 +1045,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
       }
     }
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                              sizeof(Scalar));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -1128,7 +1123,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  std::size_t m_block_total_size_max;
   //use by sycl
   const StartIndices m_exprStartIndices;
   //use by sycl


@@ -572,9 +572,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
             : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                   ? m_preservedStrides[0]
                   : m_preservedStrides[NumOutputDims - 1];
-    m_block_total_size_max =
-        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -771,9 +768,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
         internal::TensorBlockShapeType::kSkewedInnerDims,
-        m_block_total_size_max));
+        block_total_size_max));
     m_impl.getResourceRequirements(resources);
   }

@@ -1204,9 +1203,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
   // Indexed by reduced dimensions.
   array<Index, NumReducedDims> m_reducedDims;
-  // Block size for tiled (aka TensorBlock) evaluation.
-  Index m_block_total_size_max;
   // Evaluator for the input expression.
   TensorEvaluator<ArgType, Device> m_impl;


@@ -124,8 +124,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
   using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_device(device),
+        m_impl(op.expression(), device),
+        m_shuffle(op.shufflePermutation())
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();

@@ -162,9 +165,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     for (int i = 0; i < NumDims; ++i) {
       m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
-    m_block_total_size_max =
-        numext::maxi<Index>(1, device.firstLevelCacheSize() / sizeof(Scalar));
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

@@ -226,9 +226,10 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
       std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    auto block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.firstLevelCacheSize() / sizeof(Scalar));
     resources->push_back(internal::TensorOpResourceRequirements(
-        internal::TensorBlockShapeType::kUniformAllDims,
-        m_block_total_size_max));
+        internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max));
     m_impl.getResourceRequirements(resources);
   }

@@ -384,7 +385,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_unshuffledInputStrides;
-  Index m_block_total_size_max;
+  const Device& m_device;
   TensorEvaluator<ArgType, Device> m_impl;
   /// required by sycl
   Shuffle m_shuffle;