Fix broken tensor executor test, allow tensor packets of size 1.

2025-03-07 18:27:40 +08:00 · 2022-03-07 20:30:37 +00:00 · 2022-03-07 20:30:37 +00:00 · 008ff3483a
commit 008ff3483a
parent 28c7c1a629
13 changed files with 28 additions and 56 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@ -1847,8 +1847,3 @@ product of 2 1d tensors (through contractions) returns a 0d tensor.
 *   The IndexList class requires a cxx11 compliant compiler. You can use an
    array of indices instead if you don't have access to a modern compiler.
 *   On GPUs only floating point values are properly tested and optimized for.
-*   Complex and integer values are known to be broken on GPUs. If you try to use
-    them you'll most likely end up triggering a static assertion failure such as
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@ -370,7 +370,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
  (Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@ -416,7 +415,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    // Concatenates m_broadcast[dim] copies,
    //    [v0, ..., vN, v0, ..., vN, ... ]
    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // Size of flattened tensor.
@ -445,7 +443,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    // Interleaves m_broadcast[dim] copies,
    //    [v0, v0, ..., v1, v1, ..., vN, vN, ... ]
    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize-1 < dimensions().TotalSize());

    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
@ -476,7 +473,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;
@ -534,7 +530,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@ -219,7 +219,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (isInnerChipping()) {
@ -447,8 +446,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
    if (this->isInnerChipping()) {
      // m_stride is equal to 1, so let's avoid the integer division.
      eigen_assert(this->m_stride == 1);
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@ -155,7 +155,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = PacketType<CoeffReturnType, Device>::size;
-    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@ -449,7 +449,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@ -551,7 +551,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
@ -609,7 +608,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@ -197,7 +197,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@ -837,7 +837,6 @@ static const bool RunningOnGPU = false;
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@ -215,7 +215,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // TODO(ndjaitly): write a better packing routine that uses
@ -448,7 +447,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x) {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // This code is pilfered from TensorMorphing.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@ -227,8 +227,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-        eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }

@ -399,8 +398,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@ -245,8 +245,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
-
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@ -421,7 +421,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@ -612,43 +612,42 @@ static void test_async_execute_binary_expr(Device d)
  }
 }

-#ifdef EIGEN_DONT_VECTORIZE
-#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
-#else
-#define VECTORIZABLE(VAL) VAL
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE 0
 #endif
+#define VECTORIZABLE(T, VAL) !EIGEN_DONT_VECTORIZE && Eigen::internal::packet_traits<T>::Vectorizable && VAL

 #define CALL_SUBTEST_PART(PART) \
  CALL_SUBTEST_##PART

 #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                                 \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     ColMajor>(default_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(default_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     RowMajor>(default_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(default_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device)));      \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));          \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device)));      \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device)));      \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));          \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device)));      \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,                  TiledEvaluation::Off, ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,                  TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(T, true),  TiledEvaluation::Off, ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(T, true),  TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,                  TiledEvaluation::Off, RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,                  TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(T, true),  TiledEvaluation::Off, RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(T, true),  TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::Off, ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::Off, ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::Off, RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::On,  RowMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::Off, RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::On,  RowMajor>(tp_device)))

 // NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
 #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                      \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));     \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device))); \
-  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::Off, ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::Off, ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::Off, RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,                  TiledEvaluation::On,  RowMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::Off, RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(T, true),  TiledEvaluation::On,  RowMajor>(tp_device)))

 EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;