Break loop dependence in TensorGenerator block access

2025-01-30 17:40:05 +08:00 · 2019-11-11 10:32:57 -08:00 · 2019-11-11 10:32:57 -08:00 · c952b8dfda
commit c952b8dfda
parent ebf04fb3e8
1 changed files with 22 additions and 7 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -269,20 +269,35 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>

    CoeffReturnType* block_buffer = block_storage.data();

+    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+    const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
    while (it[NumDims - 1].count < it[NumDims - 1].size) {
-      // Generate data for the inner-most dimension.
-      for (Index i = 0; i < it[0].size; ++i) {
-        *(block_buffer + offset + i) = m_generator(coords);
-        coords[is_col_major ? 0 : NumDims - 1]++;
+      Index i = 0;
+      // Generate data for the vectorized part of the inner-most dimension.
+      for (; i <= inner_dim_vectorized; i += packet_size) {
+        for (Index j = 0; j < packet_size; ++j) {
+          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
+          j_coords[inner_dim] += j;
+          *(block_buffer + offset + i + j) = m_generator(j_coords);
+        }
+        coords[inner_dim] += packet_size;
      }
-      coords[is_col_major ? 0 : NumDims - 1] =
-          initial_coords[is_col_major ? 0 : NumDims - 1];
+      // Finalize non-vectorized part of the inner-most dimension.
+      for (; i < inner_dim_size; ++i) {
+        *(block_buffer + offset + i) = m_generator(coords);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];

      // For the 1d tensor we need to generate only one inner-most dimension.
      if (NumDims == 1) break;

      // Update offset.
-      for (Index i = 1; i < NumDims; ++i) {
+      for (i = 1; i < NumDims; ++i) {
        if (++it[i].count < it[i].size) {
          offset += it[i].stride;
          coords[is_col_major ? i : NumDims - 1 - i]++;