Refined the cost of the striding operation.

2025-03-07 18:27:40 +08:00 · 2016-04-25 09:16:08 -07:00 · 2016-04-25 09:16:08 -07:00 · fd9401f260
commit fd9401f260
parent 5c372d19e3
1 changed files with 11 additions and 5 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@ -209,11 +209,17 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
-                                           2 * TensorOpCost::MulCost<Index>() +
-                                           TensorOpCost::DivCost<Index>());
-    return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+    double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
+                                           TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>()) +
+        TensorOpCost::MulCost<Index>();
+    if (vectorized) {
+      compute_cost *= 2;  // packet() computes two indices
+    }
+    const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+    return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+        // Computation is not vectorized per se, but it is done once per packet.
+        TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }