From f519fca72bde003d6a96144b79b62503429db30b Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Tue, 17 May 2016 16:06:00 -0700
Subject: [PATCH 1/2] Reduce overhead for small tensors and cheap ops by
 short-circuiting the cost computation and block size calculation in
 parallelFor.

---
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h |  3 --
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h  | 22 +++++++------
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 33 +++++++------------
 3 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index cb6fb46262..a76c8ca35a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-// Turn on the cost model by default
-#define EIGEN_USE_COST_MODEL
-
 namespace Eigen {
 
 /** \class TensorEvaluator
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 8683987538..0edd24a777 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -152,23 +152,25 @@ class TensorExecutor {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
-      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
-                         });
-#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
+      TensorOpCost cost;
       if (num_threads > 1) {
+        cost = evaluator.costPerCoeff(Vectorizable);
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+        device.parallelFor(
+            size, cost,
+            EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+            [&evaluator](Index first, Index last) {
+              EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first,
+                                                             last);
+            });
+#else
         Index blocksz = std::ceil(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -184,8 +186,8 @@ class TensorExecutor {
               &evaluator, numblocks * blocksize, size);
         }
         barrier.Wait();
+#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
       }
-#endif // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7dc..177d620d5f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -248,16 +248,15 @@ struct FullReducer {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
-    const TensorOpCost cost =
-        self.m_impl.costPerCoeff(Vectorizable) +
-        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
-                     PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-        num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
+    int num_threads = device.numThreads();
+    if (num_threads > 1) {
+      const TensorOpCost cost =
+          self.m_impl.costPerCoeff(Vectorizable) +
+          TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                       PacketSize);
+      num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+          num_coeffs, cost, device.numThreads());
+    }
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));

From 7df811cfe5d0047658de1cb4522c9c00d211b059 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Wed, 18 May 2016 15:09:48 -0700
Subject: [PATCH 2/2] Minor cleanups: 1. Get rid of unused variables. 2. Get
 rid of last uses of EIGEN_USE_COST_MODEL.

---
 .../src/Tensor/TensorContractionThreadPool.h |  4 ----
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h |  3 ---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h  |  8 +++-----
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 18 +++---------------
 4 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 88d485f384..98fe6f5427 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -568,10 +568,6 @@ struct TensorEvaluator {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                          [&evaluator](Index first, Index last) {
@@ -160,15 +159,14 @@ class TensorExecutor {
                          });
 #else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
         Index blocksz = std::ceil(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -185,7 +183,7 @@ class TensorExecutor {
         }
         barrier.Wait();
       }
-#endif // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#endif
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7dc..8b10c11204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -248,16 +248,12 @@ struct FullReducer {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
     const TensorOpCost cost =
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                      PacketSize);
     const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +468,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
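
For readers skimming the patches, the following standalone sketch illustrates the short-circuit pattern both commits aim at: the per-coefficient cost estimate and the block-size math are only reached when more than one thread can actually be used. This is not Eigen code; OpCost, estimateThreads, and the free-function parallelFor below are hypothetical placeholders standing in for TensorOpCost, TensorCostModel<Device>::numThreads, and the executor's parallel path, and the cost numbers are made up.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

// Placeholder cost descriptor, standing in for Eigen's TensorOpCost.
struct OpCost {
  double per_coeff_ns = 1.0;  // rough cost of producing one coefficient
};

// Placeholder for a cost model: cap the thread count so each thread gets at
// least ~min_work_ns worth of work.
static int estimateThreads(std::ptrdiff_t size, const OpCost& cost, int max_threads) {
  const double total_ns = static_cast<double>(size) * cost.per_coeff_ns;
  const double min_work_ns = 100000.0;  // arbitrary granularity threshold
  const int wanted = static_cast<int>(total_ns / min_work_ns);
  return std::max(1, std::min(max_threads, wanted));
}

// Evaluate kernel(first, last) over [0, size). Mirrors the patched executor:
// the cost model is consulted only when parallelism is possible, and the
// single-thread path skips block-size calculation and thread launches.
static void parallelFor(std::ptrdiff_t size, const OpCost& cost, int device_threads,
                        const std::function<void(std::ptrdiff_t, std::ptrdiff_t)>& kernel) {
  int num_threads = device_threads;
  if (num_threads > 1) {
    num_threads = estimateThreads(size, cost, num_threads);  // the short-circuited step
  }
  if (num_threads == 1) {
    kernel(0, size);  // cheap path for small tensors and cheap ops
    return;
  }
  const std::ptrdiff_t block = (size + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (std::ptrdiff_t first = 0; first < size; first += block) {
    const std::ptrdiff_t last = std::min(size, first + block);
    workers.emplace_back(kernel, first, last);
  }
  for (auto& w : workers) w.join();
}

int main() {
  std::vector<float> out(1 << 20);
  OpCost cost;
  cost.per_coeff_ns = 2.0;  // pretend each coefficient costs ~2 ns
  const int hw = static_cast<int>(std::thread::hardware_concurrency());
  parallelFor(static_cast<std::ptrdiff_t>(out.size()), cost, hw > 0 ? hw : 1,
              [&out](std::ptrdiff_t first, std::ptrdiff_t last) {
                for (std::ptrdiff_t i = first; i < last; ++i)
                  out[i] = std::sqrt(static_cast<float>(i));
              });
  std::printf("out[42] = %f\n", out[42]);
  return 0;
}

The design point the sketch tries to make explicit: for a tiny or cheap expression the whole cost/blocking machinery is overhead, so the check on the device's thread count happens before any cost is computed, and a result of one thread falls straight through to a serial loop.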