From f519fca72bde003d6a96144b79b62503429db30b Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Tue, 17 May 2016 16:06:00 -0700
Subject: [PATCH 1/2] Reduce overhead for small tensors and cheap ops by
 short-circuiting the cost computation and block size calculation in
 parallelFor.

---
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h |  3 --
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h  | 22 +++++++------
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 33 +++++++------------
 3 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index cb6fb46262..a76c8ca35a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,9 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-// Turn on the cost model by default
-#define EIGEN_USE_COST_MODEL
-
 namespace Eigen {
 
 /** \class TensorEvaluator
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 8683987538..0edd24a777 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -152,23 +152,25 @@ class TensorExecutor {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
-      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
-                         });
-#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
+      TensorOpCost cost;
       if (num_threads > 1) {
+        cost = evaluator.costPerCoeff(Vectorizable);
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+        device.parallelFor(
+            size, cost,
+            EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+            [&evaluator](Index first, Index last) {
+              EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first,
+                                                             last);
+            });
+#else
         Index blocksz = std::ceil(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -184,8 +186,8 @@ class TensorExecutor {
               &evaluator, numblocks * blocksize, size);
         }
         barrier.Wait();
+#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
       }
-#endif // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7dc..177d620d5f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -248,16 +248,15 @@ struct FullReducer {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
-    const TensorOpCost cost =
-        self.m_impl.costPerCoeff(Vectorizable) +
-        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
-                     PacketSize);
-    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-        num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
+    int num_threads = device.numThreads();
+    if (num_threads > 1) {
+      const TensorOpCost cost =
+          self.m_impl.costPerCoeff(Vectorizable) +
+          TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                       PacketSize);
+      num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+          num_coeffs, cost, device.numThreads());
+    }
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));

From 7df811cfe5d0047658de1cb4522c9c00d211b059 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Wed, 18 May 2016 15:09:48 -0700
Subject: [PATCH 2/2] Minor cleanups: 1. Get rid of unused variables. 2. Get
 rid of last uses of EIGEN_USE_COST_MODEL.

---
 .../src/Tensor/TensorContractionThreadPool.h |  4 ----
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h |  3 ---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h  |  8 +++-----
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 18 +++---------------
 4 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 88d485f384..98fe6f5427 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -568,10 +568,6 @@ struct TensorEvaluator {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                          [&evaluator](Index first, Index last) {
@@ -160,15 +159,14 @@ class TensorExecutor {
                          });
 #else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
         Index blocksz = std::ceil(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -185,7 +183,7 @@ class TensorExecutor {
         }
         barrier.Wait();
       }
-#endif // defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+#endif
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7dc..8b10c11204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -248,16 +248,12 @@ struct FullReducer {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
     const TensorOpCost cost =
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                      PacketSize);
     const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -472,22 +468,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
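
For readers skimming the patches, the following standalone sketch illustrates the short-circuit pattern both commits aim at: the per-coefficient cost estimate and the block-size math are only reached when more than one thread can actually be used. This is not Eigen code; OpCost, estimateThreads, and the free-function parallelFor below are hypothetical placeholders standing in for TensorOpCost, TensorCostModel<Device>::numThreads, and the executor's parallel path, and the cost numbers are made up.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

// Placeholder cost descriptor, standing in for Eigen's TensorOpCost.
struct OpCost {
  double per_coeff_ns = 1.0;  // rough cost of producing one coefficient
};

// Placeholder for a cost model: cap the thread count so each thread gets at
// least ~min_work_ns worth of work.
static int estimateThreads(std::ptrdiff_t size, const OpCost& cost, int max_threads) {
  const double total_ns = static_cast<double>(size) * cost.per_coeff_ns;
  const double min_work_ns = 100000.0;  // arbitrary granularity threshold
  const int wanted = static_cast<int>(total_ns / min_work_ns);
  return std::max(1, std::min(max_threads, wanted));
}

// Evaluate kernel(first, last) over [0, size). Mirrors the patched executor:
// the cost model is consulted only when parallelism is possible, and the
// single-thread path skips block-size calculation and thread launches.
static void parallelFor(std::ptrdiff_t size, const OpCost& cost, int device_threads,
                        const std::function<void(std::ptrdiff_t, std::ptrdiff_t)>& kernel) {
  int num_threads = device_threads;
  if (num_threads > 1) {
    num_threads = estimateThreads(size, cost, num_threads);  // the short-circuited step
  }
  if (num_threads == 1) {
    kernel(0, size);  // cheap path for small tensors and cheap ops
    return;
  }
  const std::ptrdiff_t block = (size + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (std::ptrdiff_t first = 0; first < size; first += block) {
    const std::ptrdiff_t last = std::min(size, first + block);
    workers.emplace_back(kernel, first, last);
  }
  for (auto& w : workers) w.join();
}

int main() {
  std::vector<float> out(1 << 20);
  OpCost cost;
  cost.per_coeff_ns = 2.0;  // pretend each coefficient costs ~2 ns
  const int hw = static_cast<int>(std::thread::hardware_concurrency());
  parallelFor(static_cast<std::ptrdiff_t>(out.size()), cost, hw > 0 ? hw : 1,
              [&out](std::ptrdiff_t first, std::ptrdiff_t last) {
                for (std::ptrdiff_t i = first; i < last; ++i)
                  out[i] = std::sqrt(static_cast<float>(i));
              });
  std::printf("out[42] = %f\n", out[42]);
  return 0;
}

The design point the sketch tries to make explicit: for a tiny or cheap expression the whole cost/blocking machinery is overhead, so the check on the device's thread count happens before any cost is computed, and a result of one thread falls straight through to a serial loop.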