Reduce dispatch overhead in parallelFor by calling thread_pool.Schedule() for only one of the two recursive calls in handleRange. Instead of going through the Schedule path to push both recursive calls onto other thread queues in the binary tree, one of them is now executed directly on the calling thread. At the leaf level this will still activate a full complement of threads, but it will save up to 50% of the overhead in Schedule (random number generation and insertion into the queue, which includes signaling via atomics).

2024-12-21 07:19:46 +08:00 · 2016-11-14 14:18:16 -08:00 · 2016-11-14 14:18:16 -08:00 · 32df1b1046
commit 32df1b1046
parent 0ee92aa38e
1 changed files with 1 additions and 1 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -256,7 +256,7 @@ struct ThreadPoolDevice {
      // Split into halves and submit to the pool.
      Index mid = first + divup((last - first) / 2, block_size) * block_size;
      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+      handleRange(first, mid);
    };
    handleRange(0, n);
    barrier.Wait();