Merged in rmlarsen/eigen2 (pull request PR-409)

Fix oversharding bug in parallelFor.
2024-12-15 07:10:37 +08:00 · 2018-06-21 18:34:57 +00:00 · 2018-06-21 18:34:57 +00:00 · b6ffcd22e3
commit b6ffcd22e3
parent 4cc32d80fd 5418154a45
1 changed file with 7 additions and 4 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -189,9 +189,11 @@ struct ThreadPoolDevice {
    // of blocks to be evenly dividable across threads.

    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
-    const Index max_block_size =
-        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+                               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
    if (block_align) {
      Index new_block_size = block_align(block_size);
      eigen_assert(new_block_size >= block_size);
@@ -205,7 +207,8 @@ struct ThreadPoolDevice {
        (divup<int>(block_count, numThreads()) * numThreads());
    // Now try to increase block size up to max_block_size as long as it
    // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
      // This is the next block size that divides size into a smaller number
      // of blocks than the current block_size.
      Index coarser_block_size = divup(n, prev_block_count - 1);