Avoid a division in NonBlockingThreadPool::Steal.

Looking at profiles we spend ~10-20% of Steal on simply computing
random % size. We can reduce random 32-bit int into [0, size) range with
a single multiplication and shift. This transformation is described in
https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
This commit is contained in:
Ilya Tokar 2020-02-14 16:02:57 -05:00
parent 7769600245
commit eb6cc29583

View File

@ -335,8 +335,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
PerThread* pt = GetPerThread();
const size_t size = limit - start;
unsigned r = Rand(&pt->rand);
unsigned victim = r % size;
unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
// Reduce r into [0, size) range, this utilizes trick from
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
unsigned inc = all_coprimes_[size - 1][index];
for (unsigned i = 0; i < size; i++) {
eigen_plain_assert(start + victim < limit);