mirror of https://gitlab.com/libeigen/eigen.git
Improved the blocking strategy to speed up multithreaded tensor contractions.
parent 0eb220c00d
commit 5401fbcc50
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    Index k_cache = (l1-ksub)/kdiv;
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
     if (k_cache < k) {
       k = k_cache & k_mask;
       eigen_internal_assert(k > 0);
     }
 
-    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    Index n_per_thread = numext::div_ceil(n, num_threads);
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
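To make the k/n step concrete, here is a minimal standalone sketch with assumed numbers (32KB L1, 256KB L2, float scalars, mr = 8, nr = 4, kr = 8, four threads); the constant names mirror the enum in evaluateProductBlockingSizesHeuristic, but none of the values are taken from the commit:

#include <algorithm>
#include <cstdio>

int main() {
  typedef long Index;
  // Assumed cache sizes and register-block shape; illustrative only.
  const Index l1 = 32 * 1024, l2 = 256 * 1024;
  const Index mr = 8, nr = 4, kr = 8, scalar_size = 4;      // float
  const Index kdiv = mr * scalar_size + nr * scalar_size;   // L1 bytes consumed per unit of k
  const Index ksub = mr * nr * scalar_size;                 // one mr x nr block of "C"
  const Index k_mask = (0xffffffff / kr) * kr;
  const Index nr_mask = (0xffffffff / nr) * nr;

  Index k = 1024, n = 4096, num_threads = 4;

  // Uncapped, (l1 - ksub) / kdiv = 680 here; beyond ~320 the latency of
  // prefetching "C" is already hidden, so the cap stops k from growing.
  const Index k_cache = (std::min<Index>)((l1 - ksub) / kdiv, 320);
  if (k_cache < k) k = k_cache & k_mask;  // round down to a multiple of kr

  // Fit nr columns of depth k into the L2 space not shadowed by L1.
  const Index n_cache = (l2 - l1) / (nr * scalar_size * k);
  const Index n_per_thread = (n + num_threads - 1) / num_threads;
  if (n_cache <= n_per_thread) n = n_cache & nr_mask;

  std::printf("k = %ld, n = %ld\n", k, n);  // prints k = 320, n = 44
}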
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      Index m_per_thread = numext::div_ceil(m, num_threads);
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
         eigen_internal_assert(m > 0);
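The second hunk's per-thread split of the shared L3 can be sketched the same way; again, all sizes here (8MB L3, four threads, float lhs) are assumptions for illustration, not values from the commit:

#include <cstdio>

int main() {
  typedef long Index;
  // Assumed topology: per-core 256KB L2, 8MB L3 shared by all threads.
  const Index l2 = 256 * 1024, l3 = 8 * 1024 * 1024;
  const Index mr = 8, lhs_scalar_size = 4;                 // float lhs
  const Index mr_mask = (0xffffffff / mr) * mr;

  Index m = 8192, k = 320, num_threads = 4;

  if (l3 > l2) {
    // l3 is shared between all cores, so each thread gets its own chunk:
    // (l3 - l2) / num_threads bytes for a packed lhs block of depth k.
    const Index m_cache = (l3 - l2) / (lhs_scalar_size * k * num_threads);
    const Index m_per_thread = (m + num_threads - 1) / num_threads;
    if (m_cache < m_per_thread && m_cache >= mr)
      m = m_cache & mr_mask;                               // multiple of mr
  }

  std::printf("m = %ld\n", m);  // prints m = 1584 with these numbers
}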