x86: Make the divisor in setting non_temporal_threshold cpu specific

Different systems prefer different divisors.

From benchmarks[1] so far the following divisors have been found:
    ICX     : 2
    SKX     : 2
    BWD     : 8

For Intel, we are generalizing that BWD and older prefer 8 as a
divisor, and SKL and newer prefer 2. This number can be further tuned
as benchmarks are run.

[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks
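To make the divisor's effect concrete, here is a small standalone C sketch (illustration only, not glibc code; the 32 MiB per-thread cache share is a made-up figure) showing the default non-temporal threshold each divisor would yield:

/* Illustration only: how the divisor maps to the default threshold.  */
#include <stdio.h>

int
main (void)
{
  unsigned long int shared = 32UL * 1024 * 1024;  /* hypothetical per-thread L3 share */
  unsigned long int divisors[] = { 2, 4, 8 };     /* e.g. SKX/ICX, default, BWD */

  for (int i = 0; i < 3; i++)
    printf ("divisor %lu -> threshold %lu MiB\n",
            divisors[i], shared / divisors[i] / (1024 * 1024));
  return 0;
}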
Reviewed-by: DJ Delorie <dj@redhat.com>
Noah Goldstein 2023-06-07 13:18:03 -05:00
parent f193ea20ed
commit 180897c161
4 changed files with 51 additions and 26 deletions


@@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
{
@@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Bigcore/Default Tuning. */
default:
default_tuning:
/* Unknown family 0x06 processors. Assuming this is one
of Core i3/i5/i7 processors if AVX is available. */
if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
break;
/* Fall through. */
case INTEL_BIGCORE_NEHALEM:
case INTEL_BIGCORE_WESTMERE:
enable_modern_features:
/* Rep string instructions, unaligned load, unaligned copy,
and pminub are fast on Intel Core i3, i5 and i7. */
cpu_features->preferred[index_arch_Fast_Rep_String]
@@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features)
| bit_arch_Prefer_PMINUB_for_stringop);
break;
/* Default tuned Bigcore microarch. */
case INTEL_BIGCORE_NEHALEM:
case INTEL_BIGCORE_WESTMERE:
/* Older CPUs prefer non-temporal stores at lower threshold. */
cpu_features->cachesize_non_temporal_divisor = 8;
goto enable_modern_features;
/* Older Bigcore microarch (smaller non-temporal store
threshold). */
case INTEL_BIGCORE_SANDYBRIDGE:
case INTEL_BIGCORE_IVYBRIDGE:
case INTEL_BIGCORE_HASWELL:
case INTEL_BIGCORE_BROADWELL:
cpu_features->cachesize_non_temporal_divisor = 8;
goto default_tuning;
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
case INTEL_BIGCORE_SKYLAKE:
case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_COMETLAKE:
@@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_SAPPHIRERAPIDS:
case INTEL_BIGCORE_EMERALDRAPIDS:
case INTEL_BIGCORE_GRANITERAPIDS:
cpu_features->cachesize_non_temporal_divisor = 2;
goto default_tuning;
/* Default tuned Mixed (bigcore + atom SOC). */
case INTEL_MIXED_LAKEFIELD:
case INTEL_MIXED_ALDERLAKE:
cpu_features->cachesize_non_temporal_divisor = 2;
goto default_tuning;
}
/* Disable TSX on some processors to avoid TSX on kernels that
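For readers less used to the goto-based dispatch in this switch, the following is a minimal self-contained sketch of the same pattern (the enum, struct, and function names are illustrative, not glibc identifiers): each microarch case records its divisor and then jumps to the shared tuning path.

/* Sketch of the dispatch pattern above; names are illustrative.  */
struct features
{
  unsigned long int cachesize_non_temporal_divisor;
};

enum uarch { UARCH_BROADWELL, UARCH_SKYLAKE, UARCH_UNKNOWN };

static void
tune_divisor (struct features *f, enum uarch u)
{
  /* Baseline set up front, as in the first hunk.  */
  f->cachesize_non_temporal_divisor = 4;
  switch (u)
    {
    case UARCH_BROADWELL:
      /* Older big cores: smaller non-temporal threshold (1/8 of cache).  */
      f->cachesize_non_temporal_divisor = 8;
      goto default_tuning;
    case UARCH_SKYLAKE:
      /* Newer big cores: larger non-temporal threshold (1/2 of cache).  */
      f->cachesize_non_temporal_divisor = 2;
      goto default_tuning;
    default:
    default_tuning:
      /* Tuning shared by every microarch continues here.  */
      break;
    }
}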


@@ -738,19 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->level3_cache_linesize = level3_cache_linesize;
cpu_features->level4_cache_size = level4_cache_size;
/* The default setting for the non_temporal threshold is 1/4 of size
of the chip's cache. For most Intel and AMD processors with an
initial release date between 2017 and 2023, a thread's typical
share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
estimate the point where non-temporal stores begin out-competing
REP MOVSB, as well as the point where the write-back of non-temporal
stores to main memory would have already occurred for the
majority of the lines in the copy. Note, concerns about the
entire L3 cache being evicted by the copy are mostly alleviated
by the fact that modern HW detects streaming patterns and
provides proper LRU hints so that the maximum thrashing is
capped at 1/associativity. */
unsigned long int non_temporal_threshold = shared / 4;
unsigned long int cachesize_non_temporal_divisor
= cpu_features->cachesize_non_temporal_divisor;
if (cachesize_non_temporal_divisor <= 0)
cachesize_non_temporal_divisor = 4;
/* The default setting for the non_temporal threshold is [1/8, 1/2] of the size
of the chip's cache (depending on `cachesize_non_temporal_divisor`, which
is microarch specific; the default is 1/4). For most Intel and AMD
processors with an initial release date between 2017 and 2023, a thread's
typical share of the cache is from 18-64MB. Using a reasonable
fraction of L3 is meant to estimate the point where non-temporal stores
begin out-competing REP MOVSB, as well as the point where the write-back
of non-temporal stores to main memory would have already occurred
for the majority of the lines in the copy. Note, concerns about the entire
L3 cache being evicted by the copy are mostly alleviated by the fact that
modern HW detects streaming patterns and provides proper LRU hints so that
the maximum thrashing is capped at 1/associativity. */
unsigned long int non_temporal_threshold
= shared / cachesize_non_temporal_divisor;
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
a higher risk of actually thrashing the cache as they don't have a HW LRU
hint. As well, their performance in highly parallel situations is
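Condensing the new logic in this hunk, a hedged sketch (variable names follow the hunk; the surrounding glibc plumbing is omitted) of how the default threshold is derived:

/* Fall back to a divisor of 4 if it was never set, then take that
   fraction of the shared cache as the default threshold.  */
static unsigned long int
default_non_temporal_threshold (unsigned long int shared,
                                unsigned long int cachesize_non_temporal_divisor)
{
  if (cachesize_non_temporal_divisor == 0)
    cachesize_non_temporal_divisor = 4;
  return shared / cachesize_non_temporal_divisor;
}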


@@ -113,8 +113,11 @@ _dl_diagnostics_cpu (void)
cpu_features->level3_cache_linesize);
print_cpu_features_value ("level4_cache_size",
cpu_features->level4_cache_size);
_Static_assert (offsetof (struct cpu_features, level4_cache_size)
+ sizeof (cpu_features->level4_cache_size)
== sizeof (*cpu_features),
"last cpu_features field has been printed");
print_cpu_features_value ("cachesize_non_temporal_divisor",
cpu_features->cachesize_non_temporal_divisor);
_Static_assert (
offsetof (struct cpu_features, cachesize_non_temporal_divisor)
+ sizeof (cpu_features->cachesize_non_temporal_divisor)
== sizeof (*cpu_features),
"last cpu_features field has been printed");
}
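The _Static_assert in this hunk keeps the diagnostics output in sync with the struct layout: it only compiles while the field just printed is the final member of struct cpu_features. A stand-alone sketch of the same trick (the struct and field names here are made up):

#include <stddef.h>

struct demo
{
  int first;
  unsigned long int last;
};

/* Fails to compile if a member is ever added after 'last' without
   updating the code that prints it.  */
_Static_assert (offsetof (struct demo, last)
                + sizeof (((struct demo *) 0)->last)
                == sizeof (struct demo),
                "last demo field has been printed");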


@@ -945,6 +945,9 @@ struct cpu_features
unsigned long int level3_cache_linesize;
/* /_SC_LEVEL4_CACHE_SIZE. */
unsigned long int level4_cache_size;
/* When no user non_temporal_threshold is specified, we default to
cachesize / cachesize_non_temporal_divisor. */
unsigned long int cachesize_non_temporal_divisor;
};
/* Get a pointer to the CPU features structure. */