znver3 tuning part 1

2021-03-15  Jan Hubicka  <hubicka@ucw.cz>

	* config/i386/i386-options.c (processor_cost_table): Add znver3_cost.
	* config/i386/x86-tune-costs.h (znver3_cost): New global variable;
	copy of znver2_cost.
Jan Hubicka 2021-03-15 11:36:52 +01:00
parent 52654036a5
commit 5b32a1817d
2 changed files with 135 additions and 1 deletion
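
For context, processor_cost_table is indexed by the -mtune target, and the
backend prices operations by reading fields of the selected structure.  A
minimal standalone sketch of that pattern, with simplified names and made-up
cost values (not GCC's actual code):

#include <stdio.h>

struct processor_costs { int add; int mult_si; };

enum processor_type { PROCESSOR_ZNVER2, PROCESSOR_ZNVER3, PROCESSOR_max };

static const struct processor_costs znver2_cost = { 4, 12 };
static const struct processor_costs znver3_cost = { 4, 12 };  /* starts as a copy */

static const struct processor_costs *processor_cost_table[] = {
  &znver2_cost,
  &znver3_cost
};

int main (void)
{
  enum processor_type tune = PROCESSOR_ZNVER3;  /* e.g. -mtune=znver3 */
  const struct processor_costs *cost = processor_cost_table[tune];
  printf ("SImode multiply cost: %d\n", cost->mult_si);
  return 0;
}

Starting znver3_cost as a byte-for-byte copy of znver2_cost keeps behaviour
unchanged while giving later patches a separate table to tune, which is what
the "part 1" in the title suggests.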

config/i386/i386-options.c

@@ -743,7 +743,7 @@ static const struct processor_costs *processor_cost_table[] =
   &btver2_cost,
   &znver1_cost,
   &znver2_cost,
-  &znver2_cost
+  &znver3_cost
 };
 /* Guarantee that the array is aligned with enum processor_type. */
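
The comment above refers to a compile-time guarantee.  Appended to the sketch
from earlier, one way to express it (an illustration of the idea, not
necessarily the exact check in i386-options.c):

/* Fails to compile if processor_type gains an enumerator without a
   matching table entry.  */
_Static_assert (sizeof (processor_cost_table)
                / sizeof (processor_cost_table[0]) == PROCESSOR_max,
                "processor_cost_table out of sync with processor_type");

GCC itself is built as C++ and would use static_assert; the C11
_Static_assert above shows the same idea.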

config/i386/x86-tune-costs.h

@@ -1688,6 +1688,140 @@ struct processor_costs znver2_cost = {
"16", /* Func alignment. */
};
struct processor_costs znver3_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle.  Because reg-reg move cost is 2 and the following tables
correspond to doubles of latencies, we do not model this correctly.  It
does not seem to make a practical difference to bump prices up even
more.  */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
6, 6, /* SSE->integer and integer->SSE
moves. */
8, 8, /* mask->integer and integer->mask moves */
{6, 6, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost of storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
/* Depending on parameters, idiv can get faster on Ryzen.  These are
upper bounds. */
{COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (22), /* HI. */
COSTS_N_INSNS (30), /* SI. */
COSTS_N_INSNS (45), /* DI. */
COSTS_N_INSNS (45)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
6, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 8, 16}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
/* The 128-bit VGATHERDPD is 23 uops with throughput 9; the 256-bit form
is 35 uops with throughput 12.  Approximately 9 uops do not depend on the
vector size and every load is 7 uops. */
18, 8, /* Gather load static, per_elt. */
18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued.  We set the number of simultaneous prefetches
to a large constant to reflect this (it is probably not a good idea to
leave the number of prefetches entirely unlimited, as their execution also
takes some time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13. */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle.  FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications, thus reassociation may make sense up to a width of 6.
SPEC2k6 benchmarks suggest that 4 works better than 6, probably due to
register pressure.
Integer vector operations are handled by the FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply.  This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
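
Two notes on reading the numbers above.  COSTS_N_INSNS is GCC's cost scale
from rtl.h, where it expands to (N) * 4, so one cost unit is a quarter of a
typical instruction.  And the gather entries follow the uop model in the
comment: roughly 9 uops independent of vector size plus 7 per loaded element.
A standalone check of that arithmetic (illustration only; how the uop counts
map to the "18, 8" static and per-element values is not spelled out here):

#include <stdio.h>

#define COSTS_N_INSNS(N) ((N) * 4)  /* same definition as GCC's rtl.h */

int main (void)
{
  int fixed = 9, per_load = 7;
  printf ("VGATHERDPD, 2 elements: %d uops\n", fixed + 2 * per_load);  /* 23, as measured */
  printf ("VGATHERDPD, 4 elements: %d uops\n", fixed + 4 * per_load);  /* 37, close to the measured 35 */
  printf ("FDIV cost: %d units\n", COSTS_N_INSNS (15));                /* 60 */
  return 0;
}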
/* skylake_cost should produce code tuned for the Skylake family of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},