Enable AMD znver4 support and add instruction reservations

2022-09-28  Tejas Joshi <TejasSanjay.Joshi@amd.com>

gcc/ChangeLog:

	* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver4.
	* common/config/i386/i386-common.cc (processor_names): Add znver4.
	(processor_alias_table): Add znver4 and modularize old znvers.
	* common/config/i386/i386-cpuinfo.h (processor_subtypes): Add
	AMDFAM19H_ZNVER4.
	* config.gcc (x86_64-*-* |...): Likewise.
	* config/i386/driver-i386.cc (host_detect_local_cpu): Let
	-march=native recognize znver4 cpus.
	* config/i386/i386-c.cc (ix86_target_macros_internal): Add znver4.
	* config/i386/i386-options.cc (m_ZNVER4): New definition.
	(m_ZNVER): Include m_ZNVER4.
	(processor_cost_table): Add znver4.
	* config/i386/i386.cc (ix86_reassociation_width): Likewise.
	* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER4.
	(PTA_ZNVER1): New definition.
	(PTA_ZNVER2): Likewise.
	(PTA_ZNVER3): Likewise.
	(PTA_ZNVER4): Likewise.
	* config/i386/i386.md (define_attr "cpu"): Add znver4 and rename
	md file.
	* config/i386/x86-tune-costs.h (znver4_cost): New definition.
	* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver4.
	(ix86_adjust_cost): Likewise.
	* config/i386/znver1.md: Rename to znver.md.
	* config/i386/znver.md: Add new reservations for znver4.
	* doc/extend.texi: Add details about znver4.
	* doc/invoke.texi: Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/funcspec-56.inc: Handle new march.
	* g++.target/i386/mv29.C: Likewise.
This commit is contained in:
Tejas Joshi 2022-06-28 16:33:53 +05:30 committed by Venkataramanan Kumar
parent 88b34661f7
commit bf3b532b52
17 changed files with 1036 additions and 71 deletions

View File

@ -253,13 +253,27 @@ get_amd_cpu (struct __processor_model *cpu_model,
break;
case 0x19:
cpu_model->__cpu_type = AMDFAM19H;
/* AMD family 19h version 1. */
/* AMD family 19h. */
if (model <= 0x0f)
{
cpu = "znver3";
CHECK___builtin_cpu_is ("znver3");
cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
else if ((model >= 0x10 && model <= 0x1f)
|| (model >= 0x60 && model <= 0xaf))
{
cpu = "znver4";
CHECK___builtin_cpu_is ("znver4");
cpu_model->__cpu_subtype = AMDFAM19H_ZNVER4;
}
else if (has_cpu_feature (cpu_model, cpu_features2,
FEATURE_AVX512F))
{
cpu = "znver4";
CHECK___builtin_cpu_is ("znver4");
cpu_model->__cpu_subtype = AMDFAM19H_ZNVER4;
}
else if (has_cpu_feature (cpu_model, cpu_features2,
FEATURE_VAES))
{

View File

@ -1868,7 +1868,8 @@ const char *const processor_names[] =
"btver2",
"znver1",
"znver2",
"znver3"
"znver3",
"znver4"
};
/* Guarantee that the array is aligned with enum processor_type. */
@ -2104,37 +2105,17 @@ const pta processor_alias_table[] =
| PTA_MOVBE | PTA_MWAITX,
M_CPU_SUBTYPE (AMDFAM15H_BDVER4), P_PROC_AVX2},
{"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
| PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
| PTA_SHA | PTA_LZCNT | PTA_POPCNT,
PTA_ZNVER1,
M_CPU_SUBTYPE (AMDFAM17H_ZNVER1), P_PROC_AVX2},
{"znver2", PROCESSOR_ZNVER2, CPU_ZNVER2,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
| PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
| PTA_SHA | PTA_LZCNT | PTA_POPCNT | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD,
PTA_ZNVER2,
M_CPU_SUBTYPE (AMDFAM17H_ZNVER2), P_PROC_AVX2},
{"znver3", PROCESSOR_ZNVER3, CPU_ZNVER3,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
| PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
| PTA_SHA | PTA_LZCNT | PTA_POPCNT | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD | PTA_VAES | PTA_VPCLMULQDQ | PTA_PKU,
PTA_ZNVER3,
M_CPU_SUBTYPE (AMDFAM19H_ZNVER3), P_PROC_AVX2},
{"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
PTA_ZNVER4,
M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
{"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW

View File

@ -92,6 +92,7 @@ enum processor_subtypes
AMDFAM19H_ZNVER3,
INTEL_COREI7_ROCKETLAKE,
ZHAOXIN_FAM7H_LUJIAZUI,
AMDFAM19H_ZNVER4,
CPU_SUBTYPE_MAX
};

View File

@ -660,7 +660,7 @@ c7 esther"
# 64-bit x86 processors supported by --with-arch=. Each processor
# MUST be separated by exactly one space.
x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
bdver3 bdver4 znver1 znver2 znver3 btver1 btver2 k8 k8-sse3 opteron \
bdver3 bdver4 znver1 znver2 znver3 znver4 btver1 btver2 k8 k8-sse3 opteron \
opteron-sse3 nocona core2 corei7 corei7-avx core-avx-i core-avx2 atom \
slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
@ -3643,6 +3643,10 @@ case ${target} in
arch=znver3
cpu=znver3
;;
znver4-*)
arch=znver4
cpu=znver4
;;
bdver4-*)
arch=bdver4
cpu=bdver4
@ -3771,6 +3775,10 @@ case ${target} in
znver3-*)
arch=znver3
cpu=znver3
;;
znver4-*)
arch=znver4
cpu=znver4
;;
bdver4-*)
arch=bdver4

View File

@ -465,6 +465,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
processor = PROCESSOR_GEODE;
else if (has_feature (FEATURE_MOVBE) && family == 22)
processor = PROCESSOR_BTVER2;
else if (has_feature (FEATURE_AVX512F))
processor = PROCESSOR_ZNVER4;
else if (has_feature (FEATURE_VAES))
processor = PROCESSOR_ZNVER3;
else if (has_feature (FEATURE_CLWB))
@ -779,6 +781,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
case PROCESSOR_ZNVER3:
cpu = "znver3";
break;
case PROCESSOR_ZNVER4:
cpu = "znver4";
break;
case PROCESSOR_BTVER1:
cpu = "btver1";
break;

View File

@ -132,6 +132,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__znver3");
def_or_undef (parse_in, "__znver3__");
break;
case PROCESSOR_ZNVER4:
def_or_undef (parse_in, "__znver4");
def_or_undef (parse_in, "__znver4__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__btver1");
def_or_undef (parse_in, "__btver1__");
@ -330,6 +334,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_ZNVER3:
def_or_undef (parse_in, "__tune_znver3__");
break;
case PROCESSOR_ZNVER4:
def_or_undef (parse_in, "__tune_znver4__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__tune_btver1__");
break;

View File

@ -154,11 +154,12 @@ along with GCC; see the file COPYING3. If not see
#define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
#define m_ZNVER2 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER2)
#define m_ZNVER3 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER3)
#define m_ZNVER4 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER4)
#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
| m_ZNVER)
@ -773,7 +774,8 @@ static const struct processor_costs *processor_cost_table[] =
&btver2_cost,
&znver1_cost,
&znver2_cost,
&znver3_cost
&znver3_cost,
&znver4_cost
};
/* Guarantee that the array is aligned with enum processor_type. */

View File

@ -23079,7 +23079,7 @@ ix86_reassociation_width (unsigned int op, machine_mode mode)
/* Integer vector instructions execute in FP unit
and can execute 3 additions and one multiplication per cycle. */
if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
|| ix86_tune == PROCESSOR_ZNVER3)
|| ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
&& INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;

View File

@ -2255,6 +2255,7 @@ enum processor_type
PROCESSOR_ZNVER1,
PROCESSOR_ZNVER2,
PROCESSOR_ZNVER3,
PROCESSOR_ZNVER4,
PROCESSOR_max
};
@ -2347,6 +2348,21 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX
| PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI;
constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
| PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
/* ISA feature sets for the AMD znver family.  PTA_ZNVER1 spells out the
   base set; each later generation is defined as its predecessor's set plus
   the extensions listed in its own definition.  */
constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2
| PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT
| PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES | PTA_SHA | PTA_LZCNT
| PTA_POPCNT;
/* znver2 = znver1 + CLWB, RDPID, WBNOINVD.  */
constexpr wide_int_bitmask PTA_ZNVER2 = PTA_ZNVER1 | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD;
/* znver3 = znver2 + VAES, VPCLMULQDQ, PKU.  */
constexpr wide_int_bitmask PTA_ZNVER3 = PTA_ZNVER2 | PTA_VAES | PTA_VPCLMULQDQ
| PTA_PKU;
/* znver4 = znver3 + the AVX512 subsets listed below and GFNI.  */
constexpr wide_int_bitmask PTA_ZNVER4 = PTA_ZNVER3 | PTA_AVX512F | PTA_AVX512DQ
| PTA_AVX512IFMA | PTA_AVX512CD | PTA_AVX512BW | PTA_AVX512VL
| PTA_AVX512BF16 | PTA_AVX512VBMI | PTA_AVX512VBMI2 | PTA_GFNI
| PTA_AVX512VNNI | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ;
#ifndef GENERATOR_FILE

View File

@ -474,7 +474,7 @@
;; Processor type.
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
atom,slm,glm,haswell,generic,lujiazui,amdfam10,bdver1,
bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3"
bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3,znver4"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
@ -1309,7 +1309,7 @@
(include "bdver1.md")
(include "bdver3.md")
(include "btver2.md")
(include "znver1.md")
(include "znver.md")
(include "geode.md")
(include "atom.md")
(include "slm.md")

View File

@ -1820,6 +1820,139 @@ struct processor_costs znver3_cost = {
"16", /* Func alignment. */
};
/* Cost table for znver4.  This table currently replicates the znver3_cost
   table.  NOTE(review): revisit the individual values once znver4-specific
   measurements are available.  */
struct processor_costs znver4_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle. Because reg-reg move cost is 2 and following tables correspond
to doubles of latencies, we do not model this correctly. It does not
seem to make practical difference to bump prices up even more. */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
6, 6, /* SSE->integer and integer->SSE
moves. */
8, 8, /* mask->integer and integer->mask moves */
{6, 6, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost of storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
{COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (10), /* HI. */
COSTS_N_INSNS (12), /* SI. */
COSTS_N_INSNS (17), /* DI. */
COSTS_N_INSNS (17)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
6, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 8, 16}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
throughput 9. Approx 7 uops do not depend on vector size and every load
is 4 uops. */
14, 8, /* Gather load static, per_elt. */
14, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. NOTE(review): value
carried over from znver3_cost; confirm
it for znver4. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13. */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to width of 6.
SPEC2k6 benchmarks suggest
that 4 works better than 6 probably due to register pressure.
Integer vector operations are taken by FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
/* The next two entries reuse the znver2 tables rather than
znver4-specific ones. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* skylake_cost should produce code tuned for Skylake family of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall,

View File

@ -68,6 +68,7 @@ ix86_issue_rate (void)
case PROCESSOR_ZNVER1:
case PROCESSOR_ZNVER2:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
@ -415,6 +416,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
case PROCESSOR_ZNVER1:
case PROCESSOR_ZNVER2:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
/* Stack engine allows push and pop instructions to execute in parallel. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))

File diff suppressed because it is too large Load Diff

View File

@ -21935,6 +21935,9 @@ AMD Family 19h CPU.
@item znver3
AMD Family 19h Zen version 3.
@item znver4
AMD Family 19h Zen version 4.
@item x86-64
Baseline x86-64 microarchitecture level (as defined in x86-64 psABI).

View File

@ -32172,6 +32172,15 @@ MWAITX, SHA, CLZERO, AES, PCLMUL, CX16, MOVBE, MMX, SSE, SSE2, SSE3, SSE4A,
SSSE3, SSE4.1, SSE4.2, ABM, XSAVEC, XSAVES, CLFLUSHOPT, POPCNT, RDPID,
WBNOINVD, PKU, VPCLMULQDQ, VAES, and 64-bit instruction set extensions.)
@item znver4
AMD Family 19h core based CPUs with x86-64 instruction set support. (This
supersets BMI, BMI2, CLWB, F16C, FMA, FSGSBASE, AVX, AVX2, ADCX, RDSEED,
MWAITX, SHA, CLZERO, AES, PCLMUL, CX16, MOVBE, MMX, SSE, SSE2, SSE3, SSE4A,
SSSE3, SSE4.1, SSE4.2, ABM, XSAVEC, XSAVES, CLFLUSHOPT, POPCNT, RDPID,
WBNOINVD, PKU, VPCLMULQDQ, VAES, AVX512F, AVX512DQ, AVX512IFMA, AVX512CD,
AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI,
AVX512BITALG, AVX512VPOPCNTDQ, GFNI and 64-bit instruction set extensions.)
@item btver1
CPUs based on AMD Family 14h cores with x86-64 instruction set support. (This
supersets MMX, SSE, SSE2, SSE3, SSSE3, SSE4A, CX16, ABM and 64-bit

View File

@ -49,6 +49,9 @@ int __attribute__ ((target("arch=znver3"))) foo () {
return 9;
}
/* Version of foo built with target arch=znver4; main expects the value 10
   when __builtin_cpu_is ("znver4") holds.  */
int __attribute__ ((target("arch=znver4"))) foo () {
return 10;
}
int main ()
{
@ -72,6 +75,8 @@ int main ()
assert (val == 8);
else if (__builtin_cpu_is ("znver3"))
assert (val == 9);
else if (__builtin_cpu_is ("znver4"))
assert (val == 10);
else
assert (val == 0);

View File

@ -204,6 +204,7 @@ extern void test_arch_bdver3 (void) __attribute__((__target__("arch=bdver3")));
extern void test_arch_znver1 (void) __attribute__((__target__("arch=znver1")));
extern void test_arch_znver2 (void) __attribute__((__target__("arch=znver2")));
extern void test_arch_znver3 (void) __attribute__((__target__("arch=znver3")));
extern void test_arch_znver4 (void) __attribute__((__target__("arch=znver4")));
extern void test_tune_nocona (void) __attribute__((__target__("tune=nocona")));
extern void test_tune_core2 (void) __attribute__((__target__("tune=core2")));
@ -227,6 +228,7 @@ extern void test_tune_generic (void) __attribute__((__target__("tune=generic"))
extern void test_tune_znver1 (void) __attribute__((__target__("tune=znver1")));
extern void test_tune_znver2 (void) __attribute__((__target__("tune=znver2")));
extern void test_tune_znver3 (void) __attribute__((__target__("tune=znver3")));
extern void test_tune_znver4 (void) __attribute__((__target__("tune=znver4")));
extern void test_fpmath_sse (void) __attribute__((__target__("sse2,fpmath=sse")));
extern void test_fpmath_387 (void) __attribute__((__target__("sse2,fpmath=387")));