Document CPU LATEVEX, add CPU EVEX and CPU VEX flags

Document CPU LATEVEX and the associated prefixes; add CPU EVEX and CPU VEX flags to further control encodings. Fix the error message for invalid encodings due to flags. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2024-11-21 03:14:19 +08:00 · 2022-12-07 10:04:40 -08:00 · 2022-12-07 10:04:40 -08:00 · 55dc058356
commit 55dc058356
parent 494d9531dd
5 changed files with 145 additions and 39 deletions
--- a/asm/assemble.c
+++ b/asm/assemble.c
@ -934,8 +934,12 @@ int64_t assemble(int32_t segment, int64_t start, int bits, insn *instruction)
                nasm_nonfatal("instruction not supported in %d-bit mode", bits);
                break;
            case MERR_ENCMISMATCH:
-                nasm_nonfatal("instruction not encodable with %s prefix",
-                              prefix_name(instruction->prefixes[PPS_REX]));
+                if (!instruction->prefixes[PPS_REX]) {
+                    nasm_nonfatal("instruction not encodable without explicit prefix");
+                } else {
+                    nasm_nonfatal("instruction not encodable with %s prefix",
+                                  prefix_name(instruction->prefixes[PPS_REX]));
+                }
                break;
            case MERR_BADBND:
            case MERR_BADREPNE:
@ -2552,9 +2556,16 @@ static enum match_result matches(const struct itemplate *itemp,
            return MERR_ENCMISMATCH;
        break;
    default:
-        if (itemp_has(itemp, IF_LATEVEX)) {
-            if (!iflag_test(&cpu, IF_LATEVEX))
+        if (itemp_has(itemp, IF_EVEX)) {
+            if (!iflag_test(&cpu, IF_EVEX))
                return MERR_ENCMISMATCH;
+        } else if (itemp_has(itemp, IF_VEX)) {
+            if (!iflag_test(&cpu, IF_VEX)) {
+                return MERR_ENCMISMATCH;
+            } else if (itemp_has(itemp, IF_LATEVEX)) {
+                if (!iflag_test(&cpu, IF_LATEVEX) && iflag_test(&cpu, IF_EVEX))
+                    return MERR_ENCMISMATCH;
+            }
        }
        break;
    }
--- a/asm/directiv.c
+++ b/asm/directiv.c
@ -111,7 +111,9 @@ void set_cpu(const char *value)
        { "any", IF_ANY },
        { "all", IF_ANY },
        { "latevex", IF_LATEVEX },
-        { NULL, IF_DEFAULT }    /* End of list */
+        { "evex", IF_EVEX },
+        { "vex", IF_VEX },
+        { NULL, 0 }
    };

    if (!value) {
--- a/doc/changes.src
+++ b/doc/changes.src
@ -68,6 +68,20 @@ reservations (e.g. \c{dw ?}.)
 \b Allow forcing an instruction in 64-bit mode to have a (possibly
 redundant) REX prefix, using the syntax \i\c{\{rex\}} as a prefix.

+\b Add a \c{\{vex\}} prefix to enforce VEX (AVX) encoding of an
+instruction, using either the 2- or 3-byte VEX prefix.
+
+\b The \c{CPU} directive has been augmented to allow control of
+generation of VEX (AVX) versus EVEX (AVX-512) instruction formats, see
+\k{CPU}.
+
+\b Some recent instructions that were previously only available
+using EVEX encodings are now also encodable using VEX (AVX)
+encodings. For backwards compatibility these encodings are not enabled
+by default, but can be generated either via an explicit \c{\{vex\}}
+prefix or by specifying either \c{CPU LATEVEX} or \c{CPU NOEVEX}; see
+\k{CPU}.
+
 \b Document the already existing \c{%unimacro} directive. See \k{unmacro}.

 \b Fix a code range generation bug in the DWARF debug format
@ -767,9 +781,10 @@ options to indicate whether all relevant branches should be getting
 \c{BND} prefixes.  This is expected to be the normal for use in MPX
 code.

-\b Add \c{{evex}}, \c{{vex3}} and \c{{vex2}} instruction prefixes to
-have NASM encode the corresponding instruction, if possible, with an EVEX,
-3-byte VEX, or 2-byte VEX prefix, respectively.
+\b Add \c{\{evex\}}, \c{\{vex3\}} and \c{\{vex2\}} instruction
+prefixes to have NASM encode the corresponding instruction, if
+possible, with an EVEX, 3-byte VEX, or 2-byte VEX prefix,
+respectively.

 \b Support for section names longer than 8 bytes in Win32/Win64 COFF.

--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@ -5594,47 +5594,87 @@ are excluded from the symbol mangling and also not marked as global.
 \H{CPU} \i\c{CPU}: Defining CPU Dependencies

 The \i\c{CPU} directive restricts assembly to those instructions which
-are available on the specified CPU.
+are available on the specified CPU. At the moment, it is primarily
+used to disallow unavailable \e{encodings} of instructions, such as
+5-byte jumps on the 8086.

-Options are:
+(If someone would volunteer to work through the database and add
+proper annotations to each instruction, this could be greatly
+improved. Please contact the developers to volunteer, see \k{contact}.)

-\b\c{CPU 8086}          Assemble only 8086 instruction set
+Current CPU keywords are:

-\b\c{CPU 186}           Assemble instructions up to the 80186 instruction set
+\b\c{CPU 8086}        - Assemble only 8086 instruction set

-\b\c{CPU 286}           Assemble instructions up to the 286 instruction set
+\b\c{CPU 186}         - Assemble instructions up to the 80186 instruction set

-\b\c{CPU 386}           Assemble instructions up to the 386 instruction set
+\b\c{CPU 286}         - Assemble instructions up to the 286 instruction set

-\b\c{CPU 486}           486 instruction set
+\b\c{CPU 386}         - Assemble instructions up to the 386 instruction set

-\b\c{CPU 586}           Pentium instruction set
+\b\c{CPU 486}         - 486 instruction set

-\b\c{CPU PENTIUM}       Same as 586
+\b\c{CPU 586}         - Pentium instruction set

-\b\c{CPU 686}           P6 instruction set
+\b\c{CPU PENTIUM}     - Same as 586

-\b\c{CPU PPRO}          Same as 686
+\b\c{CPU 686}         - P6 instruction set

-\b\c{CPU P2}            Same as 686
+\b\c{CPU PPRO}        - Same as 686

-\b\c{CPU P3}            Pentium III (Katmai) instruction sets
+\b\c{CPU P2}          - Same as 686

-\b\c{CPU KATMAI}        Same as P3
+\b\c{CPU P3}          - Pentium III (Katmai) instruction sets

-\b\c{CPU P4}            Pentium 4 (Willamette) instruction set
+\b\c{CPU KATMAI}      - Same as P3

-\b\c{CPU WILLAMETTE}    Same as P4
+\b\c{CPU P4}          - Pentium 4 (Willamette) instruction set

-\b\c{CPU PRESCOTT}      Prescott instruction set
+\b\c{CPU WILLAMETTE}  - Same as P4

-\b\c{CPU X64}           x86-64 (x64/AMD64/Intel 64) instruction set
+\b\c{CPU PRESCOTT}    - Prescott instruction set

-\b\c{CPU IA64}          IA64 CPU (in x86 mode) instruction set
+\b\c{CPU X64}         - x86-64 (x64/AMD64/Intel 64) instruction set

-All options are case insensitive.  All instructions will be selected
-only if they apply to the selected CPU or lower.  By default, all
-instructions are available.
+\b\c{CPU IA64}        - IA64 CPU (in x86 mode) instruction set
+
+\b\c{CPU DEFAULT}     - All available instructions
+
+\b\c{CPU ALL}         - All available instructions \e{and flags}
+
+All options are case insensitive.
+
+In addition, optional flags can be specified to modify the instruction
+selections. These can be combined with a CPU declaration or specified
+alone. They can be prefixed by \c{+} (add flag, default), \c{-}
+(remove flag) or \c{*} (set flag to default); these prefixes are
+"sticky", so:
+
+\c      cpu -foo,bar
+
+means remove both the \c{foo} and \c{bar} options.
+
+If prefixed with \c{no}, it inverts the meaning of the flag, but this
+is not sticky, so:
+
+\c      cpu nofoo,bar
+
+means remove the \c{foo} flag but add the \c{bar} flag.
+
+Currently available flags are:
+
+\b\c{EVEX} - Enable generation of EVEX (AVX-512) encoded instructions
+without an explicit \c{\{evex\}} prefix. Default on.
+
+\b\c{VEX} - Enable generation of VEX (AVX) or XOP encoded
+instructions without an explicit \c{\{vex\}} prefix. Default on.
+
+\b\c{LATEVEX} - Enable generation of VEX (AVX) encoding of
+instructions where the VEX instruction forms were introduced
+\e{after} the corresponding EVEX (AVX-512) instruction forms without
+requiring an explicit \c{\{vex\}} prefix. This is implicit if the
+\c{EVEX} flag is disabled and the \c{VEX} flag is enabled. Default
+off.


 \H{FLOAT} \i\c{FLOAT}: Handling of \I{floating-point, constants}floating-point constants
@ -5643,19 +5683,19 @@ By default, floating-point constants are rounded to nearest, and IEEE
 denormals are supported.  The following options can be set to alter
 this behaviour:

-\b\c{FLOAT DAZ}         Flush denormals to zero
+\b\c{FLOAT DAZ}       - Flush denormals to zero

-\b\c{FLOAT NODAZ}       Do not flush denormals to zero (default)
+\b\c{FLOAT NODAZ}     - Do not flush denormals to zero (default)

-\b\c{FLOAT NEAR}        Round to nearest (default)
+\b\c{FLOAT NEAR}      - Round to nearest (default)

-\b\c{FLOAT UP}          Round up (toward +Infinity)
+\b\c{FLOAT UP}        - Round up (toward +Infinity)

-\b\c{FLOAT DOWN}        Round down (toward -Infinity)
+\b\c{FLOAT DOWN}      - Round down (toward -Infinity)

-\b\c{FLOAT ZERO}        Round toward zero
+\b\c{FLOAT ZERO}      - Round toward zero

-\b\c{FLOAT DEFAULT}     Restore default settings
+\b\c{FLOAT DEFAULT}   - Restore default settings

 The standard macros \i\c{__?FLOAT_DAZ?__}, \i\c{__?FLOAT_ROUND?__}, and
 \i\c{__?FLOAT?__} contain the current state, as long as the programmer
--- a/test/latevex.asm
+++ b/test/latevex.asm
@ -1,7 +1,7 @@
 	bits 64

 %define YMMWORD yword
-	
+
 	vpmadd52luq	ymm3,ymm1,YMMWORD[rsi]
 	vpmadd52luq	ymm16,ymm1,YMMWORD[32+rsi]
 	vpmadd52luq	ymm17,ymm1,YMMWORD[64+rsi]
@ -30,4 +30,42 @@
 	vpmadd52luq	ymm17,ymm2,YMMWORD[64+rcx]
 	vpmadd52luq	ymm18,ymm2,YMMWORD[96+rcx]
 	vpmadd52luq	ymm19,ymm2,YMMWORD[128+rcx]
-	
+
+	cpu default
+
+	vpmadd52luq	ymm3,ymm1,YMMWORD[rsi]
+	vpmadd52luq	ymm3,ymm2,YMMWORD[rcx]
+
+	cpu noevex
+
+	vpmadd52luq	ymm3,ymm1,YMMWORD[rsi]
+	vpmadd52luq	ymm3,ymm2,YMMWORD[rcx]
+
+%ifdef ERROR
+	vpmadd52luq	ymm19,ymm2,YMMWORD[128+rcx]
+%endif
+
+	cpu evex,novex,latevex
+
+	vpmadd52luq	ymm3,ymm1,YMMWORD[rsi]
+	vpmadd52luq	ymm3,ymm2,YMMWORD[rcx]
+
+	cpu default
+
+	vaddps		ymm3,ymm1,YMMWORD[rsi]
+	vaddps		ymm3,ymm2,YMMWORD[rcx]
+
+	cpu novex
+
+	vaddps		ymm3,ymm1,YMMWORD[rsi]
+	vaddps		ymm3,ymm2,YMMWORD[rcx]
+
+%ifdef ERROR
+	cpu noevex
+
+	vaddps		ymm3,ymm1,YMMWORD[rsi]
+	vaddps		ymm3,ymm2,YMMWORD[rcx]
+%endif
+
+ {vex}	vaddps		ymm3,ymm1,YMMWORD[rsi]
+ {vex}	vaddps		ymm3,ymm2,YMMWORD[rcx]