Add support for new instructions from ISE June 2020

Add support for new instructions as defined in the Instruction Set Extensions manual as of June 2020. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2025-04-12 18:40:23 +08:00 · 2020-07-16 21:48:28 -07:00 · 2020-07-16 21:48:28 -07:00 · b31a4c9906
commit b31a4c9906
parent 36814f1fc8
9 changed files with 172 additions and 36 deletions
--- a/asm/assemble.c
+++ b/asm/assemble.c
@ -63,17 +63,18 @@
 *                                          assembly mode or the operand-size override on the operand
 * \70..\73         rel32                   a long relative operand, from operand 0..3
 * \74..\77         seg                     a word constant, from the _segment_ part of operand 0..3
- * \1ab                                     a ModRM, calculated on EA in operand a, with the spare
+ * \1ab             /r                      a ModRM, calculated on EA in operand a, with the reg
 *                                          field the register value of operand b.
- * \172\ab                                  the register number from operand a in bits 7..4, with
+ * \171\mab         /mrb (e.g. /3r0)        a ModRM, with the reg field taken from operand a, and the m
+ *                                          and b fields set to the specified values.
+ * \172\ab          /is4                    the register number from operand a in bits 7..4, with
 *                                          the 4-bit immediate from operand b in bits 3..0.
 * \173\xab                                 the register number from operand a in bits 7..4, with
 *                                          the value b in bits 3..0.
 * \174..\177                               the register number from operand 0..3 in bits 7..4, and
 *                                          an arbitrary value in bits 3..0 (assembled as zero.)
- * \2ab                                     a ModRM, calculated on EA in operand a, with the spare
+ * \2ab             /b                      a ModRM, calculated on EA in operand a, with the reg
 *                                          field equal to digit b.
- *
 * \240..\243                               this instruction uses EVEX rather than REX or VEX/XOP, with the
 *                                          V field taken from operand 0..3.
 * \250                                     this instruction uses EVEX rather than REX or VEX/XOP, with the
@ -103,12 +104,11 @@
 *                tup is tuple type for Disp8*N from %tuple_codes in insns.pl
 *                    (compressed displacement encoding)
 *
- * \254..\257       id,s                        a signed 32-bit operand to be extended to 64 bits.
- * \260..\263                                   this instruction uses VEX/XOP rather than REX, with the
- *                                              V field taken from operand 0..3.
- * \270                                         this instruction uses VEX/XOP rather than REX, with the
- *                                              V field set to 1111b.
- *
+ * \254..\257       id,s                    a signed 32-bit operand to be extended to 64 bits.
+ * \260..\263                               this instruction uses VEX/XOP rather than REX, with the
+ *                                          V field taken from operand 0..3.
+ * \270                                     this instruction uses VEX/XOP rather than REX, with the
+ *                                          V field set to 1111b.
 * VEX/XOP prefixes are followed by the sequence:
 * \tmm\wlp        where mm is the M field; and wlp is:
 *                 00 wwl lpp
@ -1317,6 +1317,14 @@ static int64_t calcsize(int32_t segment, int64_t offset, int bits,
            length += 2;
            break;

+        case 0171:
+            c = *codes++;
+            op2 = (op2 & ~3) | ((c >> 3) & 3);
+            opx = &ins->oprs[op2];
+            ins->rex |= op_rexflags(opx, REX_R|REX_H|REX_P|REX_W);
+            length++;
+            break;
+
        case 0172:
        case 0173:
            codes++;
@ -1951,6 +1959,15 @@ static void gencode(struct out_data *data, insn *ins)
            out_segment(data, opx);
            break;

+        case 0171:
+            c = *codes++;
+            op2 = (op2 & ~3) | ((c >> 3) & 3);
+            opx = &ins->oprs[op2];
+            r = nasm_regvals[opx->basereg];
+            c = (c & ~070) | ((r & 7) << 3);
+            out_rawbyte(data, c);
+            break;
+
        case 0172:
        {
            int mask = ins->prefixes[PPS_VEX] == P_EVEX ? 7 : 15;
@ -2807,7 +2824,7 @@ static enum ea_type process_ea(operand *input, ea *output, int bits,
                 input->disp_size != (addrbits != 16 ? 32 : 16)))
                nasm_warn(WARN_OTHER, "displacement size ignored on absolute address");

-            if (bits == 64 && (~input->type & IP_REL)) {
+            if ((eaflags & EAF_MIB) || (bits == 64 && (~input->type & IP_REL))) {
                output->sib_present = true;
                output->sib         = GEN_SIB(0, 4, 5);
                output->bytes       = 4;
@ -3026,7 +3043,7 @@ static enum ea_type process_ea(operand *input, ea *output, int bits,
                output->rex |= rexflags(it, ix, REX_X);
                output->rex |= rexflags(bt, bx, REX_B);

-                if (it == -1 && (bt & 7) != REG_NUM_ESP) {
+                if (it == -1 && (bt & 7) != REG_NUM_ESP && !(eaflags & EAF_MIB)) {
                    /* no SIB needed */
                    int mod, rm;

--- a/disasm/disasm.c
+++ b/disasm/disasm.c
@ -203,6 +203,8 @@ static enum reg_enum whichreg(opflags_t regflags, int regval, int rex)
        return GET_REGISTER(nasm_rd_opmaskreg, regval);
    if (!(BNDREG & ~regflags))
        return GET_REGISTER(nasm_rd_bndreg, regval);
+    if (!(TMMREG & ~regflags))
+        return GET_REGISTER(nasm_rd_tmmreg, regval);

 #undef GET_REGISTER
    return 0;
@ -679,6 +681,22 @@ static int matches(const struct itemplate *t, uint8_t *data,
            break;
        }

+        case 0171:
+        {
+            uint8_t t = *r++;
+            uint8_t d = *data++;
+            if ((d ^ t) & ~070) {
+                return 0;
+            } else {
+                op2 = (op2 & ~3) | ((t >> 3) & 3);
+                opy = &ins->oprs[op2];
+                opy->basereg = ((d >> 3) & 7) +
+                    (ins->rex & REX_R ? 8 : 0);
+                opy->segment |= SEG_RMREG;
+            }
+            break;
+        }
+
        case 0172:
            {
                uint8_t ximm = *data++;
--- a/doc/changes.src
+++ b/doc/changes.src
@ -9,6 +9,9 @@ since 2007.

 \S{cl-2.15.03} Version 2.15.03

+\b Add instructions from the Intel Instruction Set Extensions and
+Future Features Programming Reference, June 2020.
+
 \b Properly display warnings in preprocess-only mode.

 \b Fix copy-and-paste of examples from the PDF documentation.
--- a/include/opflags.h
+++ b/include/opflags.h
@ -81,19 +81,19 @@
 /*
 * Register classes.
 *
- * Bits: 7 - 16
+ * Bits: 7 - 17
 */
 #define REG_CLASS_SHIFT         (7)
-#define REG_CLASS_BITS          (10)
+#define REG_CLASS_BITS          (11)
 #define REG_CLASS_MASK          OP_GENMASK(REG_CLASS_BITS, REG_CLASS_SHIFT)
 #define GEN_REG_CLASS(bit)      OP_GENBIT(bit, REG_CLASS_SHIFT)

 /*
 * Subclasses. Depends on type of operand.
 *
- * Bits: 17 - 24
+ * Bits: 18 - 25
 */
-#define SUBCLASS_SHIFT          (17)
+#define SUBCLASS_SHIFT          (18)
 #define SUBCLASS_BITS           (8)
 #define SUBCLASS_MASK           OP_GENMASK(SUBCLASS_BITS, SUBCLASS_SHIFT)
 #define GEN_SUBCLASS(bit)       OP_GENBIT(bit, SUBCLASS_SHIFT)
@ -101,9 +101,9 @@
 /*
 * Special flags. Context dependant.
 *
- * Bits: 25 - 31
+ * Bits: 26 - 32
 */
-#define SPECIAL_SHIFT           (25)
+#define SPECIAL_SHIFT           (26)
 #define SPECIAL_BITS            (7)
 #define SPECIAL_MASK            OP_GENMASK(SPECIAL_BITS, SPECIAL_SHIFT)
 #define GEN_SPECIAL(bit)        OP_GENBIT(bit, SPECIAL_SHIFT)
@ -111,9 +111,9 @@
 /*
 * Sizes of the operands and attributes.
 *
- * Bits: 32 - 42
+ * Bits: 33 - 43
 */
-#define SIZE_SHIFT              (32)
+#define SIZE_SHIFT              (33)
 #define SIZE_BITS               (11)
 #define SIZE_MASK               OP_GENMASK(SIZE_BITS, SIZE_SHIFT)
 #define GEN_SIZE(bit)           OP_GENBIT(bit, SIZE_SHIFT)
@ -121,9 +121,9 @@
 /*
 * Register set count
 *
- * Bits: 47 - 43
+ * Bits: 44 - 48
 */
-#define REGSET_SHIFT            (43)
+#define REGSET_SHIFT            (44)
 #define REGSET_BITS             (5)
 #define REGSET_MASK             OP_GENMASK(REGSET_BITS, REGSET_SHIFT)
 #define GEN_REGSET(bit)         OP_GENBIT(bit, REGSET_SHIFT)
@ -138,11 +138,11 @@
 *
 * ............................................................1111 optypes
 * .........................................................111.... modifiers
- * ...............................................1111111111....... register classes
- * .......................................11111111................. subclasses
- * ................................1111111......................... specials
- * .....................11111111111................................ sizes
- * ................11111........................................... regset count
+ * ..............................................11111111111....... register classes
+ * ......................................11111111.................. subclasses
+ * ...............................1111111.......................... specials
+ * ....................11111111111................................. sizes
+ * ...............11111............................................ regset count
 */

 #define REGISTER                GEN_OPTYPE(0)                   /* register number in 'basereg' */
@ -176,6 +176,7 @@
 #define REG_CLASS_RM_ZMM        GEN_REG_CLASS(7)
 #define REG_CLASS_OPMASK        GEN_REG_CLASS(8)
 #define REG_CLASS_BND           GEN_REG_CLASS(9)
+#define REG_CLASS_RM_TMM	GEN_REG_CLASS(10)

 static inline bool is_class(opflags_t class, opflags_t op)
 {
@ -217,6 +218,7 @@ static inline bool is_reg_class(opflags_t class, opflags_t reg)
 #define KREG                    OPMASKREG
 #define RM_BND                  (                  REG_CLASS_BND              | REGMEM)                 /* Bounds operand */
 #define BNDREG                  (                  REG_CLASS_BND              | REGMEM | REGISTER)      /* Bounds register */
+#define TMMREG                  (                  REG_CLASS_RM_TMM           | REGMEM | REGISTER)      /* TMM (AMX) register */
 #define REG_CDT                 (                  REG_CLASS_CDT    | BITS32           | REGISTER)      /* CRn, DRn and TRn */
 #define REG_CREG                (GEN_SUBCLASS(1) | REG_CLASS_CDT    | BITS32           | REGISTER)      /* CRn */
 #define REG_DREG                (GEN_SUBCLASS(2) | REG_CLASS_CDT    | BITS32           | REGISTER)      /* DRn */
--- a/test/amx.asm
+++ b/test/amx.asm
@ -0,0 +1,36 @@
+	bits 64
+
+%macro amx 1
+  %define treg tmm %+ %1
+
+	ldtilecfg [rsi]
+	sttilecfg [rdi]
+
+	tilezero treg
+
+	tileloadd treg, [rax]
+	tileloadd treg, [rax,rdx]
+	tileloadd treg, [rax,rdx*2]
+
+	tileloaddt1 treg, [rax]
+	tileloaddt1 treg, [rax,rdx]
+	tileloaddt1 treg, [rax,rdx*2]
+
+	tdpbf16ps treg, treg, treg
+	tdpbssd treg, treg, treg
+	tdpbusd treg, treg, treg
+	tdpbsud treg, treg, treg
+	tdpbuud treg, treg, treg
+
+	tilestored [rax], treg
+	tilestored [rax,rdx], treg
+	tilestored [rax,rdx*2], treg
+
+	tilerelease
+%endmacro
+
+%assign n 0
+  %rep 8
+	amx n
+    %assign n n+1
+  %endrep
--- a/x86/iflags.ph
+++ b/x86/iflags.ph
@ -84,6 +84,16 @@ if_("AVX5124FMAPS",      "AVX-512 4-iteration multiply-add");
 if_("AVX5124VNNIW",      "AVX-512 4-iteration dot product");
 if_("SGX",               "Intel Software Guard Extensions (SGX)");
 if_("CET",               "Intel Control-Flow Enforcement Technology (CET)");
+if_("ENQCMD",            "Enqueue command instructions");
+if_("PCONFIG",           "Platform configuration instruction");
+if_("WBNOINVD",          "Writeback and do not invalidate instruction");
+if_("TSXLDTRK",          "TSX suspend load address tracking");
+if_("SERIALIZE",         "SERIALIZE instruction");
+if_("AVX512BF16",        "AVX-512 bfloat16");
+if_("AVX512VP2INTERSECT", "AVX-512 VP2INTERSECT instructions");
+if_("AMXTILE",           "AMX tile configuration instructions");
+if_("AMXBF16",           "AMX bfloat16 multiplication");
+if_("AMXINT8",           "AMX 8-bit integer multiplication");

 # Put these last [hpa: why?]
 if_("OBSOLETE",          "Instruction removed from architecture");
--- a/x86/insns.dat
+++ b/x86/insns.dat
@ -5999,6 +5999,51 @@ WRUSSQ		mem,reg64			[mr:	o64 66 0f 38 f5 /r]			CET,FUTURE,X64
 WRSSD		mem,reg32			[mr:	o32 0f 38 f6 /r]			CET,FUTURE
 WRSSQ		mem,reg64			[mr:	o64 0f 38 f6 /r]			CET,FUTURE,X64

+;# Instructions from ISE doc 319433-040, June 2020
+ENQCMD		reg16,mem512			[rm:	a16 f2 0f 38 f8 /r]			ENQCMD,FUTURE
+ENQCMD		reg32,mem512			[rm:	a16 f2 0f 38 f8 /r]			ENQCMD,FUTURE,ND
+ENQCMD		reg32,mem512			[rm:	a32 f2 0f 38 f8 /r]			ENQCMD,FUTURE
+ENQCMD		reg64,mem512			[rm:	a64 f2 0f 38 f8 /r]			ENQCMD,FUTURE,X64
+ENQCMDS		reg16,mem512			[rm:	a16 f3 0f 38 f8 /r]			ENQCMD,FUTURE,PRIV
+ENQCMDS		reg32,mem512			[rm:	a16 f3 0f 38 f8 /r]			ENQCMD,FUTURE,PRIV,ND
+ENQCMDS		reg32,mem512			[rm:	a32 f3 0f 38 f8 /r]			ENQCMD,FUTURE,PRIV
+ENQCMDS		reg64,mem512			[rm:	a64 f3 0f 38 f8 /r]			ENQCMD,FUTURE,PRIV,X64
+PCONFIG		void				[	np 0f 01 c5]				PCONFIG,FUTURE,PRIV
+SERIALIZE	void				[	np 0f 01 e8]				SERIALIZE,FUTURE
+WBNOINVD	void				[	f3 0f 09]				WBNOINVD,FUTURE,PRIV
+XRESLDTRK	void				[	f2 0f 01 e9]				TSXLDTRK,FUTURE
+XSUSLDTRK	void				[	f2 0f 01 e8]				TSXLDTRK,FUTURE
+
+;# AVX512 Bfloat16 instructions
+VCVTNE2PS2BF16	xmmreg|mask|z,xmmreg*,xmmrm128|b32	[rvm:	evex.128.f2.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VCVTNE2PS2BF16	ymmreg|mask|z,ymmreg*,ymmrm256|b32	[rvm:	evex.256.f2.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VCVTNE2PS2BF16	zmmreg|mask|z,zmmreg*,zmmrm512|b32	[rvm:	evex.512.f2.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VCVTNEPS2BF16	xmmreg|mask|z,xmmrm128|b32		[rm:	evex.128.f3.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VCVTNEPS2BF16	xmmreg|mask|z,ymmrm256|b32		[rm:	evex.256.f3.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VCVTNEPS2BF16	ymmreg|mask|z,zmmrm512|b32		[rm:	evex.512.f3.0f38.w0 72 /r]	AVX512BF16,FUTURE
+VDPBF16PS	xmmreg|mask|z,xmmreg*,xmmrm128|b32	[rvm:	evex.128.f3.0f38.w0 52 /r]	AVX512BF16,FUTURE
+VDPBF16PS	ymmreg|mask|z,ymmreg*,ymmrm256|b32	[rvm:	evex.256.f3.0f38.w0 52 /r]	AVX512BF16,FUTURE
+VDPBF16PS	zmmreg|mask|z,zmmreg*,zmmrm512|b32	[rvm:	evex.512.f3.0f38.w0 52 /r]	AVX512BF16,FUTURE
+
+;# AVX512 mask intersect instructions
+VP2INTERSECTD	kreg|rs2,xmmreg,xmmrm128|b32		[rvm:	evex.nds.128.f2.0f38.w0 68 /r]	AVX512VP2INTERSECT,FUTURE
+VP2INTERSECTD	kreg|rs2,ymmreg,ymmrm256|b32		[rvm:	evex.nds.256.f2.0f38.w0 68 /r]	AVX512VP2INTERSECT,FUTURE
+VP2INTERSECTD	kreg|rs2,zmmreg,zmmrm512|b32		[rvm:	evex.nds.512.f2.0f38.w0 68 /r]	AVX512VP2INTERSECT,FUTURE
+
+;# Intel Advanced Matrix Extensions (AMX)
+LDTILECFG	mem512				[m:	vex.128.np.0f38.w0 49 /0]		AMXTILE,FUTURE,SZ,X64
+STTILECFG	mem512				[m:	vex.128.66.0f38.w0 49 /0]		AMXTILE,FUTURE,SZ,X64
+TDPBF16PS	tmmreg,tmmreg,tmmreg		[rmv:	vex.128.f3.0f38.w0 5c /r]		AMXBF16,FUTURE,X64
+TDPBSSD		tmmreg,tmmreg,tmmreg		[rmv:	vex.128.f2.0f38.w0 5e /r]		AMXINT8,FUTURE,X64
+TDPBSUD		tmmreg,tmmreg,tmmreg		[rmv:	vex.128.f3.0f38.w0 5e /r]		AMXINT8,FUTURE,X64
+TDPBUSD		tmmreg,tmmreg,tmmreg		[rmv:	vex.128.66.0f38.w0 5e /r]		AMXINT8,FUTURE,X64
+TDPBUUD		tmmreg,tmmreg,tmmreg		[rmv:	vex.128.np.0f38.w0 5e /r]		AMXINT8,FUTURE,X64
+TILELOADD	tmmreg,mem			[rm:	vex.128.f2.0f38.w0 4b /r]		AMXTILE,MIB,FUTURE,SX,X64
+TILELOADDT1	tmmreg,mem			[rm:	vex.128.66.0f38.w0 4b /r]		AMXTILE,MIB,FUTURE,SX,X64
+TILERELEASE	void				[	vex.128.np.0f38.w0 49 c0]		AMXTILE,FUTURE,X64
+TILESTORED	mem,tmmreg			[mr:	vex.128.f3.0f38.w0 4b /r]		AMXTILE,MIB,FUTURE,SX,X64
+TILEZERO	tmmreg				[r:	vex.128.f2.0f38.w0 49 /3r0]		AMXTILE,FUTURE,X64
+
 ;# Systematic names for the hinting nop instructions
 ; These should be last in the file
 HINT_NOP0	rm16				[m:	o16 0f 18 /0]				P6,UNDOC
--- a/x86/insns.pl
+++ b/x86/insns.pl
@ -880,11 +880,19 @@ sub byte_code_compile($$) {
            $prefix_ok = 0;
        } elsif ($op =~ m:^/([0-7])$:) {
            if (!defined($oppos{'m'})) {
-                die "$fname:$line: $op requires m operand\n";
+                die "$fname:$line: $op requires an m operand\n";
            }
            push(@codes, 06) if ($oppos{'m'} & 4);
            push(@codes, 0200 + (($oppos{'m'} & 3) << 3) + $1);
            $prefix_ok = 0;
+	} elsif ($op =~ m:^/([0-3]?)r([0-7])$:) {
+	    if (!defined($oppos{'r'})) {
+                die "$fname:$line: $op requires an r operand\n";
+	    }
+	    push(@codes, 05) if ($oppos{'r'} & 4);
+	    push(@codes, 0171);
+	    push(@codes, (($1+0) << 6) + (($oppos{'r'} & 3) << 3) + $2);
+	    $prefix_ok = 0;
        } elsif ($op =~ /^(vex|xop)(|\..*)$/) {
            my $vexname = $1;
            my $c = $vexmap{$vexname};
@ -907,7 +915,7 @@ sub byte_code_compile($$) {
                        $w = 2;
                    } elsif ($oq eq 'ww') {
                        $w = 3;
-                    } elsif ($oq eq 'p0') {
+                    } elsif ($oq eq 'np' || $oq eq 'p0') {
                        $p = 0;
                    } elsif ($oq eq '66' || $oq eq 'p1') {
                        $p = 1;
@ -935,9 +943,6 @@ sub byte_code_compile($$) {
            if (!defined($m) || !defined($w) || !defined($l) || !defined($p)) {
                die "$fname:$line: missing fields in \U$vexname\E specification\n";
            }
-            if (defined($oppos{'v'}) && !$has_nds) {
-                die "$fname:$line: 'v' operand without ${vexname}.nds or ${vexname}.ndd\n";
-            }
 	    my $minmap = ($c == 1) ? 8 : 0; # 0-31 for VEX, 8-31 for XOP
 	    if ($m < $minmap || $m > 31) {
 		die "$fname:$line: Only maps ${minmap}-31 are valid for \U${vexname}\n";
@ -966,7 +971,7 @@ sub byte_code_compile($$) {
                        $w = 2;
                    } elsif ($oq eq 'ww') {
                        $w = 3;
-                    } elsif ($oq eq 'p0') {
+                    } elsif ($oq eq 'np' || $oq eq 'p0') {
                        $p = 0;
                    } elsif ($oq eq '66' || $oq eq 'p1') {
                        $p = 1;
@ -994,9 +999,6 @@ sub byte_code_compile($$) {
            if (!defined($m) || !defined($w) || !defined($l) || !defined($p)) {
                die "$fname:$line: missing fields in EVEX specification\n";
            }
-            if (defined($oppos{'v'}) && !$has_nds) {
-                die "$fname:$line: 'v' operand without evex.nds or evex.ndd\n";
-            }
 	    if ($m > 15) {
 		die "$fname:$line: Only maps 0-15 are valid for EVEX\n";
 	    }
--- a/x86/regs.dat
+++ b/x86/regs.dat
@ -130,6 +130,9 @@ zmm0	ZMM0		zmmreg		0
 zmm1-15	ZMM_L16		zmmreg		1
 zmm16-31	ZMMREG		zmmreg		16

+# AMX tile registers
+tmm0-7	TMMREG		tmmreg		0
+
 # Opmask registers
 k0	OPMASK0		opmaskreg	0
 k1-7	OPMASKREG	opmaskreg	1   TFLAG_BRC_OPT