From 7eb4a387939955c1c0b41fbc8b1216419082321f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 15:49:30 -0700
Subject: [PATCH 01/29] Initial support for four arguments per instruction

For SSE5, we will need to support four arguments per instruction.
---
 assemble.c | 275 ++++++++++++++++++++++++++++++++---------------------
 disasm.c   |  54 +++++------
 insns.dat  | 162 +++++++++++++++----------------
 insns.h    |  18 ++--
 insns.pl   |  69 ++++++++------
 nasm.h     |   3 +-
 6 files changed, 325 insertions(+), 256 deletions(-)

diff --git a/assemble.c b/assemble.c
index 54522712..7dc2b25b 100644
--- a/assemble.c
+++ b/assemble.c
@@ -12,39 +12,36 @@
  *                 (POP is never used for CS) depending on operand 0
  * \5, \7        - the second byte of POP/PUSH codes for FS, GS, depending
  *                 on operand 0
- * \10, \11, \12 - a literal byte follows in the code stream, to be added
- *                 to the register value of operand 0, 1 or 2
- * \17           - encodes the literal byte 0. (Some compilers don't take
- *                 kindly to a zero byte in the _middle_ of a compile time
- *                 string constant, so I had to put this hack in.)
- * \14, \15, \16 - a signed byte immediate operand, from operand 0, 1 or 2
- * \20, \21, \22 - a byte immediate operand, from operand 0, 1 or 2
- * \24, \25, \26 - an unsigned byte immediate operand, from operand 0, 1 or 2
- * \30, \31, \32 - a word immediate operand, from operand 0, 1 or 2
- * \34, \35, \36 - select between \3[012] and \4[012] depending on 16/32 bit
+ * \10..\13      - a literal byte follows in the code stream, to be added
+ *                 to the register value of operand 0..3
+ * \14..\17      - a signed byte immediate operand, from operand 0..3
+ * \20..\23      - a byte immediate operand, from operand 0..3
+ * \24..\27      - an unsigned byte immediate operand, from operand 0..3
+ * \30..\33      - a word immediate operand, from operand 0..3
+ * \34..\37      - select between \3[0-3] and \4[0-3] depending on 16/32 bit
  *                 assembly mode or the operand-size override on the operand
- * \37           - a word constant, from the _segment_ part of operand 0
- * \40, \41, \42 - a long immediate operand, from operand 0, 1 or 2
- * \44, \45, \46 - select between \3[012], \4[012] and \5[456]
+ * \40..\43      - a long immediate operand, from operand 0..3
+ * \44..\47      - select between \3[0-3], \4[0-3] and \5[4-7]
  *		   depending on assembly mode or the address-size override
  *		   on the operand.
- * \50, \51, \52 - a byte relative operand, from operand 0, 1 or 2
- * \54, \55, \56 - a qword immediate operand, from operand 0, 1 or 2
- * \60, \61, \62 - a word relative operand, from operand 0, 1 or 2
- * \64, \65, \66 - select between \6[012] and \7[012] depending on 16/32 bit
+ * \50..\53      - a byte relative operand, from operand 0..3
+ * \54..\57      - a qword immediate operand, from operand 0..3
+ * \60..\63      - a word relative operand, from operand 0..3
+ * \64..\67      - select between \6[0-3] and \7[0-3] depending on 16/32 bit
  *                 assembly mode or the operand-size override on the operand
- * \70, \71, \72 - a long relative operand, from operand 0, 1 or 2
+ * \70..\73      - a long relative operand, from operand 0..3
+ * \74..\77      - a word constant, from the _segment_ part of operand 0..3
  * \1ab          - a ModRM, calculated on EA in operand a, with the spare
  *                 field the register value of operand b.
- * \130,\131,\132 - an immediate word or signed byte for operand 0, 1, or 2
- * \133,\134,\135 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
+ * \140..\143    - an immediate word or signed byte for operand 0..3
+ * \144..\147    - or 2 (s-field) into next opcode byte if operand 0..3
  *		    is a signed byte rather than a word.
- * \140,\141,\142 - an immediate dword or signed byte for operand 0, 1, or 2
- * \143,\144,\145 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
+ * \150..\153    - an immediate dword or signed byte for operand 0..3
+ * \154..\157    - or 2 (s-field) into next opcode byte if operand 0..3
  *		    is a signed byte rather than a dword.
- * \150,\151,\152 - an immediate qword or signed byte for operand 0, 1, or 2
- * \153,\154,\155 - or 2 (s-field) into next opcode byte if operand 0, 1, or 2
- *		    is a signed byte rather than a qword.
+ * \170          - encodes the literal byte 0. (Some compilers don't take
+ *                 kindly to a zero byte in the _middle_ of a compile time
+ *                 string constant, so I had to put this hack in.)
  * \2ab          - a ModRM, calculated on EA in operand a, with the spare
  *                 field equal to digit b.
  * \30x          - might be an 0x67 byte, depending on the address size of
@@ -730,73 +727,79 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         case 010:
         case 011:
         case 012:
+	case 013:
 	    ins->rex |=
 		op_rexflags(&ins->oprs[c - 010], REX_B|REX_H|REX_P|REX_W);
             codes++, length++;
             break;
-        case 017:
-            length++;
-            break;
         case 014:
         case 015:
         case 016:
+	case 017:
             length++;
             break;
         case 020:
         case 021:
         case 022:
+	case 023:
             length++;
             break;
         case 024:
         case 025:
         case 026:
+	case 027:
             length++;
             break;
         case 030:
         case 031:
         case 032:
+	case 033:
             length += 2;
             break;
         case 034:
         case 035:
         case 036:
+	case 037:
             if (ins->oprs[c - 034].type & (BITS16 | BITS32 | BITS64))
                 length += (ins->oprs[c - 034].type & BITS16) ? 2 : 4;
             else
                 length += (bits == 16) ? 2 : 4;
             break;
-        case 037:
-            length += 2;
-            break;
         case 040:
         case 041:
         case 042:
+	case 043:
             length += 4;
             break;
         case 044:
         case 045:
         case 046:
+	case 047:
             length += ((ins->oprs[c - 044].addr_size ?
                         ins->oprs[c - 044].addr_size : bits) >> 3);
             break;
         case 050:
         case 051:
         case 052:
+	case 053:
             length++;
             break;
         case 054:
         case 055:
         case 056:
+	case 057:
             length += 8; /* MOV reg64/imm */
             break;
         case 060:
         case 061:
         case 062:
+	case 063:
             length += 2;
             break;
         case 064:
         case 065:
         case 066:
+	case 067:
             if (ins->oprs[c - 064].type & (BITS16 | BITS32 | BITS64))
                 length += (ins->oprs[c - 064].type & BITS16) ? 2 : 4;
             else
@@ -805,33 +808,48 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         case 070:
         case 071:
         case 072:
+	case 073:
             length += 4;
             break;
-        case 0130:
-        case 0131:
-        case 0132:
-            length += is_sbyte(ins, c - 0130, 16) ? 1 : 2;
-            break;
-        case 0133:
-        case 0134:
-        case 0135:
-            codes += 2;
-            length++;
+        case 074:
+        case 075:
+        case 076:
+        case 077:
+            length += 2;
             break;
         case 0140:
         case 0141:
         case 0142:
-            length += is_sbyte(ins, c - 0140, 32) ? 1 : 4;
+	case 0143:
+            length += is_sbyte(ins, c - 0140, 16) ? 1 : 2;
             break;
-        case 0143:
         case 0144:
         case 0145:
+        case 0146:
+        case 0147:
             codes += 2;
             length++;
             break;
+        case 0150:
+        case 0151:
+        case 0152:
+        case 0153:
+            length += is_sbyte(ins, c - 0150, 32) ? 1 : 4;
+            break;
+        case 0154:
+        case 0155:
+        case 0156:
+        case 0157:
+            codes += 2;
+            length++;
+            break;
+        case 0170:
+            length++;
+            break;
         case 0300:
         case 0301:
         case 0302:         
+        case 0303:         
             length += chsize(&ins->oprs[c - 0300], bits);
             break;
         case 0310:
@@ -1020,21 +1038,17 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 010:
         case 011:
         case 012:
+	case 013:
 	    EMIT_REX();
             bytes[0] = *codes++ + ((regval(&ins->oprs[c - 010])) & 7);
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset += 1;
             break;
 
-        case 017:
-            bytes[0] = 0;
-            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
-            offset += 1;
-            break;
-
         case 014:
         case 015:
         case 016:
+	case 017:
             if (ins->oprs[c - 014].offset < -128
                 || ins->oprs[c - 014].offset > 127) {
                 errfunc(ERR_WARNING, "signed byte value exceeds bounds");
@@ -1055,6 +1069,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 020:
         case 021:
         case 022:
+	case 023:
             if (ins->oprs[c - 020].offset < -256
                 || ins->oprs[c - 020].offset > 255) {
                 errfunc(ERR_WARNING, "byte value exceeds bounds");
@@ -1074,6 +1089,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 024:
         case 025:
         case 026:
+	case 027:
             if (ins->oprs[c - 024].offset < 0
                 || ins->oprs[c - 024].offset > 255)
                 errfunc(ERR_WARNING, "unsigned byte value exceeds bounds");
@@ -1092,6 +1108,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 030:
         case 031:
         case 032:
+	case 033:
             if (ins->oprs[c - 030].segment == NO_SEG &&
                 ins->oprs[c - 030].wrt == NO_SEG &&
                 (ins->oprs[c - 030].offset < -65536L ||
@@ -1107,6 +1124,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 034:
         case 035:
         case 036:
+	case 037:
             if (ins->oprs[c - 034].type & (BITS16 | BITS32))
                 size = (ins->oprs[c - 034].type & BITS16) ? 2 : 4;
             else
@@ -1119,20 +1137,10 @@ static void gencode(int32_t segment, int32_t offset, int bits,
             offset += size;
             break;
 
-        case 037:
-            if (ins->oprs[0].segment == NO_SEG)
-                errfunc(ERR_NONFATAL, "value referenced by FAR is not"
-                        " relocatable");
-            data = 0L;
-            out(offset, segment, &data, OUT_ADDRESS + 2,
-                outfmt->segbase(1 + ins->oprs[0].segment),
-                ins->oprs[0].wrt);
-            offset += 2;
-            break;
-
         case 040:
         case 041:
         case 042:
+	case 043:
             data = ins->oprs[c - 040].offset;
             out(offset, segment, &data, OUT_ADDRESS + 4,
                 ins->oprs[c - 040].segment, ins->oprs[c - 040].wrt);
@@ -1142,6 +1150,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 044:
         case 045:
         case 046:
+	case 047:
             data = ins->oprs[c - 044].offset;
             size = ((ins->oprs[c - 044].addr_size ?
                      ins->oprs[c - 044].addr_size : bits) >> 3);
@@ -1155,6 +1164,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 050:
         case 051:
         case 052:
+	case 053:
             if (ins->oprs[c - 050].segment != segment)
                 errfunc(ERR_NONFATAL,
                         "short relative jump outside segment");
@@ -1169,6 +1179,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 054:
         case 055:
         case 056:
+	case 057:
             data = (int64_t)ins->oprs[c - 054].offset;
             out(offset, segment, &data, OUT_ADDRESS + 8,
                 ins->oprs[c - 054].segment, ins->oprs[c - 054].wrt);
@@ -1178,6 +1189,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 060:
         case 061:
         case 062:
+	case 063:
             if (ins->oprs[c - 060].segment != segment) {
                 data = ins->oprs[c - 060].offset;
                 out(offset, segment, &data,
@@ -1194,6 +1206,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 064:
         case 065:
         case 066:
+	case 067:
             if (ins->oprs[c - 064].type & (BITS16 | BITS32 | BITS64))
                 size = (ins->oprs[c - 064].type & BITS16) ? 2 : 4;
             else
@@ -1214,6 +1227,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
         case 070:
         case 071:
         case 072:
+	case 073:
             if (ins->oprs[c - 070].segment != segment) {
                 data = ins->oprs[c - 070].offset;
                 out(offset, segment, &data,
@@ -1227,70 +1241,95 @@ static void gencode(int32_t segment, int32_t offset, int bits,
             offset += 4;
             break;
 
-        case 0130:
-        case 0131:
-        case 0132:
-            data = ins->oprs[c - 0130].offset;
-            if (is_sbyte(ins, c - 0130, 16)) {
-                bytes[0] = data;
-                out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG,
-                    NO_SEG);
-                offset++;
-            } else {
-                if (ins->oprs[c - 0130].segment == NO_SEG &&
-                    ins->oprs[c - 0130].wrt == NO_SEG &&
-                    (data < -65536L || data > 65535L)) {
-                    errfunc(ERR_WARNING, "word value exceeds bounds");
-                }
-                out(offset, segment, &data, OUT_ADDRESS + 2,
-                    ins->oprs[c - 0130].segment, ins->oprs[c - 0130].wrt);
-                offset += 2;
-            }
-            break;
-
-        case 0133:
-        case 0134:
-        case 0135:
-	    EMIT_REX();
-            codes++;
-            bytes[0] = *codes++;
-            if (is_sbyte(ins, c - 0133, 16))
-                bytes[0] |= 2;  /* s-bit */
-            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
-            offset++;
+        case 074:
+        case 075:
+        case 076:
+        case 077:
+            if (ins->oprs[c - 074].segment == NO_SEG)
+                errfunc(ERR_NONFATAL, "value referenced by FAR is not"
+                        " relocatable");
+            data = 0L;
+            out(offset, segment, &data, OUT_ADDRESS + 2,
+                outfmt->segbase(1 + ins->oprs[c - 074].segment),
+                ins->oprs[c - 074].wrt);
+            offset += 2;
             break;
 
         case 0140:
         case 0141:
         case 0142:
+	case 0143:
             data = ins->oprs[c - 0140].offset;
-            if (is_sbyte(ins, c - 0140, 32)) {
+            if (is_sbyte(ins, c - 0140, 16)) {
+                bytes[0] = data;
+                out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG,
+                    NO_SEG);
+                offset++;
+            } else {
+                if (ins->oprs[c - 0140].segment == NO_SEG &&
+                    ins->oprs[c - 0140].wrt == NO_SEG &&
+                    (data < -65536L || data > 65535L)) {
+                    errfunc(ERR_WARNING, "word value exceeds bounds");
+                }
+                out(offset, segment, &data, OUT_ADDRESS + 2,
+                    ins->oprs[c - 0140].segment, ins->oprs[c - 0140].wrt);
+                offset += 2;
+            }
+            break;
+
+        case 0144:
+        case 0145:
+        case 0146:
+	case 0147:
+	    EMIT_REX();
+            codes++;
+            bytes[0] = *codes++;
+            if (is_sbyte(ins, c - 0144, 16))
+                bytes[0] |= 2;  /* s-bit */
+            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
+            offset++;
+            break;
+
+        case 0150:
+        case 0151:
+        case 0152:
+	case 0153:
+            data = ins->oprs[c - 0150].offset;
+            if (is_sbyte(ins, c - 0150, 32)) {
                 bytes[0] = data;
                 out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG,
                     NO_SEG);
                 offset++;
             } else {
                 out(offset, segment, &data, OUT_ADDRESS + 4,
-                    ins->oprs[c - 0140].segment, ins->oprs[c - 0140].wrt);
+                    ins->oprs[c - 0150].segment, ins->oprs[c - 0150].wrt);
                 offset += 4;
             }
             break;
 
-        case 0143:
-        case 0144:
-        case 0145:
+        case 0154:
+        case 0155:
+        case 0156:
+	case 0157:
 	    EMIT_REX();
             codes++;
             bytes[0] = *codes++;
-            if (is_sbyte(ins, c - 0143, 32))
+            if (is_sbyte(ins, c - 0154, 32))
                 bytes[0] |= 2;  /* s-bit */
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset++;
             break;
 
+        case 0170:
+            bytes[0] = 0;
+            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
+            offset += 1;
+            break;
+
         case 0300:
         case 0301:
         case 0302:
+        case 0303:
             if (chsize(&ins->oprs[c - 0300], bits)) {
                 *bytes = 0x67;
                 out(offset, segment, bytes,
@@ -1537,7 +1576,7 @@ static int rexflags(int val, int32_t flags, int mask)
 
 static int matches(const struct itemplate *itemp, insn * instruction, int bits)
 {
-    int i, size[3], asize, oprs, ret;
+    int i, size[MAX_OPERANDS], asize, oprs, ret;
 
     ret = 100;
 
@@ -1579,7 +1618,7 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
      * Check operand sizes
      */
     if (itemp->flags & IF_ARMASK) {
-        size[0] = size[1] = size[2] = 0;
+	memset(size, 0, sizeof size);
 
         switch (itemp->flags & IF_ARMASK) {
         case IF_AR0:
@@ -1591,34 +1630,54 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
         case IF_AR2:
             i = 2;
             break;
+#if 0 /* Need to reorganize instruction flags to fit IF_AR3 */
+	case IF_AR3:
+	    i = 3;
+	    break;
+#endif
         default:
             break;              /* Shouldn't happen */
         }
-        if (itemp->flags & IF_SB) {
+	switch (itemp->flags & IF_SMASK) {
+	case IF_SB:
             size[i] = BITS8;
-        } else if (itemp->flags & IF_SW) {
+	    break;
+	case IF_SW:
             size[i] = BITS16;
-        } else if (itemp->flags & IF_SD) {
+	    break;
+	case IF_SD:
             size[i] = BITS32;
-        } else if (itemp->flags & IF_SQ) {
+	    break;
+	case IF_SQ:
             size[i] = BITS64;
+	    break;
+	default:
+	    break;
         }
     } else {
         asize = 0;
-        if (itemp->flags & IF_SB) {
+	switch (itemp->flags & IF_SMASK) {
+	case IF_SB:
             asize = BITS8;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SW) {
+	    break;
+	case IF_SW:
             asize = BITS16;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SD) {
+	    break;
+	case IF_SD:
             asize = BITS32;
             oprs = itemp->operands;
-        } else if (itemp->flags & IF_SQ) {
+	    break;
+	case IF_SQ:
             asize = BITS64;
             oprs = itemp->operands;
+	    break;
+	default:
+	    break;
         }
-        size[0] = size[1] = size[2] = asize;
+	for (i = 0; i < MAX_OPERANDS; i++)
+	    size[i] = asize;
     }
     
     if (itemp->flags & (IF_SM | IF_SM2)) {
diff --git a/disasm.c b/disasm.c
index 0452c295..cfe86938 100644
--- a/disasm.c
+++ b/disasm.c
@@ -341,12 +341,12 @@ static int matches(const struct itemplate *t, uint8_t *data,
     uint8_t lock = prefix->lock;
     int osize = prefix->osize;
     int asize = prefix->asize;
+    int i;
 
-    ins->oprs[0].segment = ins->oprs[1].segment =
-	ins->oprs[2].segment =
-	ins->oprs[0].addr_size = ins->oprs[1].addr_size =
-	ins->oprs[2].addr_size = (segsize == 64 ? SEG_64BIT :
-				  segsize == 32 ? SEG_32BIT : 0);
+    for (i = 0; i < MAX_OPERANDS; i++) {
+	ins->oprs[i].segment = ins->oprs[i].addr_size =
+	    (segsize == 64 ? SEG_64BIT : segsize == 32 ? SEG_32BIT : 0);
+    }
     ins->condition = -1;
     ins->rex = prefix->rex;
 
@@ -419,7 +419,7 @@ static int matches(const struct itemplate *t, uint8_t *data,
             default:
                 return FALSE;
             }
-	} else if (c >= 010 && c <= 012) {
+	} else if (c >= 010 && c <= 013) {
             int t = *r++, d = *data++;
             if (d < t || d > t + 7)
                 return FALSE;
@@ -428,20 +428,17 @@ static int matches(const struct itemplate *t, uint8_t *data,
 		    (ins->rex & REX_B ? 8 : 0);
                 ins->oprs[c - 010].segment |= SEG_RMREG;
             }
-        } else if (c == 017) {
-            if (*data++)
-                return FALSE;
-	} else if (c >= 014 && c <= 016) {
+	} else if (c >= 014 && c <= 017) {
             ins->oprs[c - 014].offset = (int8_t)*data++;
             ins->oprs[c - 014].segment |= SEG_SIGNED;
-        } else if (c >= 020 && c <= 022) {
+        } else if (c >= 020 && c <= 023) {
             ins->oprs[c - 020].offset = *data++;
-	} else if (c >= 024 && c <= 026) {
+	} else if (c >= 024 && c <= 027) {
             ins->oprs[c - 024].offset = *data++;
-	} else if (c >= 030 && c <= 032) {
+	} else if (c >= 030 && c <= 033) {
             ins->oprs[c - 030].offset = getu16(data);
 	    data += 2;
-        } else if (c >= 034 && c <= 036) {
+        } else if (c >= 034 && c <= 037) {
 	    if (osize == 32) {
 		ins->oprs[c - 034].offset = getu32(data);
 		data += 4;
@@ -451,10 +448,10 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	    }
             if (segsize != asize)
                 ins->oprs[c - 034].addr_size = asize;
-        } else if (c >= 040 && c <= 042) {
+        } else if (c >= 040 && c <= 043) {
             ins->oprs[c - 040].offset = getu32(data);
 	    data += 4;
-        } else if (c >= 044 && c <= 046) {
+        } else if (c >= 044 && c <= 047) {
 	    switch (asize) {
 	    case 16:
 		ins->oprs[c - 044].offset = getu16(data);
@@ -471,18 +468,18 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	    }
             if (segsize != asize)
                 ins->oprs[c - 044].addr_size = asize;
-        } else if (c >= 050 && c <= 052) {
+        } else if (c >= 050 && c <= 053) {
             ins->oprs[c - 050].offset = gets8(data++);
             ins->oprs[c - 050].segment |= SEG_RELATIVE;
-        } else if (c >= 054 && c <= 056) {
+        } else if (c >= 054 && c <= 057) {
 	    ins->oprs[c - 054].offset = getu64(data);
 	    data += 8;
-	} else if (c >= 060 && c <= 062) {
+	} else if (c >= 060 && c <= 063) {
             ins->oprs[c - 060].offset = gets16(data);
 	    data += 2;
             ins->oprs[c - 060].segment |= SEG_RELATIVE;
             ins->oprs[c - 060].segment &= ~SEG_32BIT;
-        } else if (c >= 064 && c <= 066) {
+        } else if (c >= 064 && c <= 067) {
 	    if (osize == 16) {
 		ins->oprs[c - 064].offset = getu16(data);
 		data += 2;
@@ -498,30 +495,33 @@ static int matches(const struct itemplate *t, uint8_t *data,
                     (ins->oprs[c - 064].type & ~SIZE_MASK)
                     | ((osize == 16) ? BITS16 : BITS32);
             }
-        } else if (c >= 070 && c <= 072) {
+        } else if (c >= 070 && c <= 073) {
             ins->oprs[c - 070].offset = getu32(data);
 	    data += 4;
             ins->oprs[c - 070].segment |= SEG_32BIT | SEG_RELATIVE;
-        } else if (c >= 0100 && c < 0130) {
+        } else if (c >= 0100 && c < 0140) {
             int modrm = *data++;
             ins->oprs[c & 07].basereg = ((modrm >> 3)&7)+
 		(ins->rex & REX_R ? 8 : 0);
             ins->oprs[c & 07].segment |= SEG_RMREG;
             data = do_ea(data, modrm, asize, segsize,
                          &ins->oprs[(c >> 3) & 07], ins->rex);
-        } else if (c >= 0130 && c <= 0132) {
-            ins->oprs[c - 0130].offset = getu16(data);
+        } else if (c >= 0140 && c <= 0143) {
+            ins->oprs[c - 0140].offset = getu16(data);
 	    data += 2;
-        } else if (c >= 0140 && c <= 0142) {
-	    ins->oprs[c - 0140].offset = getu32(data);
+        } else if (c >= 0150 && c <= 0153) {
+	    ins->oprs[c - 0150].offset = getu32(data);
 	    data += 4;
+        } else if (c == 0170) {
+            if (*data++)
+                return FALSE;
         } else if (c >= 0200 && c <= 0277) {
             int modrm = *data++;
             if (((modrm >> 3) & 07) != (c & 07))
                 return FALSE;   /* spare field doesn't match up */
             data = do_ea(data, modrm, asize, segsize,
                          &ins->oprs[(c >> 3) & 07], ins->rex);
-        } else if (c >= 0300 && c <= 0302) {
+        } else if (c >= 0300 && c <= 0303) {
             a_used = TRUE;
         } else if (c == 0310) {
             if (asize != 16)
diff --git a/insns.dat b/insns.dat
index 422109e1..1595ba69 100644
--- a/insns.dat
+++ b/insns.dat
@@ -47,14 +47,14 @@ ADC       reg_eax,imm         \321\1\x15\41                 386,SM
 ADC       reg_rax,sbyte       \321\1\x83\202\15             X64,SM,ND
 ADC       reg_rax,imm         \321\1\x15\41                 X64,SM
 ADC       rm8,imm             \300\1\x80\202\21             8086,SM
-ADC       rm16,imm            \320\300\134\1\x81\202\131    8086,SM
-ADC       rm32,imm            \321\300\144\1\x81\202\141    386,SM
-ADC       rm64,imm            \324\300\144\1\x81\202\141    X64,SM
+ADC       rm16,imm            \320\300\145\1\x81\202\141    8086,SM
+ADC       rm32,imm            \321\300\155\1\x81\202\151    386,SM
+ADC       rm64,imm            \324\300\155\1\x81\202\151    X64,SM
 ADC       mem,imm8            \300\1\x80\202\21             8086,SM
-ADC       mem,imm16           \320\300\134\1\x81\202\131    8086,SM
-ADC       mem,imm32           \321\300\144\1\x81\202\141    386,SM
-ADD       mem,reg8            \300\17\101                   8086,SM
-ADD       reg8,reg8           \17\101                       8086
+ADC       mem,imm16           \320\300\145\1\x81\202\141    8086,SM
+ADC       mem,imm32           \321\300\155\1\x81\202\151    386,SM
+ADD       mem,reg8            \300\170\101                  8086,SM
+ADD       reg8,reg8           \170\101                      8086
 ADD       mem,reg16           \320\300\1\x01\101            8086,SM
 ADD       reg16,reg16         \320\1\x01\101                8086
 ADD       mem,reg32           \321\300\1\x01\101            386,SM
@@ -80,12 +80,12 @@ ADD       reg_eax,imm         \321\1\x05\41                 386,SM
 ADD       reg_rax,sbyte       \321\1\x83\200\15             X64,SM,ND
 ADD       reg_rax,imm         \323\1\x05\41                 X64,SM
 ADD       rm8,imm             \300\1\x80\200\21             8086,SM
-ADD       rm16,imm            \320\300\134\1\x81\200\131    8086,SM
-ADD       rm32,imm            \321\300\144\1\x81\200\141    386,SM
-ADD       rm64,imm            \324\300\144\1\x81\200\141    X64,SM
+ADD       rm16,imm            \320\300\145\1\x81\200\141    8086,SM
+ADD       rm32,imm            \321\300\155\1\x81\200\151    386,SM
+ADD       rm64,imm            \324\300\155\1\x81\200\151    X64,SM
 ADD       mem,imm8            \300\1\x80\200\21             8086,SM
-ADD       mem,imm16           \320\300\134\1\x81\200\131    8086,SM
-ADD       mem,imm32           \321\300\144\1\x81\200\141    386,SM
+ADD       mem,imm16           \320\300\145\1\x81\200\141    8086,SM
+ADD       mem,imm32           \321\300\155\1\x81\200\151    386,SM
 AND       mem,reg8            \300\1\x20\101                8086,SM
 AND       reg8,reg8           \1\x20\101                    8086
 AND       mem,reg16           \320\300\1\x21\101            8086,SM
@@ -113,12 +113,12 @@ AND       reg_eax,imm         \321\1\x25\41                 386,SM
 AND       reg_rax,sbyte       \321\1\x83\204\15             X64,SM,ND
 AND       reg_rax,imm         \324\1\x25\41                 X64,SM
 AND       rm8,imm             \300\1\x80\204\21             8086,SM
-AND       rm16,imm            \320\300\134\1\x81\204\131    8086,SM
-AND       rm32,imm            \321\300\144\1\x81\204\141    386,SM
-AND       rm64,imm            \324\300\144\1\x81\204\141    X64,SM
+AND       rm16,imm            \320\300\145\1\x81\204\141    8086,SM
+AND       rm32,imm            \321\300\155\1\x81\204\151    386,SM
+AND       rm64,imm            \324\300\155\1\x81\204\151    X64,SM
 AND       mem,imm8            \300\1\x80\204\21             8086,SM
-AND       mem,imm16           \320\300\134\1\x81\204\131    8086,SM
-AND       mem,imm32           \321\300\144\1\x81\204\141    386,SM
+AND       mem,imm16           \320\300\145\1\x81\204\141    8086,SM
+AND       mem,imm32           \321\300\155\1\x81\204\151    386,SM
 ARPL      mem,reg16           \300\1\x63\101                286,PROT,SM,NOLONG
 ARPL      reg16,reg16         \1\x63\101                    286,PROT,NOLONG
 BOUND     reg16,mem           \320\301\1\x62\110            186,NOLONG
@@ -175,13 +175,13 @@ BTS       rm32,imm            \321\300\2\x0F\xBA\205\25     386,SB
 BTS       rm64,imm            \324\300\2\x0F\xBA\205\25     X64,SB
 CALL      imm                 \322\1\xE8\64                 8086
 CALL      imm|near            \322\1\xE8\64                 8086
-CALL      imm|far             \322\1\x9A\34\37              8086,ND,NOLONG
+CALL      imm|far             \322\1\x9A\34\74              8086,ND,NOLONG
 CALL      imm16               \320\1\xE8\64                 8086
 CALL      imm16|near          \320\1\xE8\64                 8086
-CALL      imm16|far           \320\1\x9A\34\37              8086,ND,NOLONG
+CALL      imm16|far           \320\1\x9A\34\74              8086,ND,NOLONG
 CALL      imm32               \321\1\xE8\64                 386
 CALL      imm32|near          \321\1\xE8\64                 386
-CALL      imm32|far           \321\1\x9A\34\37              386,ND,NOLONG
+CALL      imm32|far           \321\1\x9A\34\74              386,ND,NOLONG
 CALL      imm:imm             \322\1\x9A\35\30              8086,NOLONG
 CALL      imm16:imm           \320\1\x9A\31\30              8086,NOLONG
 CALL      imm:imm16           \320\1\x9A\31\30              8086,NOLONG
@@ -238,12 +238,12 @@ CMP       reg_eax,imm         \321\1\x3D\41                 386,SM
 CMP       reg_rax,sbyte       \321\1\x83\207\15             X64,SM,ND
 CMP       reg_rax,imm         \321\1\x3D\41                 X64,SM
 CMP       rm8,imm             \300\1\x80\207\21             8086,SM
-CMP       rm16,imm            \320\300\134\1\x81\207\131    8086,SM
-CMP       rm32,imm            \321\300\144\1\x81\207\141    386,SM
-CMP       rm64,imm            \324\300\144\1\x81\207\141    X64,SM
+CMP       rm16,imm            \320\300\145\1\x81\207\141    8086,SM
+CMP       rm32,imm            \321\300\155\1\x81\207\151    386,SM
+CMP       rm64,imm            \324\300\155\1\x81\207\151    X64,SM
 CMP       mem,imm8            \300\1\x80\207\21             8086,SM
-CMP       mem,imm16           \320\300\134\1\x81\207\131    8086,SM
-CMP       mem,imm32           \321\300\144\1\x81\207\141    386,SM
+CMP       mem,imm16           \320\300\145\1\x81\207\141    8086,SM
+CMP       mem,imm32           \321\300\155\1\x81\207\151    386,SM
 CMPSB     void                \335\1\xA6                    8086
 CMPSD     void                \335\321\1\xA7                386
 CMPSQ     void                \335\324\1\xA7                X64
@@ -497,38 +497,38 @@ IMUL      reg64,reg64         \324\2\x0F\xAF\110            X64
 IMUL      reg16,mem,imm8      \320\301\1\x6B\110\16         186,SM
 IMUL      reg16,mem,sbyte     \320\301\1\x6B\110\16         186,SM,ND
 IMUL      reg16,mem,imm16     \320\301\1\x69\110\32         186,SM
-IMUL      reg16,mem,imm       \320\301\135\1\x69\110\132    186,SM,ND
+IMUL      reg16,mem,imm       \320\301\146\1\x69\110\142    186,SM,ND
 IMUL      reg16,reg16,imm8    \320\1\x6B\110\16             186
 IMUL      reg16,reg16,sbyte   \320\1\x6B\110\16             186,SM,ND
 IMUL      reg16,reg16,imm16   \320\1\x69\110\32             186
-IMUL      reg16,reg16,imm     \320\135\1\x69\110\132        186,SM,ND
+IMUL      reg16,reg16,imm     \320\146\1\x69\110\142        186,SM,ND
 IMUL      reg32,mem,imm8      \321\301\1\x6B\110\16         386,SM
 IMUL      reg32,mem,sbyte     \321\301\1\x6B\110\16         386,SM,ND
 IMUL      reg32,mem,imm32     \321\301\1\x69\110\42         386,SM
-IMUL      reg32,mem,imm       \321\301\145\1\x69\110\142    386,SM,ND
+IMUL      reg32,mem,imm       \321\301\156\1\x69\110\152    386,SM,ND
 IMUL      reg32,reg32,imm8    \321\1\x6B\110\16             386
 IMUL      reg32,reg32,sbyte   \321\1\x6B\110\16             386,SM,ND
 IMUL      reg32,reg32,imm32   \321\1\x69\110\42             386
-IMUL      reg32,reg32,imm     \321\145\1\x69\110\142        386,SM,ND
+IMUL      reg32,reg32,imm     \321\156\1\x69\110\152        386,SM,ND
 IMUL      reg64,mem,imm8      \324\301\1\x6B\110\16         X64,SM
 IMUL      reg64,mem,sbyte     \324\301\1\x6B\110\16         X64,SM,ND
 IMUL      reg64,mem,imm32     \324\301\1\x69\110\42         X64,SM
-IMUL      reg64,mem,imm       \324\301\145\1\x69\110\142    X64,SM,ND
+IMUL      reg64,mem,imm       \324\301\156\1\x69\110\152    X64,SM,ND
 IMUL      reg64,reg64,imm8    \324\1\x6B\110\16             X64
 IMUL      reg64,reg64,sbyte   \324\1\x6B\110\16             X64,SM,ND
 IMUL      reg64,reg64,imm32   \324\1\x69\110\42             X64
-IMUL      reg64,reg64,imm     \324\145\1\x69\110\142        X64,SM,ND
+IMUL      reg64,reg64,imm     \324\156\1\x69\110\152        X64,SM,ND
 IMUL      reg16,imm8          \320\1\x6B\100\15             186
 IMUL      reg16,sbyte         \320\1\x6B\100\15             186,SM,ND
 IMUL      reg16,imm16         \320\1\x69\100\31             186
-IMUL      reg16,imm           \320\134\1\x69\100\131        186,SM,ND
+IMUL      reg16,imm           \320\145\1\x69\100\141        186,SM,ND
 IMUL      reg32,imm8          \321\1\x6B\100\15             386
 IMUL      reg32,sbyte         \321\1\x6B\100\15             386,SM,ND
 IMUL      reg32,imm32         \321\1\x69\100\41             386
-IMUL      reg32,imm           \321\144\1\x69\100\141        386,SM,ND
+IMUL      reg32,imm           \321\155\1\x69\100\151        386,SM,ND
 IMUL      reg64,sbyte         \324\1\x6B\100\15             X64,SM,ND
 IMUL      reg64,imm32         \324\1\x69\100\41             X64
-IMUL      reg64,imm           \324\144\1\x69\100\141        X64,SM,ND
+IMUL      reg64,imm           \324\155\1\x69\100\151        X64,SM,ND
 IN        reg_al,imm          \1\xE4\25                     8086,SB
 IN        reg_ax,imm          \320\1\xE5\25                 8086,SB
 IN        reg_eax,imm         \321\1\xE5\25                 386,SB
@@ -564,13 +564,13 @@ JMP       imm|short           \1\xEB\50                     8086
 JMP       imm                 \371\1\xEB\50                 8086,ND
 JMP       imm                 \322\1\xE9\64                 8086
 JMP       imm|near            \322\1\xE9\64                 8086,ND
-JMP       imm|far             \322\1\xEA\34\37              8086,ND,NOLONG
+JMP       imm|far             \322\1\xEA\34\74              8086,ND,NOLONG
 JMP       imm16               \320\1\xE9\64                 8086
 JMP       imm16|near          \320\1\xE9\64                 8086,ND
-JMP       imm16|far           \320\1\xEA\34\37              8086,ND,NOLONG
+JMP       imm16|far           \320\1\xEA\34\74              8086,ND,NOLONG
 JMP       imm32               \321\1\xE9\64                 386
 JMP       imm32|near          \321\1\xE9\64                 386,ND
-JMP       imm32|far           \321\1\xEA\34\37              386,ND,NOLONG
+JMP       imm32|far           \321\1\xEA\34\74              386,ND,NOLONG
 JMP       imm:imm             \322\1\xEA\35\30              8086,NOLONG
 JMP       imm16:imm           \320\1\xEA\31\30              8086,NOLONG
 JMP       imm:imm16           \320\1\xEA\31\30              8086,NOLONG
@@ -618,9 +618,9 @@ LGDT      mem                 \300\2\x0F\x01\202            286,PRIV
 LGS       reg16,mem           \320\301\2\x0F\xB5\110        386
 LGS       reg32,mem           \321\301\2\x0F\xB5\110        386
 LIDT      mem                 \300\2\x0F\x01\203            286,PRIV
-LLDT      mem                 \300\1\x0F\17\202             286,PROT,PRIV
-LLDT      mem16               \300\1\x0F\17\202             286,PROT,PRIV
-LLDT      reg16               \1\x0F\17\202                 286,PROT,PRIV
+LLDT      mem                 \300\1\x0F\170\202            286,PROT,PRIV
+LLDT      mem16               \300\1\x0F\170\202            286,PROT,PRIV
+LLDT      reg16               \1\x0F\170\202                286,PROT,PRIV
 LMSW      mem                 \300\2\x0F\x01\206            286,PRIV
 LMSW      mem16               \300\2\x0F\x01\206            286,PRIV
 LMSW      reg16               \2\x0F\x01\206                286,PRIV
@@ -658,9 +658,9 @@ LSL       reg64,mem           \324\301\2\x0F\x03\110        X64,SM
 LSL       reg64,reg64         \324\2\x0F\x03\110            X64,PROT
 LSS       reg16,mem           \320\301\2\x0F\xB2\110        386
 LSS       reg32,mem           \321\301\2\x0F\xB2\110        386
-LTR       mem                 \300\1\x0F\17\203             286,PROT,PRIV
-LTR       mem16               \300\1\x0F\17\203             286,PROT,PRIV,NOLONG
-LTR       reg16               \1\x0F\17\203                 286,PROT,PRIV,NOLONG
+LTR       mem                 \300\1\x0F\170\203            286,PROT,PRIV
+LTR       mem16               \300\1\x0F\170\203            286,PROT,PRIV,NOLONG
+LTR       reg16               \1\x0F\170\203                286,PROT,PRIV,NOLONG
 MFENCE    void                \3\x0F\xAE\xF0                X64,AMD
 MONITOR   void		      \3\x0F\x01\xC8		    PRESCOTT
 MONITOR	  reg_eax,reg_ecx,reg_edx      \3\x0F\x01\xC8	    PRESCOTT,ND
@@ -788,12 +788,12 @@ OR        reg_eax,imm         \321\1\x0D\41                 386,SM
 OR        reg_rax,sbyte       \321\1\x83\201\15             X64,SM,ND
 OR        reg_rax,imm         \321\1\x0D\41                 X64,SM
 OR        rm8,imm             \300\1\x80\201\21             8086,SM
-OR        rm16,imm            \320\300\134\1\x81\201\131    8086,SM
-OR        rm32,imm            \321\300\144\1\x81\201\141    386,SM
-OR        rm64,imm            \324\300\144\1\x81\201\141    X64,SM
+OR        rm16,imm            \320\300\145\1\x81\201\141    8086,SM
+OR        rm32,imm            \321\300\155\1\x81\201\151    386,SM
+OR        rm64,imm            \324\300\155\1\x81\201\151    X64,SM
 OR        mem,imm8            \300\1\x80\201\21             8086,SM
-OR        mem,imm16           \320\300\134\1\x81\201\131    8086,SM
-OR        mem,imm32           \321\300\144\1\x81\201\141    386,SM
+OR        mem,imm16           \320\300\145\1\x81\201\141    8086,SM
+OR        mem,imm32           \321\300\155\1\x81\201\151    386,SM
 OUT       imm,reg_al          \1\xE6\24                     8086,SB
 OUT       imm,reg_ax          \320\1\xE7\24                 8086,SB
 OUT       imm,reg_eax         \321\1\xE7\24                 386,SB
@@ -987,9 +987,9 @@ PUSH      reg_dess            \6                            8086,NOLONG
 PUSH      reg_fsgs            \1\x0F\7                      386
 PUSH      imm8                \1\x6A\14                     186
 PUSH      sbyte               \1\x6A\14                     186,ND
-PUSH      imm16               \320\133\1\x68\130            186
-PUSH      imm32               \321\143\1\x68\140            386,NOLONG
-PUSH      imm64               \321\143\1\x68\140            X64
+PUSH      imm16               \320\144\1\x68\140            186
+PUSH      imm32               \321\154\1\x68\150            386,NOLONG
+PUSH      imm64               \321\154\1\x68\150            X64
 PUSH	  imm		      \1\x68\34			    186
 PUSHA     void                \322\1\x60                    186,NOLONG
 PUSHAD    void                \321\1\x60                    386,NOLONG
@@ -1121,12 +1121,12 @@ SBB       reg_eax,imm         \321\1\x1D\41                 386,SM
 SBB       reg_rax,sbyte       \321\1\x83\203\15             X64,SM,ND
 SBB       reg_rax,imm         \321\1\x1D\41                 X64,SM
 SBB       rm8,imm             \300\1\x80\203\21             8086,SM
-SBB       rm16,imm            \320\300\134\1\x81\203\131    8086,SM
-SBB       rm32,imm            \321\300\144\1\x81\203\141    386,SM
-SBB       rm64,imm            \324\300\144\1\x81\203\141    X64,SM
+SBB       rm16,imm            \320\300\145\1\x81\203\141    8086,SM
+SBB       rm32,imm            \321\300\155\1\x81\203\151    386,SM
+SBB       rm64,imm            \324\300\155\1\x81\203\151    X64,SM
 SBB       mem,imm8            \300\1\x80\203\21             8086,SM
-SBB       mem,imm16           \320\300\134\1\x81\203\131    8086,SM
-SBB       mem,imm32           \321\300\144\1\x81\203\141    386,SM
+SBB       mem,imm16           \320\300\145\1\x81\203\141    8086,SM
+SBB       mem,imm32           \321\300\155\1\x81\203\151    386,SM
 SCASB     void                \335\1\xAE                    8086
 SCASD     void                \335\321\1\xAF                386
 SCASQ     void                \335\324\1\xAF                X64
@@ -1182,10 +1182,10 @@ SHRD      reg32,reg32,reg_cl  \321\2\x0F\xAD\101            386
 SHRD      mem,reg64,reg_cl    \300\324\2\x0F\xAD\101        X64,SM
 SHRD      reg64,reg64,reg_cl  \324\2\x0F\xAD\101            X64
 SIDT      mem                 \300\2\x0F\x01\201            286
-SLDT      mem                 \300\1\x0F\17\200             286
-SLDT      mem16               \300\1\x0F\17\200             286
-SLDT      reg16               \320\1\x0F\17\200             286
-SLDT      reg32               \321\1\x0F\17\200             386
+SLDT      mem                 \300\1\x0F\170\200            286
+SLDT      mem16               \300\1\x0F\170\200            286
+SLDT      reg16               \320\1\x0F\170\200            286
+SLDT      reg32               \321\1\x0F\170\200            386
 SKINIT    void                \3\x0F\x01\xDE                X64
 SMI       void                \1\xF1                        386,UNDOC
 SMINT     void                \2\x0F\x38                    P6,CYRIX
@@ -1203,11 +1203,11 @@ STOSB     void                \1\xAA                        8086
 STOSD     void                \321\1\xAB                    386
 STOSQ     void                \324\1\xAB                    X64
 STOSW     void                \320\1\xAB                    8086
-STR       mem                 \300\1\x0F\17\201             286,PROT
-STR       mem16               \300\1\x0F\17\201             286,PROT
-STR       reg16               \320\1\x0F\17\201             286,PROT
-STR       reg32               \321\1\x0F\17\201             386,PROT
-STR       reg64               \324\1\x0F\17\201             X64
+STR       mem                 \300\1\x0F\170\201            286,PROT
+STR       mem16               \300\1\x0F\170\201            286,PROT
+STR       reg16               \320\1\x0F\170\201            286,PROT
+STR       reg32               \321\1\x0F\170\201            386,PROT
+STR       reg64               \324\1\x0F\170\201            X64
 SUB       mem,reg8            \300\1\x28\101                8086,SM
 SUB       reg8,reg8           \1\x28\101                    8086
 SUB       mem,reg16           \320\300\1\x29\101            8086,SM
@@ -1235,12 +1235,12 @@ SUB       reg_eax,imm         \321\1\x2D\41                 386,SM
 SUB       reg_rax,sbyte       \321\1\x83\205\15             X64,SM,ND
 SUB       reg_rax,imm         \321\1\x2D\41                 X64,SM
 SUB       rm8,imm             \300\1\x80\205\21             8086,SM
-SUB       rm16,imm            \320\300\134\1\x81\205\131    8086,SM
-SUB       rm32,imm            \321\300\144\1\x81\205\141    386,SM
-SUB       rm64,imm            \324\300\144\1\x81\205\141    X64,SM
+SUB       rm16,imm            \320\300\145\1\x81\205\141    8086,SM
+SUB       rm32,imm            \321\300\155\1\x81\205\151    386,SM
+SUB       rm64,imm            \324\300\155\1\x81\205\151    X64,SM
 SUB       mem,imm8            \300\1\x80\205\21             8086,SM
-SUB       mem,imm16           \320\300\134\1\x81\205\131    8086,SM
-SUB       mem,imm32           \321\300\144\1\x81\205\141    386,SM
+SUB       mem,imm16           \320\300\145\1\x81\205\141    8086,SM
+SUB       mem,imm32           \321\300\155\1\x81\205\151    386,SM
 SVDC      mem80,reg_sreg      \300\2\x0F\x78\101            486,CYRIX,SMM
 SVLDT     mem80               \300\2\x0F\x7A\200            486,CYRIX,SMM
 SVTS      mem80               \300\2\x0F\x7C\200            486,CYRIX,SMM
@@ -1287,12 +1287,12 @@ UMOV      reg16,mem           \320\301\2\x0F\x13\110        386,UNDOC,SM
 UMOV      reg16,reg16         \320\2\x0F\x13\110            386,UNDOC
 UMOV      reg32,mem           \321\301\2\x0F\x13\110        386,UNDOC,SM
 UMOV      reg32,reg32         \321\2\x0F\x13\110            386,UNDOC
-VERR      mem                 \300\1\x0F\17\204             286,PROT
-VERR      mem16               \300\1\x0F\17\204             286,PROT
-VERR      reg16               \1\x0F\17\204                 286,PROT
-VERW      mem                 \300\1\x0F\17\205             286,PROT
-VERW      mem16               \300\1\x0F\17\205             286,PROT
-VERW      reg16               \1\x0F\17\205                 286,PROT
+VERR      mem                 \300\1\x0F\170\204            286,PROT
+VERR      mem16               \300\1\x0F\170\204            286,PROT
+VERR      reg16               \1\x0F\170\204                286,PROT
+VERW      mem                 \300\1\x0F\170\205            286,PROT
+VERW      mem16               \300\1\x0F\170\205            286,PROT
+VERW      reg16               \1\x0F\170\205                286,PROT
 WAIT      void                \1\x9B                        8086
 FWAIT     void                \1\x9B                        8086
 WBINVD    void                \2\x0F\x09                    486,PRIV
@@ -1360,12 +1360,12 @@ XOR       reg_eax,imm         \321\1\x35\41                 386,SM
 XOR       reg_rax,sbyte       \321\1\x83\206\15             X64,SM,ND
 XOR       reg_rax,imm         \321\1\x35\41                 X64,SM
 XOR       rm8,imm             \300\1\x80\206\21             8086,SM
-XOR       rm16,imm            \320\300\134\1\x81\206\131    8086,SM
-XOR       rm32,imm            \321\300\144\1\x81\206\141    386,SM
-XOR       rm64,imm            \324\300\144\1\x81\206\141    X64,SM
+XOR       rm16,imm            \320\300\145\1\x81\206\141    8086,SM
+XOR       rm32,imm            \321\300\155\1\x81\206\151    386,SM
+XOR       rm64,imm            \324\300\155\1\x81\206\151    X64,SM
 XOR       mem,imm8            \300\1\x80\206\21             8086,SM
-XOR       mem,imm16           \320\300\134\1\x81\206\131    8086,SM
-XOR       mem,imm32           \321\300\144\1\x81\206\141    386,SM
+XOR       mem,imm16           \320\300\145\1\x81\206\141    8086,SM
+XOR       mem,imm32           \321\300\155\1\x81\206\151    386,SM
 XSTORE    void                \3\x0F\xA7\xC0                P6,CYRIX
 CMOVcc    reg16,mem           \320\301\1\x0F\330\x40\110    P6,SM
 CMOVcc    reg16,reg16         \320\1\x0F\330\x40\110        P6
diff --git a/insns.h b/insns.h
index 4deccf94..21dfd93b 100644
--- a/insns.h
+++ b/insns.h
@@ -9,7 +9,7 @@
 #ifndef NASM_INSNS_H
 #define NASM_INSNS_H
 
-#include "insnsi.h"             /* instruction opcode enum */
+#include "nasm.h"
 
 /* max length of any instruction, register name etc. */
 #if MAX_INSLEN > 9              /* MAX_INSLEN defined in insnsi.h */
@@ -21,7 +21,7 @@
 struct itemplate {
     enum opcode opcode;		/* the token, passed from "parser.c" */
     int operands;		/* number of operands */
-    int32_t opd[3];		/* bit flags for operand types */
+    int32_t opd[MAX_OPERANDS];	/* bit flags for operand types */
     const char *code;		/* the code it assembles to */
     uint32_t flags;		/* some flags */
 };
@@ -66,12 +66,14 @@ extern const struct itemplate * const * const itable[];
 #define IF_SM2    0x00000002UL  /* size match first two operands */
 #define IF_SB     0x00000004UL  /* unsized operands can't be non-byte */
 #define IF_SW     0x00000008UL  /* unsized operands can't be non-word */
-#define IF_SD     0x00000010UL  /* unsized operands can't be non-dword */
-#define IF_SQ     0x00000020UL  /* unsized operands can't be non-qword */
-#define IF_AR0	  0x00000040UL  /* SB, SW, SD applies to argument 0 */
-#define IF_AR1	  0x00000080UL  /* SB, SW, SD applies to argument 1 */
-#define IF_AR2	  0x000000C0UL  /* SB, SW, SD applies to argument 2 */
-#define IF_ARMASK 0x000000C0UL  /* mask for unsized argument spec */
+#define IF_SD     0x0000000CUL  /* unsized operands can't be non-dword */
+#define IF_SQ     0x00000010UL  /* unsized operands can't be non-qword */
+#define IF_SMASK  0x0000001CUL  /* mask for unsized argument size */
+#define IF_AR0	  0x00000020UL  /* SB, SW, SD applies to argument 0 */
+#define IF_AR1	  0x00000040UL  /* SB, SW, SD applies to argument 1 */
+#define IF_AR2	  0x00000060UL  /* SB, SW, SD applies to argument 2 */
+#define IF_AR3	  0x00000080UL  /* SB, SW, SD applies to argument 3 */
+#define IF_ARMASK 0x000000E0UL  /* mask for unsized argument spec */
 #define IF_PRIV   0x00000100UL  /* it's a privileged instruction */
 #define IF_SMM    0x00000200UL  /* it's only valid in SMM */
 #define IF_PROT   0x00000400UL  /* it's protected mode only */
diff --git a/insns.pl b/insns.pl
index 421f16aa..e596b48b 100644
--- a/insns.pl
+++ b/insns.pl
@@ -203,45 +203,52 @@ if ( !defined($output) || $output eq 'n' ) {
 printf STDERR "Done: %d instructions\n", $insns;
 
 sub format {
-  local ($opcode, $operands, $codes, $flags) = @_;
-  local $num, $nd = 0;
+    my ($opcode, $operands, $codes, $flags) = @_;
+    my ($num, $nd, @ops) = (undef, 0);	# parens make all three lexical
 
-  return (undef, undef) if $operands eq "ignore";
-
-  # format the operands
-  $operands =~ s/:/|colon,/g;
-  $operands =~ s/mem(\d+)/mem|bits$1/g;
-  $operands =~ s/mem/memory/g;
-  $operands =~ s/memory_offs/mem_offs/g;
-  $operands =~ s/imm(\d+)/imm|bits$1/g;
-  $operands =~ s/imm/immediate/g;
-  $operands =~ s/rm(\d+)/rm_gpr|bits$1/g;
-  $operands =~ s/mmxrm/rm_mmx/g;
-  $operands =~ s/xmmrm/rm_xmm/g;
-  $num = 3;
-  $operands = '0,0,0', $num = 0 if $operands eq 'void';
-  $operands .= ',0', $num-- while $operands !~ /,.*,/;
-  $operands =~ tr/a-z/A-Z/;
-
-  # format the flags
-  $flags =~ s/,/|IF_/g;
-  $flags =~ s/(\|IF_ND|IF_ND\|)//, $nd = 1 if $flags =~ /IF_ND/;
-  $flags = "IF_" . $flags;
-
-  ("{I_$opcode, $num, {$operands}, \"$codes\", $flags},", $nd);
+    return (undef, undef) if $operands eq "ignore";
+    
+    # format the operands
+    $operands =~ s/:/|colon,/g;
+    $operands =~ s/mem(\d+)/mem|bits$1/g;
+    $operands =~ s/mem/memory/g;
+    $operands =~ s/memory_offs/mem_offs/g;
+    $operands =~ s/imm(\d+)/imm|bits$1/g;
+    $operands =~ s/imm/immediate/g;
+    $operands =~ s/rm(\d+)/rm_gpr|bits$1/g;
+    $operands =~ s/mmxrm/rm_mmx/g;
+    $operands =~ s/xmmrm/rm_xmm/g;
+    if ($operands eq 'void') {
+	@ops = ();
+    } else {
+	@ops = split(/\,/, $operands);
+    }
+    $num = scalar(@ops);
+    while (scalar(@ops) < 4) {
+	push(@ops, '0');
+    }
+    $operands = join(',', @ops);
+    $operands =~ tr/a-z/A-Z/;
+    
+    # format the flags
+    $flags =~ s/,/|IF_/g;
+    $flags =~ s/(\|IF_ND|IF_ND\|)//, $nd = 1 if $flags =~ /IF_ND/;
+    $flags = "IF_" . $flags;
+    
+    ("{I_$opcode, $num, {$operands}, \"$codes\", $flags},", $nd);
 }
 
 # Here we determine the range of possible starting bytes for a given
 # instruction. We need only consider the codes:
 # \1 \2 \3     mean literal bytes, of course
 # \4 \5 \6 \7  mean PUSH/POP of segment registers: special case
-# \10 \11 \12  mean byte plus register value
-# \17          means byte zero
+# \1[0123]     mean byte plus register value
+# \170         means byte zero
 # \330         means byte plus condition code
 # \0 or \340   mean give up and return empty set
 sub startbyte {
-  local ($codes) = @_;
-  local $word, @range;
+  my ($codes) = @_;
+  my $word, @range;
 
   while (1) {
     die "couldn't get code in '$codes'" if $codes !~ /^(\\[^\\]+)(\\.*)?$/;
@@ -251,8 +258,8 @@ sub startbyte {
     return (0xA1, 0xA9) if $word eq "\\5";
     return (0x06, 0x0E, 0x16, 0x1E) if $word eq "\\6";
     return (0xA0, 0xA8) if $word eq "\\7";
-    $start=hex $1, $r=8, last if $word =~ /^\\1[012]$/ && $codes =~/^\\x(..)/;
-    return (0) if $word eq "\\17";
+    $start=hex $1, $r=8, last if $word =~ /^\\1[0123]$/ && $codes =~/^\\x(..)/;
+    return (0) if $word eq "\\170";
     $start=hex $1, $r=16, last if $word =~ /^\\330$/ && $codes =~ /^\\x(..)/;
     return () if $word eq "\\0" || $word eq "\\340";
   }
diff --git a/nasm.h b/nasm.h
index 4ae93b61..7c5a1b75 100644
--- a/nasm.h
+++ b/nasm.h
@@ -607,6 +607,7 @@ typedef struct extop {          /* extended operand */
 } extop;
 
 #define MAXPREFIX 4
+#define MAX_OPERANDS 4
 
 typedef struct {                /* an instruction itself */
     char *label;              /* the label defined, or NULL */
@@ -616,7 +617,7 @@ typedef struct {                /* an instruction itself */
     enum ccode condition;       /* the condition code, if Jcc/SETcc */
     int operands;               /* how many operands? 0-3 
                                  * (more if db et al) */
-    operand oprs[3];            /* the operands, defined as above */
+    operand oprs[MAX_OPERANDS]; /* the operands, defined as above */
     extop *eops;                /* extended operands */
     int eops_float;             /* true if DD and floating */
     int32_t times;              /* repeat count (TIMES prefix) */

From 19315e012fda54ec3e4af65849170ab335dcc36f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 16:20:45 -0700
Subject: [PATCH 02/29] Enable IF_AR3

Enable IF_AR3, which was incorrectly disabled in a previous checkin.
---
 assemble.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/assemble.c b/assemble.c
index 7dc2b25b..0821101a 100644
--- a/assemble.c
+++ b/assemble.c
@@ -1630,11 +1630,9 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
         case IF_AR2:
             i = 2;
             break;
-#if 0 /* Need to reorganize instruction flags to fit IF_AR3 */
 	case IF_AR3:
 	    i = 3;
 	    break;
-#endif
         default:
             break;              /* Shouldn't happen */
         }

From 8f94f988f0413c35520095866e00ac358d36c99c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 16:31:33 -0700
Subject: [PATCH 03/29] Fix a few instances of missing renumbers

parser.c: change hard-coded argument count 3 to MAX_OPERANDS
assemble.c: change a few missed code renumbers
---
 assemble.c | 4 ++--
 parser.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/assemble.c b/assemble.c
index 0821101a..f1b4dbb9 100644
--- a/assemble.c
+++ b/assemble.c
@@ -1272,7 +1272,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
                     errfunc(ERR_WARNING, "word value exceeds bounds");
                 }
                 out(offset, segment, &data, OUT_ADDRESS + 2,
-                    ins->oprs[c - 0140].segment, ins->oprs[c - 0130].wrt);
+                    ins->oprs[c - 0140].segment, ins->oprs[c - 0140].wrt);
                 offset += 2;
             }
             break;
@@ -1302,7 +1302,7 @@ static void gencode(int32_t segment, int32_t offset, int bits,
                 offset++;
             } else {
                 out(offset, segment, &data, OUT_ADDRESS + 4,
-                    ins->oprs[c - 0150].segment, ins->oprs[c - 0140].wrt);
+                    ins->oprs[c - 0150].segment, ins->oprs[c - 0150].wrt);
                 offset += 4;
             }
             break;
diff --git a/parser.c b/parser.c
index 1c7b8d9b..16164d77 100644
--- a/parser.c
+++ b/parser.c
@@ -339,10 +339,10 @@ insn *parse_line(int pass, char *buffer, insn * result,
         return result;
     }
 
-    /* right. Now we begin to parse the operands. There may be up to three
+    /* right. Now we begin to parse the operands. There may be up to four
      * of these, separated by commas, and terminated by a zero token. */
 
-    for (operand = 0; operand < 3; operand++) {
+    for (operand = 0; operand < MAX_OPERANDS; operand++) {
         expr *value;            /* used most of the time */
         int mref;               /* is this going to be a memory ref? */
         int bracket;            /* is it a [] mref, or a & mref? */

From 401c07e20d14130a2d147468a408fce9edd1faff Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 16:55:04 -0700
Subject: [PATCH 04/29] Initial support for generating DREX suffixes

Initial support for generating DREX suffixes.  Not used yet.  No
disassembler support yet, and no support for "operand X must match
operand Y."
---
 assemble.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 nasm.h     |  5 ++++-
 2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/assemble.c b/assemble.c
index f1b4dbb9..a140c729 100644
--- a/assemble.c
+++ b/assemble.c
@@ -39,9 +39,16 @@
  * \150..\153     - an immediate dword or signed byte for operand 0..3
  * \154..\157     - or 2 (s-field) into next opcode byte if operand 0..3
  *		    is a signed byte rather than a dword.
+ * \160..\163    - this instruction uses DREX rather than REX, with the
+ *		   OC0 field set to 0, and the dest field taken from
+ *                 operand 0..3.
+ * \164..\167    - this instruction uses DREX rather than REX, with the
+ *		   OC0 field set to 1, and the dest field taken from
+ *                 operand 0..3.
  * \170          - encodes the literal byte 0. (Some compilers don't take
  *                 kindly to a zero byte in the _middle_ of a compile time
  *                 string constant, so I had to put this hack in.)
+ * \171		 - placement of DREX suffix in the absence of an EA
  * \2ab          - a ModRM, calculated on EA in operand a, with the spare
  *                 field equal to digit b.
  * \30x          - might be an 0x67 byte, depending on the address size of
@@ -843,9 +850,25 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
             codes += 2;
             length++;
             break;
+	case 0160:
+	case 0161:
+	case 0162:
+	case 0163:
+	    length++;
+	    ins->rex |= REX_D;
+	    break;
+	case 0164:
+	case 0165:
+	case 0166:
+	case 0167:
+	    length++;
+	    ins->rex |= REX_D|REX_OC;
+	    break;
         case 0170:
             length++;
             break;
+	case 0171:
+	    break;
         case 0300:
         case 0301:
         case 0302:         
@@ -945,7 +968,14 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
         }
 
     ins->rex &= rex_mask;
-    if (ins->rex & REX_REAL) {
+    
+    if (ins->rex & REX_D) {
+	if (ins->rex & REX_H) {
+	    errfunc(ERR_NONFATAL, "cannot use high register in drex instruction");
+	    return -1;
+	}
+	length++;
+    } else if (ins->rex & REX_REAL) {
 	if (ins->rex & REX_H) {
 	    errfunc(ERR_NONFATAL, "cannot use high register in rex instruction");
 	    return -1;
@@ -964,7 +994,7 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 }
 
 #define EMIT_REX()							\
-    if((ins->rex & REX_REAL) && (bits == 64)) {				\
+    if (!(ins->rex & REX_D) && (ins->rex & REX_REAL) && (bits == 64)) {	\
 	ins->rex = (ins->rex & REX_REAL)|REX_P;				\
 	out(offset, segment, &ins->rex, OUT_RAWDATA+1, NO_SEG, NO_SEG); \
 	ins->rex = 0;							\
@@ -1320,12 +1350,33 @@ static void gencode(int32_t segment, int32_t offset, int bits,
             offset++;
             break;
 
+	case 0160:
+	case 0161:
+	case 0162:
+	case 0163:
+	case 0164:
+	case 0165:
+	case 0166:
+	case 0167:
+	    ins->drexdst = regval(&ins->oprs[c & 3]);
+	    break;
+
         case 0170:
             bytes[0] = 0;
             out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
             offset += 1;
             break;
 
+	case 0171:
+	    bytes[0] =
+		(ins->drexdst << 4) |
+		(ins->rex & REX_OC ? 0x08 : 0) |
+		(ins->rex & (REX_R|REX_X|REX_B));
+	    ins->rex = 0;
+            out(offset, segment, bytes, OUT_RAWDATA + 1, NO_SEG, NO_SEG);
+	    offset++;
+	    break;
+
         case 0300:
         case 0301:
         case 0302:
@@ -1487,6 +1538,15 @@ static void gencode(int32_t segment, int32_t offset, int bits,
                 if (ea_data.sib_present)
                     *p++ = ea_data.sib;
 
+		/* DREX suffixes come between the SIB and the displacement */
+		if (ins->rex & REX_D) {
+		    *p++ =
+			(ins->drexdst << 4) |
+			(ins->rex & REX_OC ? 0x08 : 0) |
+			(ins->rex & (REX_R|REX_X|REX_B));
+		    ins->rex = 0;
+		}
+
                 s = p - bytes;
                 out(offset, segment, bytes, OUT_RAWDATA + s,
                     NO_SEG, NO_SEG);
diff --git a/nasm.h b/nasm.h
index 7c5a1b75..93c35de6 100644
--- a/nasm.h
+++ b/nasm.h
@@ -540,6 +540,8 @@ enum ccode {			/* condition code names */
 /*
  * REX flags
  */
+#define REX_OC		0x0200	/* DREX suffix has the OC0 bit set */
+#define REX_D		0x0100	/* Instruction uses DREX instead of REX */
 #define REX_H		0x80	/* High register present, REX forbidden */
 #define REX_P		0x40	/* REX prefix present/required */
 #define REX_L		0x20	/* Use LOCK prefix instead of REX.R */
@@ -622,7 +624,8 @@ typedef struct {                /* an instruction itself */
     int eops_float;             /* true if DD and floating */
     int32_t times;              /* repeat count (TIMES prefix) */
     int forw_ref;               /* is there a forward reference? */
-    uint8_t rex;                /* Special REX Prefix */
+    int rex;			/* Special REX Prefix */
+    int drexdst;		/* Destination register for DREX suffix */
 } insn;
 
 enum geninfo { GI_SWITCH };

From cf5180a9553e43bbaa46fd1a77c75dc8b7f6da42 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 17:25:27 -0700
Subject: [PATCH 05/29] Actually generate SSE5 instructions

This checkin completes what is required to actually generate SSE5
instructions.  No support in the disassembler yet.

This checkin covers:

- Support for actually generating DREX suffixes.
- Support for operand matching ("operand X must match operand Y")
---
 assemble.c     | 19 +++++++++++++++----
 insns.dat      | 18 ++++++++++++++++++
 insns.h        |  3 ++-
 insns.pl       |  1 +
 nasm.h         | 12 +++++++++++-
 test/fmsub.asm | 16 ++++++++++++++++
 6 files changed, 63 insertions(+), 6 deletions(-)
 create mode 100644 test/fmsub.asm

diff --git a/assemble.c b/assemble.c
index a140c729..ec3b1124 100644
--- a/assemble.c
+++ b/assemble.c
@@ -856,6 +856,7 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 	case 0163:
 	    length++;
 	    ins->rex |= REX_D;
+	    ins->drexdst = regval(&ins->oprs[c & 3]);
 	    break;
 	case 0164:
 	case 0165:
@@ -863,6 +864,7 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 	case 0167:
 	    length++;
 	    ins->rex |= REX_D|REX_OC;
+	    ins->drexdst = regval(&ins->oprs[c & 3]);
 	    break;
         case 0170:
             length++;
@@ -974,6 +976,11 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 	    errfunc(ERR_NONFATAL, "cannot use high register in drex instruction");
 	    return -1;
 	}
+	if (bits != 64 && ((ins->rex & (REX_R|REX_W|REX_X|REX_B)) ||
+			   ins->drexdst > 7)) {	/* extended regs: 64-bit only */
+	    errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
+	    return -1;
+	}
 	length++;
     } else if (ins->rex & REX_REAL) {
 	if (ins->rex & REX_H) {
@@ -985,8 +992,8 @@ static int32_t calcsize(int32_t segment, int32_t offset, int bits,
 		    cpu >= IF_X86_64)) {
 	    length++;
 	} else {
-	  errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
-	  return -1;
+	    errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode");
+	    return -1;
 	}
     }
 
@@ -1358,7 +1365,6 @@ static void gencode(int32_t segment, int32_t offset, int bits,
 	case 0165:
 	case 0166:
 	case 0167:
-	    ins->drexdst = regval(&ins->oprs[c & 3]);
 	    break;
 
         case 0170:
@@ -1663,7 +1669,12 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
      * Check that the operand flags all match up
      */
     for (i = 0; i < itemp->operands; i++) {
-        if (itemp->opd[i] & ~instruction->oprs[i].type ||
+	if (itemp->opd[i] & SAME_AS) {
+	    int j = itemp->opd[i] & ~SAME_AS;
+	    if (instruction->oprs[i].type != instruction->oprs[j].type ||
+		instruction->oprs[i].basereg != instruction->oprs[j].basereg)
+		return 0;
+	} else  if (itemp->opd[i] & ~instruction->oprs[i].type ||
             ((itemp->opd[i] & SIZE_MASK) &&
              ((itemp->opd[i] ^ instruction->oprs[i].type) & SIZE_MASK))) {
             if ((itemp->opd[i] & ~instruction->oprs[i].type & ~SIZE_MASK) ||
diff --git a/insns.dat b/insns.dat
index 4c7b2f5b..60bfa047 100644
--- a/insns.dat
+++ b/insns.dat
@@ -2020,3 +2020,21 @@ PCMPGTQ		xmmreg,xmmrm		\366\3\x0F\x38\x37\110		SSE42
 POPCNT		reg16,rm16		\320\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg32,rm32		\321\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg64,rm32		\324\333\2\x0F\xB8\110		NEHALEM,X64
+
+; AMD SSE5 instructions
+FMSUBPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x08\132		SSE5
+FMSUBPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x08\123		SSE5
+FMSUBPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0C\121		SSE5
+FMSUBPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0C\112		SSE5
+FMSUBPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x09\132		SSE5
+FMSUBPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x09\123		SSE5
+FMSUBPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0D\121		SSE5
+FMSUBPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0D\112		SSE5
+FMSUBSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0A\132		SSE5
+FMSUBSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0A\123		SSE5
+FMSUBSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0E\121		SSE5
+FMSUBSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0E\112		SSE5
+FMSUBSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0B\132		SSE5
+FMSUBSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0B\123		SSE5
+FMSUBSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0F\121		SSE5
+FMSUBSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0F\112		SSE5
diff --git a/insns.h b/insns.h
index 21dfd93b..c7fa75a0 100644
--- a/insns.h
+++ b/insns.h
@@ -21,7 +21,7 @@
 struct itemplate {
     enum opcode opcode;		/* the token, passed from "parser.c" */
     int operands;		/* number of operands */
-    int32_t opd[MAX_OPERANDS];	/* bit flags for operand types */
+    opflags_t opd[MAX_OPERANDS]; /* bit flags for operand types */
     const char *code;		/* the code it assembles to */
     uint32_t flags;		/* some flags */
 };
@@ -90,6 +90,7 @@ extern const struct itemplate * const * const itable[];
 #define IF_SSSE3  0x00200000UL  /* it's an SSSE3 instruction */
 #define IF_SSE41  0x00400000UL  /* it's an SSE4.1 instruction */
 #define IF_SSE42  0x00800000UL  /* it's an SSE4.2 instruction */
+#define IF_SSE5   0x00800000UL  /* HACK: shares the IF_SSE42 bit; NEED TO REORGANIZE THESE BITS */
 #define IF_PMASK  0xFF000000UL  /* the mask for processor types */
 #define IF_PLEVEL 0x0F000000UL  /* the mask for processor instr. level */
                                         /* also the highest possible processor */
diff --git a/insns.pl b/insns.pl
index e596b48b..30f59c65 100644
--- a/insns.pl
+++ b/insns.pl
@@ -218,6 +218,7 @@ sub format {
     $operands =~ s/rm(\d+)/rm_gpr|bits$1/g;
     $operands =~ s/mmxrm/rm_mmx/g;
     $operands =~ s/xmmrm/rm_xmm/g;
+    $operands =~ s/\=([0-9]+)/same_as|$1/g;
     if ($operands eq 'void') {
 	@ops = ();
     } else {
diff --git a/nasm.h b/nasm.h
index 93c35de6..f5d64946 100644
--- a/nasm.h
+++ b/nasm.h
@@ -438,9 +438,16 @@ enum {
  * 25: RM_MMX (MMXREG)
  * 26: RM_XMM (XMMREG)
  *
- * Bits 27-31 are currently unallocated.
+ * Bits 27-29 & 31 are currently unallocated.
+ *
+ * 30: SAME_AS
+ * Special flag only used in instruction patterns; means this operand
+ * has to be identical to another operand.  Currently only supported
+ * for registers.
  */
 
+typedef uint32_t opflags_t;
+
 /* Size, and other attributes, of the operand */
 #define BITS8     	0x00000001L
 #define BITS16    	0x00000002L
@@ -527,6 +534,9 @@ enum {
 #define UNITY		0x00012000L   /* for shift/rotate instructions */
 #define SBYTE		0x00022000L   /* for op r16/32,immediate instrs. */
 
+/* special flags */
+#define SAME_AS		0x40000000L
+
 /* Register names automatically generated from regs.dat */
 #include "regs.h"
 
diff --git a/test/fmsub.asm b/test/fmsub.asm
new file mode 100644
index 00000000..7f087cd7
--- /dev/null
+++ b/test/fmsub.asm
@@ -0,0 +1,16 @@
+	bits 64
+
+	fmsubps xmm0,xmm0,xmm1,xmm2
+	fmsubps xmm0,xmm0,xmm1,[rax]
+	fmsubps xmm0,xmm0,xmm1,[rax+0x77]
+	fmsubps xmm0,xmm0,xmm1,[rax+0x7777]
+	fmsubps xmm1,xmm2,xmm3,xmm1
+	fmsubps xmm1,xmm2,[rax],xmm1
+	fmsubps xmm1,xmm2,[rax+0x77],xmm1
+	fmsubps xmm1,xmm2,[rax+0x7777],xmm1
+	fmsubps xmm0,[rax],xmm2,xmm0
+	fmsubps xmm0,[rax+0x77],xmm2,xmm0
+	fmsubps xmm0,[rax+0x7777],xmm2,xmm0
+	fmsubps xmm14,[rax],xmm2,xmm14
+	fmsubps xmm14,[rax+0x77],xmm2,xmm14
+	fmsubps xmm14,[rax+0x7777],xmm2,xmm14

From 0a80739c46f2a6e7217c56f0b96248388a8ea1c9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 17:27:46 -0700
Subject: [PATCH 06/29] insns.dat: All SSE5 instructions are AMD

SSE5 is an AMD-defined instruction set, so tag those instructions AMD.
---
 insns.dat | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/insns.dat b/insns.dat
index 60bfa047..fcf0bec8 100644
--- a/insns.dat
+++ b/insns.dat
@@ -2022,19 +2022,19 @@ POPCNT		reg32,rm32		\321\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg64,rm32		\324\333\2\x0F\xB8\110		NEHALEM,X64
 
 ; AMD SSE5 instructions
-FMSUBPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x08\132		SSE5
-FMSUBPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x08\123		SSE5
-FMSUBPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0C\121		SSE5
-FMSUBPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0C\112		SSE5
-FMSUBPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x09\132		SSE5
-FMSUBPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x09\123		SSE5
-FMSUBPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0D\121		SSE5
-FMSUBPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0D\112		SSE5
-FMSUBSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0A\132		SSE5
-FMSUBSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0A\123		SSE5
-FMSUBSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0E\121		SSE5
-FMSUBSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0E\112		SSE5
-FMSUBSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0B\132		SSE5
-FMSUBSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0B\123		SSE5
-FMSUBSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0F\121		SSE5
-FMSUBSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0F\112		SSE5
+FMSUBPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x08\132		SSE5,AMD
+FMSUBPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x08\123		SSE5,AMD
+FMSUBPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0C\121		SSE5,AMD
+FMSUBPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0C\112		SSE5,AMD
+FMSUBPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x09\132		SSE5,AMD
+FMSUBPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x09\123		SSE5,AMD
+FMSUBPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0D\121		SSE5,AMD
+FMSUBPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0D\112		SSE5,AMD
+FMSUBSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0A\132		SSE5,AMD
+FMSUBSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0A\123		SSE5,AMD
+FMSUBSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0E\121		SSE5,AMD
+FMSUBSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0E\112		SSE5,AMD
+FMSUBSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0B\132		SSE5,AMD
+FMSUBSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0B\123		SSE5,AMD
+FMSUBSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0F\121		SSE5,AMD
+FMSUBSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0F\112		SSE5,AMD

From 7786c364b455806e991b3ef785618ec16f940ee5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 17 Sep 2007 18:45:44 -0700
Subject: [PATCH 07/29] Disassembler support for SSE5 instructions

Support for the SSE5 instruction format in the disassembler.

Also adds some comments to insnsd.c for easier debugging.
---
 disasm.c | 207 +++++++++++++++++++++++++++++++++++--------------------
 insns.pl |   5 +-
 2 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/disasm.c b/disasm.c
index cfe86938..3a8f710d 100644
--- a/disasm.c
+++ b/disasm.c
@@ -166,17 +166,47 @@ static const char *whichcond(int condval)
     return conditions[conds[condval]];
 }
 
+/*
+ * Process a DREX suffix byte; returns the advanced pointer, or NULL on OC0 mismatch
+ */
+static uint8_t *do_drex(uint8_t *data, insn *ins)
+{
+    uint8_t drex = *data++;	/* consume the suffix byte */
+    operand *dst = &ins->oprs[ins->drexdst];	/* operand that receives the DREX register */
+
+    if ((drex & 8) != ((ins->rex & REX_OC) ? 8 : 0))
+	return NULL;	/* OC0 mismatch */
+    ins->rex = (ins->rex & ~7) | (drex & 7);	/* low three bits of the suffix replace those of rex */
+    
+    dst->segment = SEG_RMREG;	/* flag the operand as a register */
+    dst->basereg = drex >> 4;	/* high nibble is the register number */
+    return data;
+}
+
+
 /*
  * Process an effective address (ModRM) specification.
  */
 static uint8_t *do_ea(uint8_t *data, int modrm, int asize,
-		      int segsize, operand * op, int rex)
+		      int segsize, operand * op, insn *ins)
 {
     int mod, rm, scale, index, base;
+    int rex;
+    uint8_t sib = 0;
 
     mod = (modrm >> 6) & 03;
     rm = modrm & 07;
 
+    if (mod != 3 && rm == 4 && asize != 16)
+	sib = *data++;
+
+    if (ins->rex & REX_D) {
+	data = do_drex(data, ins);
+	if (!data)
+	    return NULL;
+    }
+    rex = ins->rex;
+
     if (mod == 3) {             /* pure register version */
         op->basereg = rm+(rex & REX_B ? 8 : 0);
         op->segment |= SEG_RMREG;
@@ -282,10 +312,9 @@ static uint8_t *do_ea(uint8_t *data, int modrm, int asize,
         }
 
         if (rm == 4) {          /* process SIB */
-            scale = (*data >> 6) & 03;
-            index = (*data >> 3) & 07;
-            base = *data & 07;
-            data++;
+            scale = (sib >> 6) & 03;
+            index = (sib >> 3) & 07;
+            base = sib & 07;
 
             op->scale = 1 << scale;
 
@@ -501,26 +530,37 @@ static int matches(const struct itemplate *t, uint8_t *data,
             ins->oprs[c - 070].segment |= SEG_32BIT | SEG_RELATIVE;
         } else if (c >= 0100 && c < 0140) {
             int modrm = *data++;
-            ins->oprs[c & 07].basereg = ((modrm >> 3)&7)+
-		(ins->rex & REX_R ? 8 : 0);
             ins->oprs[c & 07].segment |= SEG_RMREG;
             data = do_ea(data, modrm, asize, segsize,
-                         &ins->oprs[(c >> 3) & 07], ins->rex);
+			 &ins->oprs[(c >> 3) & 07], ins);
+	    if (!data)
+		return FALSE;
+            ins->oprs[c & 07].basereg = ((modrm >> 3)&7)+
+		(ins->rex & REX_R ? 8 : 0);
         } else if (c >= 0140 && c <= 0143) {
             ins->oprs[c - 0140].offset = getu16(data);
 	    data += 2;
         } else if (c >= 0150 && c <= 0153) {
 	    ins->oprs[c - 0150].offset = getu32(data);
 	    data += 4;
+	} else if (c >= 0160 && c <= 0167) {
+	    ins->rex |= (c & 4) ? REX_D|REX_OC : REX_D;
+	    ins->drexdst = c & 3;
         } else if (c == 0170) {
             if (*data++)
                 return FALSE;
+	} else if (c == 0171) {
+	    data = do_drex(data, ins);
+	    if (!data)
+		return FALSE;
         } else if (c >= 0200 && c <= 0277) {
             int modrm = *data++;
             if (((modrm >> 3) & 07) != (c & 07))
                 return FALSE;   /* spare field doesn't match up */
             data = do_ea(data, modrm, asize, segsize,
-                         &ins->oprs[(c >> 3) & 07], ins->rex);
+                         &ins->oprs[(c >> 3) & 07], ins);
+	    if (!data)
+		return FALSE;
         } else if (c >= 0300 && c <= 0303) {
             a_used = TRUE;
         } else if (c == 0310) {
@@ -605,6 +645,10 @@ static int matches(const struct itemplate *t, uint8_t *data,
 	}
     }
 
+    /* REX cannot be combined with DREX */
+    if ((ins->rex & REX_D) && (prefix->rex))
+	return FALSE;
+
     /*
      * Check for unused rep or a/o prefixes.
      */
@@ -692,19 +736,21 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
 	     * XXX: Need to make sure this is actually correct.
              */
             for (i = 0; i < (*p)->operands; i++) {
-                if (
-                       /* If it's a mem-only EA but we have a register, die. */
-                       ((tmp_ins.oprs[i].segment & SEG_RMREG) &&
-                        !(MEMORY & ~(*p)->opd[i])) ||
-                       /* If it's a reg-only EA but we have a memory ref, die. */
-                       (!(tmp_ins.oprs[i].segment & SEG_RMREG) &&
-                        !(REG_EA & ~(*p)->opd[i]) &&
-                        !((*p)->opd[i] & REG_SMASK)) ||
-                       /* Register type mismatch (eg FS vs REG_DESS): die. */
-                       ((((*p)->opd[i] & (REGISTER | FPUREG)) ||
-                         (tmp_ins.oprs[i].segment & SEG_RMREG)) &&
-                        !whichreg((*p)->opd[i],
-                                  tmp_ins.oprs[i].basereg, tmp_ins.rex))) {
+                if (!((*p)->opd[i] & SAME_AS) &&
+		    (
+			/* If it's a mem-only EA but we have a register, die. */
+			((tmp_ins.oprs[i].segment & SEG_RMREG) &&
+			 !(MEMORY & ~(*p)->opd[i])) ||
+			/* If it's a reg-only EA but we have a memory ref, die. */
+			(!(tmp_ins.oprs[i].segment & SEG_RMREG) &&
+			 !(REG_EA & ~(*p)->opd[i]) &&
+			 !((*p)->opd[i] & REG_SMASK)) ||
+			/* Register type mismatch (eg FS vs REG_DESS): die. */
+			((((*p)->opd[i] & (REGISTER | FPUREG)) ||
+			  (tmp_ins.oprs[i].segment & SEG_RMREG)) &&
+			 !whichreg((*p)->opd[i],
+				   tmp_ins.oprs[i].basereg, tmp_ins.rex))
+			)) {
                     works = FALSE;
                     break;
                 }
@@ -793,107 +839,116 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
     colon = FALSE;
     length += data - origdata;  /* fix up for prefixes */
     for (i = 0; i < (*p)->operands; i++) {
+	opflags_t t = (*p)->opd[i];
+	const operand *o = &ins.oprs[i];
+	int64_t offs;
+
+	if (t & SAME_AS) {
+	    o = &ins.oprs[t & ~SAME_AS];
+	    t = (*p)->opd[t & ~SAME_AS];
+	}
+
         output[slen++] = (colon ? ':' : i == 0 ? ' ' : ',');
 
-        if (ins.oprs[i].segment & SEG_RELATIVE) {
-            ins.oprs[i].offset += offset + length;
+	offs = o->offset;
+        if (o->segment & SEG_RELATIVE) {
+            offs += offset + length;
             /*
              * sort out wraparound
              */
-            if (!(ins.oprs[i].segment & (SEG_32BIT|SEG_64BIT)))
-		ins.oprs[i].offset &= 0xffff;
+            if (!(o->segment & (SEG_32BIT|SEG_64BIT)))
+		offs &= 0xffff;
             /*
              * add sync marker, if autosync is on
              */
             if (autosync)
-                add_sync(ins.oprs[i].offset, 0L);
+                add_sync(offs, 0L);
         }
 
-        if ((*p)->opd[i] & COLON)
+        if (t & COLON)
             colon = TRUE;
         else
             colon = FALSE;
 
-        if (((*p)->opd[i] & (REGISTER | FPUREG)) ||
-            (ins.oprs[i].segment & SEG_RMREG)) {
-            ins.oprs[i].basereg = whichreg((*p)->opd[i],
-                                           ins.oprs[i].basereg, ins.rex);
-            if ((*p)->opd[i] & TO)
+        if ((t & (REGISTER | FPUREG)) ||
+            (o->segment & SEG_RMREG)) {
+	    enum reg_enum reg;
+            reg = whichreg(t, o->basereg, ins.rex);
+            if (t & TO)
                 slen += snprintf(output + slen, outbufsize - slen, "to ");
             slen += snprintf(output + slen, outbufsize - slen, "%s",
-                             reg_names[ins.oprs[i].basereg -
-                                       EXPR_REG_START]);
-        } else if (!(UNITY & ~(*p)->opd[i])) {
+                             reg_names[reg - EXPR_REG_START]);
+        } else if (!(UNITY & ~t)) {
             output[slen++] = '1';
-        } else if ((*p)->opd[i] & IMMEDIATE) {
-            if ((*p)->opd[i] & BITS8) {
+        } else if (t & IMMEDIATE) {
+            if (t & BITS8) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "byte ");
-                if (ins.oprs[i].segment & SEG_SIGNED) {
-                    if (ins.oprs[i].offset < 0) {
-                        ins.oprs[i].offset *= -1;
+                if (o->segment & SEG_SIGNED) {
+                    if (offs < 0) {
+                        offs *= -1;
                         output[slen++] = '-';
                     } else
                         output[slen++] = '+';
                 }
-            } else if ((*p)->opd[i] & BITS16) {
+            } else if (t & BITS16) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "word ");
-            } else if ((*p)->opd[i] & BITS32) {
+            } else if (t & BITS32) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "dword ");
-            } else if ((*p)->opd[i] & BITS64) {
+            } else if (t & BITS64) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "qword ");
-            } else if ((*p)->opd[i] & NEAR) {
+            } else if (t & NEAR) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "near ");
-            } else if ((*p)->opd[i] & SHORT) {
+            } else if (t & SHORT) {
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "short ");
             }
             slen +=
                 snprintf(output + slen, outbufsize - slen, "0x%"PRIx64"",
-                         ins.oprs[i].offset);
-        } else if (!(MEM_OFFS & ~(*p)->opd[i])) {
+                         offs);
+        } else if (!(MEM_OFFS & ~t)) {
             slen +=
                 snprintf(output + slen, outbufsize - slen, "[%s%s%s0x%"PRIx64"]",
                          (segover ? segover : ""),
                          (segover ? ":" : ""),
-                         (ins.oprs[i].addr_size ==
-                          32 ? "dword " : ins.oprs[i].addr_size ==
-                          16 ? "word " : ""), ins.oprs[i].offset);
+                         (o->addr_size ==
+                          32 ? "dword " : o->addr_size ==
+                          16 ? "word " : ""), offs);
             segover = NULL;
-        } else if (!(REGMEM & ~(*p)->opd[i])) {
+        } else if (!(REGMEM & ~t)) {
             int started = FALSE;
-            if ((*p)->opd[i] & BITS8)
+            if (t & BITS8)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "byte ");
-            if ((*p)->opd[i] & BITS16)
+            if (t & BITS16)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "word ");
-            if ((*p)->opd[i] & BITS32)
+            if (t & BITS32)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "dword ");
-            if ((*p)->opd[i] & BITS64)
+            if (t & BITS64)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "qword ");
-            if ((*p)->opd[i] & BITS80)
+            if (t & BITS80)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "tword ");
-            if ((*p)->opd[i] & FAR)
+            if (t & FAR)
                 slen += snprintf(output + slen, outbufsize - slen, "far ");
-            if ((*p)->opd[i] & NEAR)
+            if (t & NEAR)
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "near ");
             output[slen++] = '[';
-            if (ins.oprs[i].addr_size)
+            if (o->addr_size)
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 (ins.oprs[i].addr_size == 64 ? "qword " :
-				  ins.oprs[i].addr_size == 32 ? "dword " :
-                                  ins.oprs[i].addr_size == 16 ? "word " :
+                                 (o->addr_size == 64 ? "qword " :
+				  o->addr_size == 32 ? "dword " :
+                                  o->addr_size == 16 ? "word " :
 				  ""));
-	    if (ins.oprs[i].eaflags & EAF_REL)
+	    if (o->eaflags & EAF_REL)
 		slen += snprintf(output + slen, outbufsize - slen, "rel ");
             if (segover) {
                 slen +=
@@ -901,27 +956,27 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                              segover);
                 segover = NULL;
             }
-            if (ins.oprs[i].basereg != -1) {
+            if (o->basereg != -1) {
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 reg_names[(ins.oprs[i].basereg -
+                                 reg_names[(o->basereg -
                                             EXPR_REG_START)]);
                 started = TRUE;
             }
-            if (ins.oprs[i].indexreg != -1) {
+            if (o->indexreg != -1) {
                 if (started)
                     output[slen++] = '+';
                 slen += snprintf(output + slen, outbufsize - slen, "%s",
-                                 reg_names[(ins.oprs[i].indexreg -
+                                 reg_names[(o->indexreg -
                                             EXPR_REG_START)]);
-                if (ins.oprs[i].scale > 1)
+                if (o->scale > 1)
                     slen +=
                         snprintf(output + slen, outbufsize - slen, "*%d",
-                                 ins.oprs[i].scale);
+                                 o->scale);
                 started = TRUE;
             }
-            if (ins.oprs[i].segment & SEG_DISP8) {
+            if (o->segment & SEG_DISP8) {
 		int minus = 0;
-		int8_t offset = ins.oprs[i].offset;
+		int8_t offset = offs;
 		if (offset < 0) {
 		    minus = 1;
 		    offset = -offset;
@@ -929,9 +984,9 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "%s0x%"PRIx8"",
 			     minus ? "-" : "+", offset);
-            } else if (ins.oprs[i].segment & SEG_DISP16) {
+            } else if (o->segment & SEG_DISP16) {
 		int minus = 0;
-		int16_t offset = ins.oprs[i].offset;
+		int16_t offset = offs;
 		if (offset < 0) {
 		    minus = 1;
 		    offset = -offset;
@@ -939,9 +994,9 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
                 slen +=
                     snprintf(output + slen, outbufsize - slen, "%s0x%"PRIx16"",
 			     minus ? "-" : started ? "+" : "", offset);
-            } else if (ins.oprs[i].segment & SEG_DISP32) {
+            } else if (o->segment & SEG_DISP32) {
 		    char *prefix = "";
-		    int32_t offset = ins.oprs[i].offset;
+		    int32_t offset = offs;
 		    if (offset < 0) {
 			offset = -offset;
 			prefix = "-";
diff --git a/insns.pl b/insns.pl
index 30f59c65..6e961dec 100644
--- a/insns.pl
+++ b/insns.pl
@@ -102,10 +102,11 @@ if ( !defined($output) || $output eq 'd' ) {
     print D "\n";
     
     print D "static const struct itemplate instrux[] = {\n";
+    $n = 0;
     foreach $j (@big) {
-	print D "    $j\n";
+	printf D "    /* %4d */ %s\n", $n++, $j;
     }
-	print D "    ITEMPLATE_END\n};\n\n";
+    print D "    ITEMPLATE_END\n};\n\n";
     
     for ($c=0; $c<256; $c++) {
 	$h = sprintf "%02X", $c;

From 24196047b52a410c281248796164187c275cf768 Mon Sep 17 00:00:00 2001
From: Frank Kotler <fbk@Reltok1.comcast.net>
Date: Tue, 18 Sep 2007 02:06:09 -0400
Subject: [PATCH 08/29] add "const" to output/outdbg.c

apparently we missed that when updating "const" in other files
---
 output/outdbg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/output/outdbg.c b/output/outdbg.c
index 045ad371..0dda75a6 100644
--- a/output/outdbg.c
+++ b/output/outdbg.c
@@ -100,7 +100,7 @@ static void dbg_deflabel(char *name, int32_t segment, int32_t offset,
             is_global, special ? ": " : "", special);
 }
 
-static void dbg_out(int32_t segto, void *data, uint32_t type,
+static void dbg_out(int32_t segto, const void *data, uint32_t type,
                     int32_t segment, int32_t wrt)
 {
     int32_t realbytes = type & OUT_SIZMASK;

From 3ce3715fba17d5ad54278a42dd0fe75a5b0ebc8d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 12:23:21 -0700
Subject: [PATCH 09/29] SSE5 instruction table

Implement the full SSE5 instruction table.
---
 insns.dat | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/insns.dat b/insns.dat
index fcf0bec8..5214ee52 100644
--- a/insns.dat
+++ b/insns.dat
@@ -2022,6 +2022,24 @@ POPCNT		reg32,rm32		\321\333\2\x0F\xB8\110		NEHALEM
 POPCNT		reg64,rm32		\324\333\2\x0F\xB8\110		NEHALEM,X64
 
 ; AMD SSE5 instructions
+
+; Four operands with DREX
+FMADDPS		xmmreg,=0,xmmreg,xmmrm	\160\2\x0F\x24\170\132		SSE5,AMD
+FMADDPS		xmmreg,=0,xmmrm,xmmreg	\164\2\x0F\x24\170\123		SSE5,AMD
+FMADDPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x04\121		SSE5,AMD
+FMADDPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x04\112		SSE5,AMD
+FMADDPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x01\132		SSE5,AMD
+FMADDPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x01\123		SSE5,AMD
+FMADDPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x05\121		SSE5,AMD
+FMADDPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x05\112		SSE5,AMD
+FMADDSS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x02\132		SSE5,AMD
+FMADDSS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x02\123		SSE5,AMD
+FMADDSS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x06\121		SSE5,AMD
+FMADDSS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x06\112		SSE5,AMD
+FMADDSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x03\132		SSE5,AMD
+FMADDSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x03\123		SSE5,AMD
+FMADDSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x07\121		SSE5,AMD
+FMADDSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x07\112		SSE5,AMD
 FMSUBPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x08\132		SSE5,AMD
 FMSUBPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x08\123		SSE5,AMD
 FMSUBPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0C\121		SSE5,AMD
@@ -2038,3 +2056,133 @@ FMSUBSD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x0B\132		SSE5,AMD
 FMSUBSD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x0B\123		SSE5,AMD
 FMSUBSD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x0F\121		SSE5,AMD
 FMSUBSD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x0F\112		SSE5,AMD
+FMNADDPS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x10\132		SSE5,AMD
+FMNADDPS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x10\123		SSE5,AMD
+FMNADDPS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x14\121		SSE5,AMD
+FMNADDPS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x14\112		SSE5,AMD
+FMNADDPD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x11\132		SSE5,AMD
+FMNADDPD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x11\123		SSE5,AMD
+FMNADDPD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x15\121		SSE5,AMD
+FMNADDPD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x15\112		SSE5,AMD
+FMNADDSS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x12\132		SSE5,AMD
+FMNADDSS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x12\123		SSE5,AMD
+FMNADDSS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x16\121		SSE5,AMD
+FMNADDSS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x16\112		SSE5,AMD
+FMNADDSD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x13\132		SSE5,AMD
+FMNADDSD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x13\123		SSE5,AMD
+FMNADDSD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x17\121		SSE5,AMD
+FMNADDSD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x17\112		SSE5,AMD
+FMNSUBPS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x18\132		SSE5,AMD
+FMNSUBPS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x18\123		SSE5,AMD
+FMNSUBPS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1C\121		SSE5,AMD
+FMNSUBPS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1C\112		SSE5,AMD
+FMNSUBPD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x19\132		SSE5,AMD
+FMNSUBPD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x19\123		SSE5,AMD
+FMNSUBPD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1D\121		SSE5,AMD
+FMNSUBPD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1D\112		SSE5,AMD
+FMNSUBSS	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x1A\132		SSE5,AMD
+FMNSUBSS	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x1A\123		SSE5,AMD
+FMNSUBSS	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1E\121		SSE5,AMD
+FMNSUBSS	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1E\112		SSE5,AMD
+FMNSUBSD	xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x1B\132		SSE5,AMD
+FMNSUBSD	xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x1B\123		SSE5,AMD
+FMNSUBSD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x1F\121		SSE5,AMD
+FMNSUBSD	xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x1F\112		SSE5,AMD
+COMPS		xmmreg,xmmreg,xmmrm,imm \160\3\x0F\x25\x2C\121\27	SSE5,AMD
+COMPD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2D\121\27	SSE5,AMD
+COMSS		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2E\121\27	SSE5,AMD
+COMSD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x2F\121\27	SSE5,AMD
+PCOMB		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4C\121\27	SSE5,AMD
+PCOMW		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4D\121\27	SSE5,AMD
+PCOMD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4E\121\27	SSE5,AMD
+PCOMQ		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x4F\121\27	SSE5,AMD
+PCOMUB		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6C\121\27	SSE5,AMD
+PCOMUW		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6D\121\27	SSE5,AMD
+PCOMUD		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6E\121\27	SSE5,AMD
+PCOMUQ		xmmreg,xmmreg,xmmrm,imm	\160\3\x0F\x25\x6F\121\27	SSE5,AMD
+PERMPS		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x20\132		SSE5,AMD
+PERMPS		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x20\123		SSE5,AMD
+PERMPS		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x24\121		SSE5,AMD
+PERMPS		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x24\112		SSE5,AMD
+PERMPD		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x21\132		SSE5,AMD
+PERMPD		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x21\123		SSE5,AMD
+PERMPD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x25\121		SSE5,AMD
+PERMPD		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x25\112		SSE5,AMD
+PCMOV		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x22\132		SSE5,AMD
+PCMOV		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x22\123		SSE5,AMD
+PCMOV		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x26\121		SSE5,AMD
+PCMOV		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x26\112		SSE5,AMD
+PPERM		xmmreg,=0,xmmreg,xmmrm	\160\3\x0F\x24\x23\132		SSE5,AMD
+PPERM		xmmreg,=0,xmmrm,xmmreg	\164\3\x0F\x24\x23\123		SSE5,AMD
+PPERM		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x27\121		SSE5,AMD
+PPERM		xmmreg,xmmrm,xmmreg,=0	\164\3\x0F\x24\x27\112		SSE5,AMD
+PMACSSWW	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x85\121		SSE5,AMD
+PMACSWW		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x95\121		SSE5,AMD
+PMACSSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x86\121		SSE5,AMD
+PMACSWD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x96\121		SSE5,AMD
+PMACSSDD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x8E\121		SSE5,AMD
+PMACSDD		xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x9E\121		SSE5,AMD
+PMACSSDQL	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x87\121		SSE5,AMD
+PMACSDQL	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x97\121		SSE5,AMD
+PMACSSDQH	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x8F\121		SSE5,AMD
+PMACSDQH	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\x9F\121		SSE5,AMD
+PMADCSSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\xA6\121		SSE5,AMD
+PMADCSWD	xmmreg,xmmreg,xmmrm,=0	\160\3\x0F\x24\xB6\121		SSE5,AMD
+
+; Three operands with DREX
+PROTB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x40\121		SSE5,AMD
+PROTB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x40\112		SSE5,AMD
+PROTW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x41\121		SSE5,AMD
+PROTW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x41\112		SSE5,AMD
+PROTD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x42\121		SSE5,AMD
+PROTD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x42\112		SSE5,AMD
+PROTQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x43\121		SSE5,AMD
+PROTQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x43\112		SSE5,AMD
+PSHLB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x44\121		SSE5,AMD
+PSHLB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x44\112		SSE5,AMD
+PSHLW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x45\121		SSE5,AMD
+PSHLW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x45\112		SSE5,AMD
+PSHLD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x46\121		SSE5,AMD
+PSHLD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x46\112		SSE5,AMD
+PSHLQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x47\121		SSE5,AMD
+PSHLQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x47\112		SSE5,AMD
+PSHAB		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x48\121		SSE5,AMD
+PSHAB		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x48\112		SSE5,AMD
+PSHAW		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x49\121		SSE5,AMD
+PSHAW		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x49\112		SSE5,AMD
+PSHAD		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x4A\121		SSE5,AMD
+PSHAD		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x4A\112		SSE5,AMD
+PSHAQ		xmmreg,xmmreg,xmmrm	\160\3\x0F\x24\x4B\121		SSE5,AMD
+PSHAQ		xmmreg,xmmrm,xmmreg	\164\3\x0F\x24\x4B\112		SSE5,AMD
+
+; Non-DREX
+FRCZPS		xmmreg,xmmrm		\3\x0F\x7A\x10\110		SSE5,AMD
+FRCZPD		xmmreg,xmmrm		\3\x0F\x7A\x11\110		SSE5,AMD
+FRCZSS		xmmreg,xmmrm		\3\x0F\x7A\x12\110		SSE5,AMD
+FRCZSD		xmmreg,xmmrm		\3\x0F\x7A\x13\110		SSE5,AMD
+CVTPH2PS	xmmreg,xmmrm		\3\x0F\x7A\x30\110		SSE5,AMD,SQ
+CVTPS2PH	xmmrm,xmmreg		\3\x0F\x7A\x31\101		SSE5,AMD,SQ
+PHADDBW		xmmreg,xmmrm		\3\x0F\x7A\x41\110		SSE5,AMD
+PHADDBD		xmmreg,xmmrm		\3\x0F\x7A\x42\110		SSE5,AMD
+PHADDBQ		xmmreg,xmmrm		\3\x0F\x7A\x43\110		SSE5,AMD
+PHADDWD		xmmreg,xmmrm		\3\x0F\x7A\x46\110		SSE5,AMD
+PHADDWQ		xmmreg,xmmrm		\3\x0F\x7A\x47\110		SSE5,AMD
+PHADDDQ		xmmreg,xmmrm		\3\x0F\x7A\x4B\110		SSE5,AMD
+PHADDUBW	xmmreg,xmmrm		\3\x0F\x7A\x51\110		SSE5,AMD
+PHADDUBD	xmmreg,xmmrm		\3\x0F\x7A\x52\110		SSE5,AMD
+PHADDUBQ	xmmreg,xmmrm		\3\x0F\x7A\x53\110		SSE5,AMD
+PHADDUWD	xmmreg,xmmrm		\3\x0F\x7A\x56\110		SSE5,AMD
+PHADDUWQ	xmmreg,xmmrm		\3\x0F\x7A\x57\110		SSE5,AMD
+PHADDUDQ	xmmreg,xmmrm		\3\x0F\x7A\x5B\110		SSE5,AMD
+PHSUBBW		xmmreg,xmmrm		\3\x0F\x7A\x61\110		SSE5,AMD
+PHSUBWD		xmmreg,xmmrm		\3\x0F\x7A\x62\110		SSE5,AMD
+PHSUBDQ		xmmreg,xmmrm		\3\x0F\x7A\x63\110		SSE5,AMD
+PROTB		xmmreg,xmmrm,imm	\3\x0F\x7B\x40\110\26		SSE5,AMD
+PROTW		xmmreg,xmmrm,imm	\3\x0F\x7B\x41\110\26		SSE5,AMD
+PROTD		xmmreg,xmmrm,imm	\3\x0F\x7B\x42\110\26		SSE5,AMD
+PROTQ		xmmreg,xmmrm,imm	\3\x0F\x7B\x43\110\26		SSE5,AMD
+PTEST		xmmreg,xmmrm		\366\3\x0F\x38\x17\110		SSE5,AMD
+ROUNDPS		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x08\110\26	SSE5,AMD
+ROUNDPD		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x09\110\26	SSE5,AMD
+ROUNDSS		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x0A\110\26	SSE5,AMD
+ROUNDSD		xmmreg,xmmrm,imm	\366\3\x0F\x3A\x0B\110\26	SSE5,AMD

From 5255fd1f36eece1cbf4000ffc3120dbcb9bf5038 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 12:38:07 -0700
Subject: [PATCH 10/29] Change the token prehash function for better
 convergence

Combining arithmetic (add) and bitwise (xor) mixing seems to give
better results than either alone.

With the new prehash function, we find a valid hash much quicker.
---
 perllib/phash.ph | 4 ++--
 pptok.pl         | 4 ++--
 tokhash.pl       | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/perllib/phash.ph b/perllib/phash.ph
index 60334272..3bb3a05b 100644
--- a/perllib/phash.ph
+++ b/perllib/phash.ph
@@ -42,8 +42,8 @@ sub prehash($$$) {
 
     foreach $c (unpack("C*", $key)) {
 	$ko1 = $k1;  $ko2 = $k2;
-	$k1 = int32(rot($ko1,$s0)-rot($ko2, $s1)+$c);
-	$k2 = int32(rot($ko2,$s2)-rot($ko1, $s3)+$c);
+	$k1 = int32(rot($ko1,$s0)^int32(rot($ko2, $s1)+$c));
+	$k2 = int32(rot($ko2,$s2)^int32(rot($ko1, $s3)+$c));
     }
 
     # Create a bipartite graph...
diff --git a/pptok.pl b/pptok.pl
index a0425b7c..a835bf3e 100755
--- a/pptok.pl
+++ b/pptok.pl
@@ -191,8 +191,8 @@ if ($what eq 'c') {
     print OUT  "    while ((c = *p++) != 0) {\n";
     print OUT  "        uint32_t kn1, kn2;\n";
     print OUT  "        c |= 0x20; /* convert to lower case */\n";
-    printf OUT "        kn1 = rot(k1,%2d) - rot(k2,%2d) + c;\n", ${$sv}[0], ${$sv}[1];
-    printf OUT "        kn2 = rot(k2,%2d) - rot(k1,%2d) + c;\n", ${$sv}[2], ${$sv}[3];
+    printf OUT "        kn1 = rot(k1,%2d)^(rot(k2,%2d) + c);\n", ${$sv}[0], ${$sv}[1];
+    printf OUT "        kn2 = rot(k2,%2d)^(rot(k1,%2d) + c);\n", ${$sv}[2], ${$sv}[3];
     print OUT  "        k1 = kn1; k2 = kn2;\n";
     print OUT  "    }\n";
     print OUT  "\n";
diff --git a/tokhash.pl b/tokhash.pl
index 5f1a9f4c..9d5888be 100755
--- a/tokhash.pl
+++ b/tokhash.pl
@@ -187,8 +187,8 @@ print  "    const char *p = token;\n";
 print  "\n";
 
 print  "    while ((c = *p++) != 0) {\n";
-printf "        uint32_t kn1 = rot(k1,%2d) - rot(k2,%2d) + c;\n", ${$sv}[0], ${$sv}[1];
-printf "        uint32_t kn2 = rot(k2,%2d) - rot(k1,%2d) + c;\n", ${$sv}[2], ${$sv}[3];
+printf "        uint32_t kn1 = rot(k1,%2d)^(rot(k2,%2d) + c);\n", ${$sv}[0], ${$sv}[1];
+printf "        uint32_t kn2 = rot(k2,%2d)^(rot(k1,%2d) + c);\n", ${$sv}[2], ${$sv}[3];
 print  "        k1 = kn1; k2 = kn2;\n";
 print  "    }\n";
 print  "\n";

From 41c9f6fde06091199f1a95e0c045230baaa25bf4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 13:01:32 -0700
Subject: [PATCH 11/29] Implement "oword" (128 bits) as a first-class size

Implement oword, reso, do, as well as the SO flag to instructions.  No
instructions are actually flagged with SO yet, but this allows us to
specify 128-bit sizes in instruction patterns.
---
 assemble.c |  7 +++++++
 insns.dat  | 26 ++++++++++++++++----------
 insns.h    |  1 +
 nasm.h     | 10 ++++++----
 parser.c   | 35 +++++++++++++++++++++++++----------
 tokens.dat |  1 +
 6 files changed, 56 insertions(+), 24 deletions(-)

diff --git a/assemble.c b/assemble.c
index ec3b1124..e5384548 100644
--- a/assemble.c
+++ b/assemble.c
@@ -1720,6 +1720,9 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
 	case IF_SQ:
             size[i] = BITS64;
 	    break;
+	case IF_SO:
+	    size[i] = BITS128;
+	    break;
 	default:
 	    break;
         }
@@ -1742,6 +1745,10 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits)
             asize = BITS64;
             oprs = itemp->operands;
 	    break;
+	case IF_SO:
+            asize = BITS128;
+            oprs = itemp->operands;
+	    break;
 	default:
 	    break;
         }
diff --git a/insns.dat b/insns.dat
index 5214ee52..f95b157e 100644
--- a/insns.dat
+++ b/insns.dat
@@ -14,6 +14,22 @@
 ; see the comment at the top of assemble.c.  For a detailed description
 ; of the flags (fourth field), please see insns.h.
 ;
+
+; Special instructions...
+DB        ignore              ignore                        ignore
+DW        ignore              ignore                        ignore
+DD        ignore              ignore                        ignore
+DQ        ignore              ignore                        ignore
+DT        ignore              ignore                        ignore
+DO        ignore              ignore                        ignore
+RESB      imm                 \340                          8086
+RESW      ignore              ignore                        ignore
+RESD      ignore              ignore                        ignore
+RESQ      ignore              ignore                        ignore
+REST      ignore              ignore                        ignore
+RESO      ignore              ignore                        ignore
+
+; Conventional instructions
 AAA       void                \1\x37                        8086,NOLONG
 AAD       void                \2\xD5\x0A                    8086,NOLONG
 AAD       imm                 \1\xD5\24                     8086,SB,NOLONG
@@ -270,8 +286,6 @@ CWD       void                \320\1\x99                    8086
 CWDE      void                \321\1\x98                    386
 DAA       void                \1\x27                        8086,NOLONG
 DAS       void                \1\x2F                        8086,NOLONG
-DB        ignore              ignore                        ignore
-DD        ignore              ignore                        ignore
 DEC       reg16               \320\10\x48                   8086,NOLONG
 DEC       reg32               \321\10\x48                   386,NOLONG
 DEC       rm8                 \300\1\xFE\201                8086
@@ -282,9 +296,6 @@ DIV       rm8                 \300\1\xF6\206                8086
 DIV       rm16                \320\300\1\xF7\206            8086
 DIV       rm32                \321\300\1\xF7\206            386
 DIV       rm64                \324\300\1\xF7\206            X64
-DQ        ignore              ignore                        ignore
-DT        ignore              ignore                        ignore
-DW        ignore              ignore                        ignore
 EMMS      void                \2\x0F\x77                    PENT,MMX
 ENTER     imm,imm             \1\xC8\30\25                  186
 EQU       imm                 \0                            8086
@@ -1029,11 +1040,6 @@ RDMSR     void                \2\x0F\x32                    PENT,PRIV
 RDPMC     void                \2\x0F\x33                    P6
 RDTSC     void                \2\x0F\x31                    PENT
 RDTSCP    void                \3\x0F\x01\xF9                X64
-RESB      imm                 \340                          8086
-RESD      ignore              ignore                        ignore
-RESQ      ignore              ignore                        ignore
-REST      ignore              ignore                        ignore
-RESW      ignore              ignore                        ignore
 RET       void                \1\xC3                        8086
 RET       imm                 \1\xC2\30                     8086,SW
 RETF      void                \1\xCB                        8086
diff --git a/insns.h b/insns.h
index c7fa75a0..b5d6caf7 100644
--- a/insns.h
+++ b/insns.h
@@ -68,6 +68,7 @@ extern const struct itemplate * const * const itable[];
 #define IF_SW     0x00000008UL  /* unsized operands can't be non-word */
 #define IF_SD     0x0000000CUL  /* unsized operands can't be non-dword */
 #define IF_SQ     0x00000010UL  /* unsized operands can't be non-qword */
+#define IF_SO     0x00000014UL  /* unsized operands can't be non-oword */
 #define IF_SMASK  0x0000001CUL  /* mask for unsized argument size */
 #define IF_AR0	  0x00000020UL  /* SB, SW, SD applies to argument 0 */
 #define IF_AR1	  0x00000040UL  /* SB, SW, SD applies to argument 1 */
diff --git a/nasm.h b/nasm.h
index f5d64946..f4afad36 100644
--- a/nasm.h
+++ b/nasm.h
@@ -375,7 +375,7 @@ enum {
  *
  * The bits are assigned as follows:
  *
- * Bits 0-7: sizes
+ * Bits 0-7, 29: sizes
  *  0:  8 bits (BYTE)
  *  1: 16 bits (WORD)
  *  2: 32 bits (DWORD)
@@ -384,6 +384,7 @@ enum {
  *  5: FAR
  *  6: NEAR
  *  7: SHORT
+ * 29: 128 bits (OWORD)
  *
  * Bits 8-11 modifiers
  *  8: TO
@@ -454,12 +455,13 @@ typedef uint32_t opflags_t;
 #define BITS32    	0x00000004L
 #define BITS64    	0x00000008L   /* x64 and FPU only */
 #define BITS80    	0x00000010L   /* FPU only */
+#define BITS128		0x20000000L
 #define FAR       	0x00000020L   /* grotty: this means 16:16 or */
                                        /* 16:32, like in CALL/JMP */
 #define NEAR      	0x00000040L
 #define SHORT     	0x00000080L   /* and this means what it says :) */
 
-#define SIZE_MASK 	0x000000FFL   /* all the size attributes */
+#define SIZE_MASK 	0x200000FFL   /* all the size attributes */
 
 /* Modifiers */
 #define MODIFIER_MASK	0x00000f00L
@@ -959,8 +961,8 @@ struct dfmt {
  */
 
 enum special_tokens {
-    S_ABS, S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_NOSPLIT, S_QWORD, S_REL,
-    S_SHORT, S_STRICT, S_TO, S_TWORD, S_WORD
+    S_ABS, S_BYTE, S_DWORD, S_FAR, S_LONG, S_NEAR, S_NOSPLIT,
+    S_OWORD, S_QWORD, S_REL, S_SHORT, S_STRICT, S_TO, S_TWORD, S_WORD
 };
 
 /*
diff --git a/parser.c b/parser.c
index 16164d77..ca12a097 100644
--- a/parser.c
+++ b/parser.c
@@ -175,23 +175,25 @@ insn *parse_line(int pass, char *buffer, insn * result,
      * For the moment, EQU has the same difficulty, so we'll
      * include that.
      */
-    if (result->opcode == I_RESB || result->opcode == I_RESW || result->opcode == I_RESD || result->opcode == I_RESQ || result->opcode == I_REST || result->opcode == I_EQU || result->opcode == I_INCBIN) {    /* fbk */
+    if (result->opcode == I_RESB || result->opcode == I_RESW ||
+	result->opcode == I_RESD || result->opcode == I_RESQ ||
+	result->opcode == I_REST || result->opcode == I_RESO ||
+	result->opcode == I_EQU || result->opcode == I_INCBIN) {
         critical = pass0;
     } else
         critical = (pass == 2 ? 2 : 0);
 
-    if (result->opcode == I_DB ||
-        result->opcode == I_DW ||
-        result->opcode == I_DD ||
-        result->opcode == I_DQ ||
-        result->opcode == I_DT || result->opcode == I_INCBIN) {
+    if (result->opcode == I_DB || result->opcode == I_DW ||
+        result->opcode == I_DD || result->opcode == I_DQ ||
+        result->opcode == I_DT || result->opcode == I_DO ||
+	result->opcode == I_INCBIN) {
         extop *eop, **tail = &result->eops, **fixptr;
         int oper_num = 0;
 
         result->eops_float = FALSE;
 
         /*
-         * Begin to read the DB/DW/DD/DQ/DT/INCBIN operands.
+         * Begin to read the DB/DW/DD/DQ/DT/DO/INCBIN operands.
          */
         while (1) {
             i = stdscan(NULL, &tokval);
@@ -234,6 +236,8 @@ insn *parse_line(int pass, char *buffer, insn * result,
                         eop->stringlen = 8;
                     else if (result->opcode == I_DT)
                         eop->stringlen = 10;
+		    else if (result->opcode == I_DO)
+			eop->stringlen = 16;
                     else {
                         error(ERR_NONFATAL, "floating-point constant"
                               " encountered in `D%c' instruction",
@@ -245,8 +249,7 @@ insn *parse_line(int pass, char *buffer, insn * result,
                          */
                         eop->stringlen = 0;
                     }
-                    eop =
-                        nasm_realloc(eop, sizeof(extop) + eop->stringlen);
+                    eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
                     tail = &eop->next;
                     *fixptr = eop;
                     eop->stringval = (char *)eop + sizeof(extop);
@@ -384,6 +387,11 @@ insn *parse_line(int pass, char *buffer, insn * result,
                     result->oprs[operand].type |= BITS80;
                 setsize = 1;
                 break;
+            case S_OWORD:
+                if (!setsize)
+                    result->oprs[operand].type |= BITS128;
+                setsize = 1;
+                break;
             case S_TO:
                 result->oprs[operand].type |= TO;
                 break;
@@ -440,6 +448,9 @@ insn *parse_line(int pass, char *buffer, insn * result,
                     case S_TWORD:
                         result->oprs[operand].type |= BITS80;
                         break;
+                    case S_OWORD:
+                        result->oprs[operand].type |= BITS128;
+                        break;
                     default:
                         error(ERR_NONFATAL,
                               "invalid operand size specification");
@@ -751,7 +762,7 @@ insn *parse_line(int pass, char *buffer, insn * result,
         result->oprs[operand++].type = 0;
 
     /*
-     * Transform RESW, RESD, RESQ, REST into RESB.
+     * Transform RESW, RESD, RESQ, REST, RESO into RESB.
      */
     switch (result->opcode) {
     case I_RESW:
@@ -770,6 +781,10 @@ insn *parse_line(int pass, char *buffer, insn * result,
         result->opcode = I_RESB;
         result->oprs[0].offset *= 10;
         break;
+    case I_RESO:
+        result->opcode = I_RESB;
+        result->oprs[0].offset *= 16;
+        break;
     default:
 	break;
     }
diff --git a/tokens.dat b/tokens.dat
index 6acaba49..c84b8fb3 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -23,6 +23,7 @@ far
 long
 near
 nosplit
+oword
 qword
 rel
 short

From 0edc309505e659345cf353f81fb77793f8f5c291 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 13:45:12 -0700
Subject: [PATCH 12/29] Document oword, do and reso

Document oword and the associated do and reso pseudoinstructions.
---
 doc/nasmdoc.src | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 13ae013d..2530b2b5 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -1115,19 +1115,19 @@ indicate what size of \i{memory operand} it refers to.
 \H{pseudop} \i{Pseudo-Instructions}
 
 Pseudo-instructions are things which, though not real x86 machine
-instructions, are used in the instruction field anyway because
-that's the most convenient place to put them. The current
-pseudo-instructions are \i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and
-\i\c{DT}, their \i{uninitialized} counterparts \i\c{RESB},
-\i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST}, the \i\c{INCBIN}
+instructions, are used in the instruction field anyway because that's
+the most convenient place to put them. The current pseudo-instructions
+are \i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT} and \i\c{DO};
+their \i{uninitialized} counterparts \i\c{RESB}, \i\c{RESW},
+\i\c{RESD}, \i\c{RESQ}, \i\c{REST} and \i\c{RESO}; the \i\c{INCBIN}
 command, the \i\c{EQU} command, and the \i\c{TIMES} prefix.
 
 
 \S{db} \c{DB} and friends: Declaring initialized Data
 
-\i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ} and \i\c{DT} are used, much
-as in MASM, to declare initialized data in the output file. They can
-be invoked in a wide range of ways:
+\i\c{DB}, \i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT} and \i\c{DO} are
+used, much as in MASM, to declare initialized data in the output
+file. They can be invoked in a wide range of ways:
 \I{floating-point}\I{character constant}\I{string constant}
 
 \c       db    0x55                ; just the byte 0x55
@@ -1144,20 +1144,20 @@ be invoked in a wide range of ways:
 \c       dq    1.234567e20         ; double-precision float
 \c       dt    1.234567e20         ; extended-precision float
 
-\c{DT} does not accept \i{numeric constants} as operands.
+\c{DT} and \c{DO} do not accept \i{numeric constants} as operands.
 
 
 \S{resb} \c{RESB} and friends: Declaring \i{Uninitialized} Data
 
-\i\c{RESB}, \i\c{RESW}, \i\c{RESD}, \i\c{RESQ} and \i\c{REST} are
-designed to be used in the BSS section of a module: they declare
-\e{uninitialized} storage space. Each takes a single operand, which
-is the number of bytes, words, doublewords or whatever to reserve.
-As stated in \k{qsother}, NASM does not support the MASM/TASM syntax
-of reserving uninitialized space by writing \I\c{?}\c{DW ?} or
-similar things: this is what it does instead. The operand to a
-\c{RESB}-type pseudo-instruction is a \i\e{critical expression}: see
-\k{crit}.
+\i\c{RESB}, \i\c{RESW}, \i\c{RESD}, \i\c{RESQ}, \i\c{REST} and
+\i\c{RESO} are designed to be used in the BSS section of a module:
+they declare \e{uninitialized} storage space. Each takes a single
+operand, which is the number of bytes, words, doublewords or whatever
+to reserve.  As stated in \k{qsother}, NASM does not support the
+MASM/TASM syntax of reserving uninitialized space by writing
+\I\c{?}\c{DW ?} or similar things: this is what it does instead. The
+operand to a \c{RESB}-type pseudo-instruction is a \i\e{critical
+expression}: see \k{crit}.
 
 For example:
 
@@ -1560,11 +1560,11 @@ invent one using the macro processor.
 
 When assembling with the optimizer set to level 2 or higher (see
 \k{opt-On}), NASM will use size specifiers (\c{BYTE}, \c{WORD},
-\c{DWORD}, \c{QWORD}, or \c{TWORD}), but will give them the smallest
-possible size. The keyword \c{STRICT} can be used to inhibit
+\c{DWORD}, \c{QWORD}, \c{TWORD} or \c{OWORD}), but will give them the
+smallest possible size. The keyword \c{STRICT} can be used to inhibit
 optimization and force a particular operand to be emitted in the
-specified size. For example, with the optimizer on, and in
-\c{BITS 16} mode,
+specified size. For example, with the optimizer on, and in \c{BITS 16}
+mode,
 
 \c         push dword 33
 

From 19e201053689be68d0e45077fa86e9538d74daa1 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 15:08:20 -0700
Subject: [PATCH 13/29] Speed up the disassembler by allowing prefixed
 instruction tables

Modify the disassembler so that we can have separate instruction
tables for prefixed instructions.  As it was, all instructions which
started with 0F were linearly searched, and that is by now more than
half the instruction set.
---
 disasm.c |  13 ++++-
 insns.h  |  11 +++-
 insns.pl | 151 +++++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 139 insertions(+), 36 deletions(-)

diff --git a/disasm.c b/disasm.c
index 3a8f710d..a6c1c729 100644
--- a/disasm.c
+++ b/disasm.c
@@ -671,9 +671,11 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
             int32_t offset, int autosync, uint32_t prefer)
 {
     const struct itemplate * const *p, * const *best_p;
+    const struct disasm_index *ix;
+    uint8_t *dp;
     int length, best_length = 0;
     char *segover;
-    int i, slen, colon;
+    int i, slen, colon, n;
     uint8_t *origdata;
     int works;
     insn tmp_ins, ins;
@@ -728,7 +730,14 @@ int32_t disasm(uint8_t *data, char *output, int outbufsize, int segsize,
     best_p = NULL;
     best_pref = INT_MAX;
 
-    for (p = itable[*data]; *p; p++) {
+    dp = data;
+    ix = itable + *dp++;
+    while (ix->n == (size_t)-1) {
+	ix = (const struct disasm_index *)ix->p + *dp++;
+    }
+
+    p = (const struct itemplate * const *)ix->p;
+    for (n = ix->n; n; n--, p++) {
         if ((length = matches(*p, data, &prefix, segsize, &tmp_ins))) {
             works = TRUE;
             /*
diff --git a/insns.h b/insns.h
index b5d6caf7..b025c7a5 100644
--- a/insns.h
+++ b/insns.h
@@ -26,9 +26,18 @@ struct itemplate {
     uint32_t flags;		/* some flags */
 };
 
+/* Disassembler table structure */
+/* If n == -1, then p points to another table of 256
+   struct disasm_index; otherwise p points to an array of n
+   pointers to struct itemplate to consider. */
+struct disasm_index {
+    const void *p;		/* sub-table, or array of template pointers */
+    int n;			/* -1 for a sub-table, else entry count at p */
+};
+
 /* Tables for the assembler and disassembler, respectively */
 extern const struct itemplate * const nasm_instructions[];
-extern const struct itemplate * const * const itable[];
+extern const struct disasm_index itable[256];
 
 /*
  * this define is used to signify the end of an itemplate
diff --git a/insns.pl b/insns.pl
index 6e961dec..c5f280c6 100644
--- a/insns.pl
+++ b/insns.pl
@@ -7,6 +7,10 @@
 # redistributable under the licence given in the file "Licence"
 # distributed in the NASM archive.
 
+# Opcode prefixes which need their own opcode tables
+# LONGER PREFIXES FIRST!
+@disasm_prefixes = qw(0F0F 0F24 0F25 0F38 0F3A 0F7A 0FC2 0F);
+
 print STDERR "Reading insns.dat...\n";
 
 @args   = ();
@@ -26,6 +30,8 @@ foreach $arg ( @ARGV ) {
 $fname = "insns.dat" unless $fname = $args[0];
 open (F, $fname) || die "unable to open $fname";
 
+%dinstables = ();
+
 $line = 0;
 $insns = 0;
 while (<F>) {
@@ -50,9 +56,11 @@ while (<F>) {
   }
   if ($formatted && !$nd) {
     push @big, $formatted;
-    foreach $i (&startbyte($_[2])) {
-      $aname = sprintf "dd_%02X",$i;
-      push @$aname, $#big;
+    foreach $i (startseq($_[2])) {
+	if (!defined($dinstables{$i})) {
+	    $dinstables{$i} = [];
+	}
+	push(@{$dinstables{$i}}, $#big);
     }
   }
 }
@@ -106,23 +114,38 @@ if ( !defined($output) || $output eq 'd' ) {
     foreach $j (@big) {
 	printf D "    /* %4d */ %s\n", $n++, $j;
     }
-    print D "    ITEMPLATE_END\n};\n\n";
-    
-    for ($c=0; $c<256; $c++) {
-	$h = sprintf "%02X", $c;
-	print D "static const struct itemplate * const itable_${h}[] = {\n";
-	$aname = "dd_$h";
-	foreach $j (@$aname) {
+    print D "};\n";
+
+    foreach $h (sort(keys(%dinstables))) {
+	print D "\nstatic const struct itemplate * const itable_${h}[] = {\n";
+	foreach $j (@{$dinstables{$h}}) {
 	    print D "    instrux + $j,\n";
 	}
-	print D "    NULL\n};\n\n";
-    }
-    
-    print D "const struct itemplate * const * const itable[] = {\n";
-    for ($c=0; $c<256; $c++) {
-	printf D "    itable_%02X,\n", $c;
+	print D "};\n";
     }
+
+    foreach $h (@disasm_prefixes, '') {
+	$is_prefix{$h} = 1;
+	print D "\n";
+	print D "static " unless ($h eq '');
+	print D "const struct disasm_index ";
+	print D ($h eq '') ? 'itable' : "itable_$h";
+	print D "[256] = {\n";
+	for ($c = 0; $c < 256; $c++) {
+	    $nn = sprintf("%s%02X", $h, $c);
+	    if ($is_prefix{$nn}) {
+		die "$0: ambiguous decoding of $nn\n"
+		    if (defined($dinstables{$nn}));
+		printf D "    { itable_%s, -1 },\n", $nn;
+	    } elsif (defined($dinstables{$nn})) {
+		printf D "    { itable_%s, %u },\n",
+	    	$nn, scalar(@{$dinstables{$nn}});
+	    } else {
+		printf D "    { NULL, 0 },\n";
+	    }
+	}
     print D "};\n";
+    }
     
     close D;
 }
@@ -240,6 +263,17 @@ sub format {
     ("{I_$opcode, $num, {$operands}, \"$codes\", $flags},", $nd);
 }
 
+sub hexlist($$$) {
+    my($prefix, $start, $n) = @_;
+    my $i;
+    my @l = ();
+    # Return $n strings: $prefix . "%02X" of $start, $start+1, ...
+    for ($i = 0; $i < $n; $i++) {
+	push(@l, sprintf("%s%02X", $prefix, $start+$i));
+    }
+    return @l;
+}
+
 # Here we determine the range of possible starting bytes for a given
 # instruction. We need only consider the codes:
 # \1 \2 \3     mean literal bytes, of course
@@ -248,24 +282,75 @@ sub format {
 # \170         means byte zero
 # \330         means byte plus condition code
 # \0 or \340   mean give up and return empty set
-sub startbyte {
-  my ($codes) = @_;
+sub startseq($) {
+  my ($codestr) = @_;
   my $word, @range;
+  my @codes = ();
+  my $c = $codestr;
+  my $c0, $c1, $i;
+  my $prefix = '';
 
-  while (1) {
-    die "couldn't get code in '$codes'" if $codes !~ /^(\\[^\\]+)(\\.*)?$/;
-    $word = $1, $codes = $2;
-    return (hex $1) if $word =~ /^\\[123]$/ && $codes =~ /^\\x(..)/;
-    return (0x07, 0x17, 0x1F) if $word eq "\\4";
-    return (0xA1, 0xA9) if $word eq "\\5";
-    return (0x06, 0x0E, 0x16, 0x1E) if $word eq "\\6";
-    return (0xA0, 0xA8) if $word eq "\\7";
-    $start=hex $1, $r=8, last if $word =~ /^\\1[0123]$/ && $codes =~/^\\x(..)/;
-    return (0) if $word eq "\\170";
-    $start=hex $1, $r=16, last if $word =~ /^\\330$/ && $codes =~ /^\\x(..)/;
-    return () if $word eq "\\0" || $word eq "\\340";
+  # Although these are C-syntax strings, by convention they should have
+  # only octal escapes (for directives) and hexadecimal escapes
+  # (for verbatim bytes)
+  while ($c ne '') {
+      if ($c =~ /^\\x([0-9a-f]+)(.*)$/i) {
+	  push(@codes, hex $1);
+	  $c = $2;
+	  next;
+      } elsif ($c =~ /^\\([0-7]{1,3})(.*)$/) {
+	  push(@codes, oct $1);
+	  $c = $2;
+	  next;
+      } else {
+	  die "$0: unknown code format in \"$codestr\"\n";
+      }
   }
-  @range = ();
-  push @range, $start++ while ($r-- > 0);
-  @range;
+
+  while ($c0 = shift(@codes)) {
+      $c1 = $codes[0];
+      if ($c0 == 01 || $c0 == 02 || $c0 == 03 || $c0 == 0170) {
+	  # Fixed byte string
+	  my $fbs = $prefix;
+	  while (1) {
+	      if ($c0 == 01 || $c0 == 02 || $c0 == 03) {
+		  while ($c0--) {
+		      $fbs .= sprintf("%02X", shift(@codes));
+		  }
+	      } elsif ($c0 == 0170) {
+		  $fbs .= '00';
+	      } else {
+		  last;
+	      }
+	      $c0 = shift(@codes);
+	  }
+
+	  foreach $pfx (@disasm_prefixes) {
+	      if ($fbs =~ /^$pfx(.*)$/) {
+		  $prefix = $pfx;
+		  $fbs = $1;
+		  last;
+	      }
+	  }
+
+	  if ($fbs ne '') {
+	      return ($prefix.substr($fbs,0,2));
+	  }
+      } elsif ($c0 == 04) {
+	  return ("07", "17", "1F");
+      } elsif ($c0 == 05) {
+	  return ("A1", "A9");
+      } elsif ($c0 == 06) {
+	  return ("06", "0E", "16", "1E");
+      } elsif ($c0 == 07) {
+	  return ("A0", "A8");
+      } elsif ($c0 >= 010 && $c0 <= 013) {
+	  return hexlist($prefix, $c1, 8);
+      } elsif ($c0 == 0330) {
+	  return hexlist($prefix, $c1, 16);
+      } elsif ($c0 == 0 || $c0 == 0340) {
+	  return ();
+      }
+  }
+  return ();
 }

From 76815bf60b5db3bb0f9711920562ea4afc3f5c85 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 15:24:38 -0700
Subject: [PATCH 14/29] Remove 0FC2 from list of instruction prefixes

0FC2 is not really an instruction prefix; it's the opcode for
CMPPS/CMPSS, which take a control immediate for which Intel chose to
define opcode aliases.  However, we can't dispatch on a tail byte, so
listing it as a prefix is useless.
---
 insns.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/insns.pl b/insns.pl
index c5f280c6..356c183d 100644
--- a/insns.pl
+++ b/insns.pl
@@ -9,7 +9,7 @@
 
 # Opcode prefixes which need their own opcode tables
 # LONGER PREFIXES FIRST!
-@disasm_prefixes = qw(0F0F 0F24 0F25 0F38 0F3A 0F7A 0FC2 0F);
+@disasm_prefixes = qw(0F0F 0F24 0F25 0F38 0F3A 0F7A 0F);
 
 print STDERR "Reading insns.dat...\n";
 

From 141d7cf68d60f6c77c078fea7ff85526db668c6f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 16:39:03 -0700
Subject: [PATCH 15/29] Support 16-bit IEEE floating point; used in SSE5

SSE5 supports standard IEEE 16-bit floating point, so we should
support that too.
---
 float.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 parser.c | 24 ++++++++++++-------
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/float.c b/float.c
index 099e23f2..afa84d2e 100644
--- a/float.c
+++ b/float.c
@@ -213,6 +213,7 @@ static int ieee_round(uint16_t *mant, int i)
 
 #define put(a,b) ( (*(a)=(b)), ((a)[1]=(b)>>8) )
 
+/* 64-bit format with 52-bit mantissa and 11-bit exponent */
 static int to_double(char *str, int32_t sign, uint8_t *result,
                      efunc error)
 {
@@ -275,6 +276,7 @@ static int to_double(char *str, int32_t sign, uint8_t *result,
     return 1;                   /* success */
 }
 
+/* 32-bit format with 23-bit mantissa and 8-bit exponent */
 static int to_float(char *str, int32_t sign, uint8_t *result,
                     efunc error)
 {
@@ -330,6 +332,64 @@ static int to_float(char *str, int32_t sign, uint8_t *result,
     return 1;
 }
 
+/* 16-bit format with 10-bit mantissa and 5-bit exponent.
+   Defined in IEEE 754r.  Used in SSE5.  See the AMD SSE5 manual, AMD
+   document number 43479. */
+static int to_float16(char *str, int32_t sign, uint8_t *result,
+		      efunc error)
+{
+    uint16_t mant[MANT_WORDS];
+    int32_t exponent;
+
+    sign = (sign < 0 ? 0x8000L : 0L);
+
+    ieee_flconvert(str, mant, &exponent, error);
+    if (mant[0] & 0x8000) {
+        /*
+         * Non-zero.
+         */
+        exponent--;
+        if (exponent >= -14 && exponent <= 16) {
+            /*
+             * Normalised.
+             */
+            exponent += 15;
+            ieee_shr(mant, 5);
+            ieee_round(mant, 1);
+            if (mant[0] & 0x800)        /* did we scale up by one? */
+                ieee_shr(mant, 1), exponent++;
+            mant[0] &= 0x3FF;    /* remove leading one */
+            put(result + 0, (exponent << 7) | mant[0] | sign);
+        } else if (exponent < -14 && exponent >= -24) {
+            /*
+             * Denormal.
+             */
+            int shift = -(exponent + 8);
+            int sh = shift % 16, wds = shift / 16;
+            ieee_shr(mant, sh);
+            if (ieee_round(mant, 1 - wds)
+                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
+                ieee_shr(mant, 1);
+                if (sh == 0)
+                    mant[0] |= 0x8000;
+                exponent++;
+            }
+            put(result + 0, (wds == 0 ? mant[0] : 0) | sign);
+        } else {
+            if (exponent > 0) {
+                error(ERR_NONFATAL, "overflow in floating-point constant");
+                return 0;
+            } else
+                memset(result, 0, 2);
+        }
+    } else {
+        memset(result, 0, 2);
+    }
+    return 1;
+}
+
+/* 80-bit format with 64-bit mantissa *including an explicit integer 1*
+   and 15-bit exponent. */
 static int to_ldoub(char *str, int32_t sign, uint8_t *result,
                     efunc error)
 {
@@ -394,13 +454,16 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
 int float_const(char *number, int32_t sign, uint8_t *result, int bytes,
                 efunc error)
 {
-    if (bytes == 4)
+    switch (bytes) {
+    case 2:
+	return to_float16(number, sign, result, error);
+    case 4:
         return to_float(number, sign, result, error);
-    else if (bytes == 8)
+    case 8:
         return to_double(number, sign, result, error);
-    else if (bytes == 10)
+    case 10:
         return to_ldoub(number, sign, result, error);
-    else {
+    default:
         error(ERR_PANIC, "strange value %d passed to float_const", bytes);
         return 0;
     }
diff --git a/parser.c b/parser.c
index ca12a097..69ae3790 100644
--- a/parser.c
+++ b/parser.c
@@ -230,30 +230,36 @@ insn *parse_line(int pass, char *buffer, insn * result,
                 if (i == TOKEN_FLOAT) {
                     eop->type = EOT_DB_STRING;
                     result->eops_float = TRUE;
-                    if (result->opcode == I_DD)
+		    switch (result->opcode) {
+		    case I_DW:
+			eop->stringlen = 2;
+			break;
+		    case I_DD:
                         eop->stringlen = 4;
-                    else if (result->opcode == I_DQ)
+			break;
+		    case I_DQ:
                         eop->stringlen = 8;
-                    else if (result->opcode == I_DT)
+			break;
+		    case I_DT:
                         eop->stringlen = 10;
-		    else if (result->opcode == I_DO)
-			eop->stringlen = 16;
-                    else {
+			break;
+		    default:
                         error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in `D%c' instruction",
-                              result->opcode == I_DW ? 'W' : 'B');
+                              " encountered in `d%c' instruction"
+			      ? (result->opcode == I_DO) ? 'o' : 'b');
                         /*
                          * fix suggested by Pedro Gimeno... original line
                          * was:
                          * eop->type = EOT_NOTHING;
                          */
                         eop->stringlen = 0;
+			break;
                     }
                     eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
                     tail = &eop->next;
                     *fixptr = eop;
                     eop->stringval = (char *)eop + sizeof(extop);
-                    if (eop->stringlen < 4 ||
+                    if (!eop->stringlen ||
                         !float_const(tokval.t_charptr, sign,
                                      (uint8_t *)eop->stringval,
                                      eop->stringlen, error))

From cfbe7c3cc2dbdfe1268e2d0a19fc59b52cbcfcc5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 17:49:09 -0700
Subject: [PATCH 16/29] Fix handling of DO; support unary + for floating-point
 numbers

Floating-point users generally expect to be able to use a unary plus.
Fix support for the DO instruction in several places.
---
 assemble.c | 13 +++++++++----
 parser.c   | 18 +++++++++++-------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/assemble.c b/assemble.c
index e5384548..efb02207 100644
--- a/assemble.c
+++ b/assemble.c
@@ -250,6 +250,9 @@ int32_t assemble(int32_t segment, int32_t offset, int bits, uint32_t cp,
     case I_DT:
         wsize = 10;
         break;
+    case I_DO:
+	wsize = 16;
+	break;
     default:
 	break;
     }
@@ -564,10 +567,9 @@ int32_t insn_size(int32_t segment, int32_t offset, int bits, uint32_t cp,
     if (instruction->opcode == -1)
         return 0;
 
-    if (instruction->opcode == I_DB ||
-        instruction->opcode == I_DW ||
-        instruction->opcode == I_DD ||
-        instruction->opcode == I_DQ || instruction->opcode == I_DT) {
+    if (instruction->opcode == I_DB || instruction->opcode == I_DW ||
+        instruction->opcode == I_DD || instruction->opcode == I_DQ ||
+	instruction->opcode == I_DT || instruction->opcode == I_DO) {
         extop *e;
         int32_t isize, osize, wsize = 0;   /* placate gcc */
 
@@ -588,6 +590,9 @@ int32_t insn_size(int32_t segment, int32_t offset, int bits, uint32_t cp,
         case I_DT:
             wsize = 10;
             break;
+	case I_DO:
+	    wsize = 16;
+	    break;
 	default:
 	    break;
         }
diff --git a/parser.c b/parser.c
index 69ae3790..31c3612a 100644
--- a/parser.c
+++ b/parser.c
@@ -214,16 +214,18 @@ insn *parse_line(int pass, char *buffer, insn * result,
                 continue;
             }
 
-            if ((i == TOKEN_FLOAT && is_comma_next()) || i == '-') {
-                int32_t sign = +1L;
+            if ((i == TOKEN_FLOAT && is_comma_next())
+		|| i == '-' || i == '+') {
+                int32_t sign = +1;
 
-                if (i == '-') {
+                if (i == '+' || i == '-') {
                     char *save = stdscan_bufptr;
+		    int token = i;
+		    sign = (i == '-') ? -1 : 1;
                     i = stdscan(NULL, &tokval);
-                    sign = -1L;
                     if (i != TOKEN_FLOAT || !is_comma_next()) {
                         stdscan_bufptr = save;
-                        i = tokval.t_type = '-';
+                        i = tokval.t_type = token;
                     }
                 }
 
@@ -243,10 +245,12 @@ insn *parse_line(int pass, char *buffer, insn * result,
 		    case I_DT:
                         eop->stringlen = 10;
 			break;
+		    case I_DO:
+                        eop->stringlen = 16;
+			break;
 		    default:
                         error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in `d%c' instruction"
-			      ? (result->opcode == I_DO) ? 'o' : 'b');
+                              " encountered in `db' instruction");
                         /*
                          * fix suggested by Pedro Gimeno... original line
                          * was:

From e31747e95bba75c7e27d0a76f0e385c6d12351e2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 17:50:34 -0700
Subject: [PATCH 17/29] Unify all standard IEEE floating-point formats; add
 128-bit

Unify all the standard IEEE formats into one function, add support for
IEEE standard 128-bit floating point numbers.

The 80-bit format is still special since it explicitly represents the
integer portion.
---
 float.c        | 206 +++++++++++++++----------------------------------
 test/float.asm | 103 +++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 142 deletions(-)
 create mode 100644 test/float.asm

diff --git a/float.c b/float.c
index afa84d2e..a6ad3936 100644
--- a/float.c
+++ b/float.c
@@ -18,8 +18,8 @@
 #define TRUE 1
 #define FALSE 0
 
-#define MANT_WORDS 6            /* 64 bits + 32 for accuracy == 96 */
-#define MANT_DIGITS 28          /* 29 digits don't fit in 96 bits */
+#define MANT_WORDS  10          /* 112 bits + 48 for accuracy == 160 */
+#define MANT_DIGITS 49          /* 50 digits don't fit in 160 bits */
 
 /*
  * guaranteed top bit of from is set
@@ -47,9 +47,8 @@ static int ieee_multiply(uint16_t *to, uint16_t *from)
         temp[i] &= 0xFFFF;
     }
     if (temp[0] & 0x8000) {
-        for (i = 0; i < MANT_WORDS; i++)
-            to[i] = temp[i] & 0xFFFF;
-        return 0;
+	memcpy(to, temp, 2*MANT_WORDS);
+	return 0;
     } else {
         for (i = 0; i < MANT_WORDS; i++)
             to[i] = (temp[i] << 1) + !!(temp[i + 1] & 0x8000);
@@ -213,75 +212,33 @@ static int ieee_round(uint16_t *mant, int i)
 
 #define put(a,b) ( (*(a)=(b)), ((a)[1]=(b)>>8) )
 
-/* 64-bit format with 52-bit mantissa and 11-bit exponent */
-static int to_double(char *str, int32_t sign, uint8_t *result,
-                     efunc error)
-{
-    uint16_t mant[MANT_WORDS];
-    int32_t exponent;
+/* Produce standard IEEE formats, with implicit "1" bit; this makes
+   the following assumptions:
 
-    sign = (sign < 0 ? 0x8000L : 0L);
+   - the sign bit is the MSB, followed by the exponent.
+   - the sign bit plus exponent fit in 16 bits.
+   - the exponent bias is 2^(n-1)-1 for an n-bit exponent */
 
-    ieee_flconvert(str, mant, &exponent, error);
-    if (mant[0] & 0x8000) {
-        /*
-         * Non-zero.
-         */
-        exponent--;
-        if (exponent >= -1022 && exponent <= 1024) {
-            /*
-             * Normalised.
-             */
-            exponent += 1023;
-            ieee_shr(mant, 11);
-            ieee_round(mant, 4);
-            if (mant[0] & 0x20) /* did we scale up by one? */
-                ieee_shr(mant, 1), exponent++;
-            mant[0] &= 0xF;     /* remove leading one */
-            put(result + 6, (exponent << 4) | mant[0] | sign);
-            put(result + 4, mant[1]);
-            put(result + 2, mant[2]);
-            put(result + 0, mant[3]);
-        } else if (exponent < -1022 && exponent >= -1074) {
-            /*
-             * Denormal.
-             */
-            int shift = -(exponent + 1011);
-            int sh = shift % 16, wds = shift / 16;
-            ieee_shr(mant, sh);
-            if (ieee_round(mant, 4 - wds)
-                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
-                ieee_shr(mant, 1);
-                if (sh == 0)
-                    mant[0] |= 0x8000;
-                exponent++;
-            }
-            put(result + 6, (wds == 0 ? mant[0] : 0) | sign);
-            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
-            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
-            put(result + 0, (wds <= 3 ? mant[3 - wds] : 0));
-        } else {
-            if (exponent > 0) {
-                error(ERR_NONFATAL, "overflow in floating-point constant");
-                return 0;
-            } else
-                memset(result, 0, 8);
-        }
-    } else {
-        /*
-         * Zero.
-         */
-        memset(result, 0, 8);
-    }
-    return 1;                   /* success */
-}
+struct ieee_format {
+    int words;
+    int mantissa;		/* Bits in the mantissa */
+    int exponent;		/* Bits in the exponent */
+};
 
-/* 32-bit format with 23-bit mantissa and 8-bit exponent */
+static const struct ieee_format ieee_16  = { 1,  10,  5 };
+static const struct ieee_format ieee_32  = { 2,  23,  8 };
+static const struct ieee_format ieee_64  = { 4,  52, 11 };
+static const struct ieee_format ieee_128 = { 8, 112, 15 };
+
+/* Produce all the standard IEEE formats: 16, 32, 64, and 128 bits */
 static int to_float(char *str, int32_t sign, uint8_t *result,
-                    efunc error)
+		    const struct ieee_format *fmt, efunc error)
 {
-    uint16_t mant[MANT_WORDS];
+    uint16_t mant[MANT_WORDS], *mp;
     int32_t exponent;
+    int32_t expmax = 1 << (fmt->exponent-1);
+    uint16_t implicit_one = 0x8000 >> fmt->exponent;
+    int i;
 
     sign = (sign < 0 ? 0x8000L : 0L);
 
@@ -291,101 +248,64 @@ static int to_float(char *str, int32_t sign, uint8_t *result,
          * Non-zero.
          */
         exponent--;
-        if (exponent >= -126 && exponent <= 128) {
+        if (exponent >= 2-expmax && exponent <= expmax) {
             /*
              * Normalised.
              */
-            exponent += 127;
-            ieee_shr(mant, 8);
-            ieee_round(mant, 2);
-            if (mant[0] & 0x100)        /* did we scale up by one? */
-                ieee_shr(mant, 1), exponent++;
-            mant[0] &= 0x7F;    /* remove leading one */
-            put(result + 2, (exponent << 7) | mant[0] | sign);
-            put(result + 0, mant[1]);
-        } else if (exponent < -126 && exponent >= -149) {
+            exponent += expmax;
+            ieee_shr(mant, fmt->exponent);
+            ieee_round(mant, fmt->words);
+	    /* did we scale up by one? */
+            if (mant[0] & (implicit_one << 1)) {
+                ieee_shr(mant, 1);
+		exponent++;
+	    }
+
+            mant[0] &= (implicit_one-1);     /* remove leading one */
+	    mant[0] |= exponent << (15 - fmt->exponent);
+        } else if (exponent < 2-expmax && exponent >= 2-expmax-fmt->mantissa) {
             /*
              * Denormal.
              */
-            int shift = -(exponent + 118);
+            int shift = -(exponent + expmax-2-fmt->exponent);
             int sh = shift % 16, wds = shift / 16;
             ieee_shr(mant, sh);
-            if (ieee_round(mant, 2 - wds)
+            if (ieee_round(mant, fmt->words - wds)
                 || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
                 ieee_shr(mant, 1);
                 if (sh == 0)
                     mant[0] |= 0x8000;
                 exponent++;
             }
-            put(result + 2, (wds == 0 ? mant[0] : 0) | sign);
-            put(result + 0, (wds <= 1 ? mant[1 - wds] : 0));
+
+	    if (wds) {
+		for (i = fmt->words-1; i >= wds; i--)
+		    mant[i] = mant[i-wds];
+		for (; i >= 0; i--)
+		    mant[i] = 0;
+	    }
         } else {
             if (exponent > 0) {
                 error(ERR_NONFATAL, "overflow in floating-point constant");
                 return 0;
-            } else
-                memset(result, 0, 4);
+	    } else {
+		memset(mant, 0, 2*fmt->words);
+	    }
         }
     } else {
-        memset(result, 0, 4);
+	/* Zero */
+        memset(mant, 0, 2*fmt->words);
     }
-    return 1;
-}
 
-/* 16-bit format with 10-bit mantissa and 5-bit exponent.
-   Defined in IEEE 754r.  Used in SSE5.  See the AMD SSE5 manual, AMD
-   document number 43479. */
-static int to_float16(char *str, int32_t sign, uint8_t *result,
-		      efunc error)
-{
-    uint16_t mant[MANT_WORDS];
-    int32_t exponent;
+    mant[0] |= sign;
 
-    sign = (sign < 0 ? 0x8000L : 0L);
-
-    ieee_flconvert(str, mant, &exponent, error);
-    if (mant[0] & 0x8000) {
-        /*
-         * Non-zero.
-         */
-        exponent--;
-        if (exponent >= -14 && exponent <= 16) {
-            /*
-             * Normalised.
-             */
-            exponent += 15;
-            ieee_shr(mant, 5);
-            ieee_round(mant, 1);
-            if (mant[0] & 0x800)        /* did we scale up by one? */
-                ieee_shr(mant, 1), exponent++;
-            mant[0] &= 0x3FF;    /* remove leading one */
-            put(result + 0, (exponent << 7) | mant[0] | sign);
-        } else if (exponent < -14 && exponent >= -24) {
-            /*
-             * Denormal.
-             */
-            int shift = -(exponent + 8);
-            int sh = shift % 16, wds = shift / 16;
-            ieee_shr(mant, sh);
-            if (ieee_round(mant, 1 - wds)
-                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
-                ieee_shr(mant, 1);
-                if (sh == 0)
-                    mant[0] |= 0x8000;
-                exponent++;
-            }
-            put(result + 0, (wds == 0 ? mant[0] : 0) | sign);
-        } else {
-            if (exponent > 0) {
-                error(ERR_NONFATAL, "overflow in floating-point constant");
-                return 0;
-            } else
-                memset(result, 0, 2);
-        }
-    } else {
-        memset(result, 0, 2);
+    for (mp = &mant[fmt->words], i = 0; i < fmt->words; i++) {
+	uint16_t m = *--mp;
+	put(result, m);
+	result += 2;
     }
-    return 1;
+
+    return 1;                   /* success */
 }
 
 /* 80-bit format with 64-bit mantissa *including an explicit integer 1*
@@ -456,13 +376,15 @@ int float_const(char *number, int32_t sign, uint8_t *result, int bytes,
 {
     switch (bytes) {
     case 2:
-	return to_float16(number, sign, result, error);
+	return to_float(number, sign, result, &ieee_16, error);
     case 4:
-        return to_float(number, sign, result, error);
+        return to_float(number, sign, result, &ieee_32, error);
     case 8:
-        return to_double(number, sign, result, error);
+        return to_float(number, sign, result, &ieee_64, error);
     case 10:
         return to_ldoub(number, sign, result, error);
+    case 16:
+        return to_float(number, sign, result, &ieee_128, error);
     default:
         error(ERR_PANIC, "strange value %d passed to float_const", bytes);
         return 0;
diff --git a/test/float.asm b/test/float.asm
new file mode 100644
index 00000000..30d1f062
--- /dev/null
+++ b/test/float.asm
@@ -0,0 +1,103 @@
+;
+; Test of floating-point formats
+;
+
+; 16-bit
+	dw 1.0
+	dw +1.0
+	dw -1.0
+	dw 0.0
+	dw +0.0
+	dw -0.0
+	dw 1.83203125
+	dw +1.83203125
+	dw -1.83203125
+	dw 1.83203125e3
+	dw +1.83203125e3
+	dw -1.83203125e3
+	dw 1.83203125e-3
+	dw +1.83203125e-3
+	dw -1.83203125e-3
+	dw 1.83203125e-6		; Denormal!
+	dw +1.83203125e-6		; Denormal!
+	dw -1.83203125e-6		; Denormal!
+
+; 32-bit
+	dd 1.0
+	dd +1.0
+	dd -1.0
+	dd 0.0
+	dd +0.0
+	dd -0.0
+	dd 1.83203125
+	dd +1.83203125
+	dd -1.83203125
+	dd 1.83203125e15
+	dd +1.83203125e15
+	dd -1.83203125e15
+	dd 1.83203125e-15
+	dd +1.83203125e-15
+	dd -1.83203125e-15
+	dd 1.83203125e-40		; Denormal!
+	dd +1.83203125e-40		; Denormal!
+	dd -1.83203125e-40		; Denormal!
+
+; 64-bit
+	dq 1.0
+	dq +1.0
+	dq -1.0
+	dq 0.0
+	dq +0.0
+	dq -0.0
+	dq 1.83203125
+	dq +1.83203125
+	dq -1.83203125
+	dq 1.83203125e300
+	dq +1.83203125e300
+	dq -1.83203125e300
+	dq 1.83203125e-300
+	dq +1.83203125e-300
+	dq -1.83203125e-300
+	dq 1.83203125e-320		; Denormal!
+	dq +1.83203125e-320		; Denormal!
+	dq -1.83203125e-320		; Denormal!
+
+; 80-bit
+	dt 1.0
+	dt +1.0
+	dt -1.0
+	dt 0.0
+	dt +0.0
+	dt -0.0
+	dt 1.83203125
+	dt +1.83203125
+	dt -1.83203125
+	dt 1.83203125e+4000
+	dt +1.83203125e+4000
+	dt -1.83203125e+4000
+	dt 1.83203125e-4000
+	dt +1.83203125e-4000
+	dt -1.83203125e-4000
+	dt 1.83203125e-4940		; Denormal!
+	dt +1.83203125e-4940		; Denormal!
+	dt -1.83203125e-4940		; Denormal!
+
+; 128-bit
+	do 1.0
+	do +1.0
+	do -1.0
+	do 0.0
+	do +0.0
+	do -0.0
+	do 1.83203125
+	do +1.83203125
+	do -1.83203125
+	do 1.83203125e+4000
+	do +1.83203125e+4000
+	do -1.83203125e+4000
+	do 1.83203125e-4000
+	do +1.83203125e-4000
+	do -1.83203125e-4000
+	do 1.83203125e-4940		; Denormal!
+	do +1.83203125e-4940		; Denormal!
+	do -1.83203125e-4940		; Denormal!

From fe2177fe4287bbfa9205bcd362694f47870a3c30 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 18:31:26 -0700
Subject: [PATCH 18/29] Support C99-style hexadecimal floating point.

Add support for C99-style hexadecimal floating point.  The format is
0x <hexadecimal mantissa> p <binary exponent>.  0x1.0p+1 thus is the
same as 2.0.
---
 float.c   | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stdscan.c |  4 ++-
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/float.c b/float.c
index a6ad3936..08dfdb47 100644
--- a/float.c
+++ b/float.c
@@ -8,6 +8,7 @@
  * initial version 13/ix/96 by Simon Tatham
  */
 
+#include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -56,6 +57,91 @@ static int ieee_multiply(uint16_t *to, uint16_t *from)
     }
 }
 
+static int hexval(char c)
+{
+    if (c >= '0' && c <= '9')
+	return c-'0';
+    else if (c >= 'a' && c <= 'f')
+	return c-'a'+10;
+    else
+	return c-'A'+10;
+}
+
+static void ieee_flconvert_hex(char *string, uint16_t *mant,
+			       int32_t *exponent, efunc error)
+{
+    static const int log2tbl[16] =
+	{ -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3 };
+    uint16_t mult[MANT_WORDS+1], *mp;
+    int ms;
+    int32_t twopwr;
+    int seendot, seendigit;
+    unsigned char c;
+
+    twopwr = 0;
+    seendot = seendigit = 0;
+
+    memset(mult, 0, sizeof mult);
+
+    while ((c = *string++) != '\0') {
+	if (c == '.') {
+            if (!seendot)
+                seendot = TRUE;
+            else {
+                error(ERR_NONFATAL,
+                      "too many periods in floating-point constant");
+                return;
+            }
+	} else if (isxdigit(c)) {
+	    int v = hexval(c);
+
+	    if (!seendigit && v) {
+		int l = log2tbl[v];
+
+		seendigit = 1;
+		mp = mult;
+		ms = 15-l;
+
+		twopwr = seendot ? twopwr-4+l : l-3;
+	    }
+
+	    if (seendigit) {
+		if (ms <= 0) {
+		    *mp |= v >> -ms;
+		    mp++;
+		    if (mp > &mult[MANT_WORDS])
+			mp = &mult[MANT_WORDS]; /* Guard slot */
+		    ms += 16;
+		}
+		*mp |= v << ms;
+		ms -= 4;
+
+		if (!seendot)
+		    twopwr += 4;
+	    } else {
+		if (seendot)
+		    twopwr -= 4;
+	    }
+	} else if (c == 'p' || c == 'P') {
+	    twopwr += atoi(string);
+	    break;
+	} else {
+            error(ERR_NONFATAL,
+                  "floating-point constant: `%c' is invalid character",
+                  *string);
+            return;
+        }
+    }
+
+    if (!seendigit) {
+	memset(mant, 0, 2*MANT_WORDS); /* Zero */
+	*exponent = 0;
+    } else {
+	memcpy(mant, mult, 2*MANT_WORDS);
+	*exponent = twopwr;
+    }
+}
+
 static void ieee_flconvert(char *string, uint16_t *mant,
                            int32_t *exponent, efunc error)
 {
@@ -66,6 +152,11 @@ static void ieee_flconvert(char *string, uint16_t *mant,
     int32_t tenpwr, twopwr;
     int extratwos, started, seendot;
 
+    if (string[0] == '0' && (string[1] == 'x' || string[1] == 'X')) {
+	ieee_flconvert_hex(string+2, mant, exponent, error);
+	return;
+    }
+
     p = digits;
     tenpwr = 0;
     started = seendot = FALSE;
diff --git a/stdscan.c b/stdscan.c
index d4ad696d..b6a4ee8f 100644
--- a/stdscan.c
+++ b/stdscan.c
@@ -130,7 +130,9 @@ int stdscan(void *private_data, struct tokenval *tv)
             stdscan_bufptr++;
             while (isnumchar(*stdscan_bufptr) ||
                    ((stdscan_bufptr[-1] == 'e'
-                     || stdscan_bufptr[-1] == 'E')
+                     || stdscan_bufptr[-1] == 'E'
+		     || stdscan_bufptr[-1] == 'p'
+		     || stdscan_bufptr[-1] == 'P')
                     && (*stdscan_bufptr == '-' || *stdscan_bufptr == '+'))) {
                 stdscan_bufptr++;
             }

From 26976a187fa0b3e393118b6bf1a72707b0767ddb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 18:33:17 -0700
Subject: [PATCH 19/29] Fix error-reporting in hexadecimal floating-point
 numbers

---
 float.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/float.c b/float.c
index 08dfdb47..b87db848 100644
--- a/float.c
+++ b/float.c
@@ -128,7 +128,7 @@ static void ieee_flconvert_hex(char *string, uint16_t *mant,
 	} else {
             error(ERR_NONFATAL,
                   "floating-point constant: `%c' is invalid character",
-                  *string);
+                  c);
             return;
         }
     }

From 72ac77bb0b37990aa7cae7a058ae646135280301 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 18:37:36 -0700
Subject: [PATCH 20/29] Simple test for hexadecimal floating-point numbers

Very trivial test for hexadecimal floating-point numbers
---
 test/floatx.asm | 125 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 test/floatx.asm

diff --git a/test/floatx.asm b/test/floatx.asm
new file mode 100644
index 00000000..f513ec83
--- /dev/null
+++ b/test/floatx.asm
@@ -0,0 +1,125 @@
+;
+; floatx.asm
+;
+; Test hexadecimal floating-point numbers
+
+; 16-bit
+	dw 1.0
+	dw 0x1.0
+	dw 2.0
+	dw 0x2.0
+	dw 0x1.0p+1
+	dw 0x1.0p-1
+	dw 0x0.0
+	dw 0x1.23456789
+	dw 0x0.123456789
+	dw 0x0.0000123456789
+	dw 0x1.23456789p10
+	dw 0x1.23456789p+10
+	dw 0x1.23456789p-10
+	dw 0x0.123456789p10
+	dw 0x0.123456789p+10
+	dw 0x0.123456789abcdef0123456789abcdef012345p-10
+	dw 0x0.0000123456789
+	dw 0x0.0000123456789p+10
+	dw 0x0.0000123456789p-10
+
+; 32-bit
+	dd 1.0
+	dd 0x1.0
+	dd 2.0
+	dd 0x2.0
+	dd 0x1.0p+1
+	dd 0x1.0p-1
+	dd 0x0.0
+	dd 0x1.23456789
+	dd 0x0.123456789
+	dd 0x0.0000123456789
+	dd 0x1.23456789p10
+	dd 0x1.23456789p+10
+	dd 0x1.23456789p-10
+	dd 0x0.123456789p10
+	dd 0x0.123456789p+10
+	dd 0x0.123456789abcdef0123456789abcdef012345p-10
+	dd 0x0.0000123456789
+	dd 0x0.0000123456789p+10
+	dd 0x0.0000123456789p-10
+	dd 0x123456789.0
+	dd 0x0000123456789.0
+	dd 0x123456789.0p+0
+	dd 0x123456789.0p+64
+
+; 64-bit
+	dq 1.0
+	dq 0x1.0
+	dq 2.0
+	dq 0x2.0
+	dq 0x1.0p+1
+	dq 0x1.0p-1
+	dq 0x0.0
+	dq 0x1.23456789
+	dq 0x0.123456789
+	dq 0x0.0000123456789
+	dq 0x1.23456789p10
+	dq 0x1.23456789p+10
+	dq 0x1.23456789p-10
+	dq 0x0.123456789p10
+	dq 0x0.123456789p+10
+	dq 0x0.123456789abcdef0123456789abcdef012345p-10
+	dq 0x0.0000123456789
+	dq 0x0.0000123456789p+10
+	dq 0x0.0000123456789p-10
+	dq 0x123456789.0
+	dq 0x0000123456789.0
+	dq 0x123456789.0p+0
+	dq 0x123456789.0p+300
+	
+; 80-bit
+	dt 1.0
+	dt 0x1.0
+	dt 2.0
+	dt 0x2.0
+	dt 0x1.0p+1
+	dt 0x1.0p-1
+	dt 0x0.0
+	dt 0x1.23456789
+	dt 0x0.123456789
+	dt 0x0.0000123456789
+	dt 0x1.23456789p10
+	dt 0x1.23456789p+10
+	dt 0x1.23456789p-10
+	dt 0x0.123456789p10
+	dt 0x0.123456789p+10
+	dt 0x0.123456789abcdef0123456789abcdef012345p-10
+	dt 0x0.0000123456789
+	dt 0x0.0000123456789p+10
+	dt 0x0.0000123456789p-10
+	dt 0x123456789.0
+	dt 0x0000123456789.0
+	dt 0x123456789.0p+0
+	dt 0x123456789.0p+1024
+
+; 128-bit
+	do 1.0
+	do 0x1.0
+	do 2.0
+	do 0x2.0
+	do 0x1.0p+1
+	do 0x1.0p-1
+	do 0x0.0
+	do 0x1.23456789
+	do 0x0.123456789
+	do 0x0.0000123456789
+	do 0x1.23456789p10
+	do 0x1.23456789p+10
+	do 0x1.23456789p-10
+	do 0x0.123456789p10
+	do 0x0.123456789p+10
+	do 0x0.123456789abcdef0123456789abcdef012345p-10
+	do 0x0.0000123456789
+	do 0x0.0000123456789p+10
+	do 0x0.0000123456789p-10
+	do 0x123456789.0
+	do 0x0000123456789.0
+	do 0x123456789.0p+0
+	do 0x123456789.0p+1024

From 5107d672a006750a8f20b227d1fe33ead77ddd41 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 19:12:26 -0700
Subject: [PATCH 21/29] Update documentation

Document new floating-point capabilities, and clean up the discussion
about BITS 64 and REX prefixes.
---
 doc/nasmdoc.src | 61 ++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 2530b2b5..96faefbe 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -1093,7 +1093,7 @@ syntax in which register names must be prefixed by a \c{%} sign), or
 they can be \i{effective addresses} (see \k{effaddr}), constants
 (\k{const}) or expressions (\k{expr}).
 
-For \i{floating-point} instructions, NASM accepts a wide range of
+For x87 \i{floating-point} instructions, NASM accepts a wide range of
 syntaxes: you can use two-operand forms like MASM supports, or you
 can use NASM's native single-operand forms in most cases.
 \# Details of
@@ -1107,7 +1107,7 @@ For example, you can code:
 \c         fadd    st1,st0         ; this sets st1 := st1 + st0
 \c         fadd    to st1          ; so does this
 
-Almost any floating-point instruction that references memory must
+Almost any x87 floating-point instruction that references memory must
 use one of the prefixes \i\c{DWORD}, \i\c{QWORD} or \i\c{TWORD} to
 indicate what size of \i{memory operand} it refers to.
 
@@ -1145,6 +1145,7 @@ file. They can be invoked in a wide range of ways:
 \c       dt    1.234567e20         ; extended-precision float
 
 \c{DT} and \c{DO} do not accept \i{numeric constants} as operands.
+\c{DB} does not accept \i{floating-point} numbers as operands.
 
 
 \S{resb} \c{RESB} and friends: Declaring \i{Uninitialized} Data
@@ -1390,20 +1391,28 @@ when they are operands to \c{dw}.
 \S{fltconst} \I{floating-point, constants}Floating-Point Constants
 
 \i{Floating-point} constants are acceptable only as arguments to
-\i\c{DD}, \i\c{DQ} and \i\c{DT}. They are expressed in the
-traditional form: digits, then a period, then optionally more
-digits, then optionally an \c{E} followed by an exponent. The period
-is mandatory, so that NASM can distinguish between \c{dd 1}, which
-declares an integer constant, and \c{dd 1.0} which declares a
-floating-point constant.
+\i\c{DW}, \i\c{DD}, \i\c{DQ}, \i\c{DT}, and \i\c{DO}. They are
+expressed in the traditional form: digits, then a period, then
+optionally more digits, then optionally an \c{E} followed by an
+exponent. The period is mandatory, so that NASM can distinguish
+between \c{dd 1}, which declares an integer constant, and \c{dd 1.0}
+which declares a floating-point constant.
+
+NASM also supports C99-style hexadecimal floating-point: \c{0x},
+hexadecimal digits, period, optionally more hexadecimal digits, then
+optionally a \c{P} followed by a \e{binary} (not hexadecimal) exponent
+in decimal notation.
 
 Some examples:
 
+\c       dw    -0.5                    ; IEEE half precision
 \c       dd    1.2                     ; an easy one
+\c       dd    0x1p+2                  ; 1.0x2^2 = 4.0
 \c       dq    1.e10                   ; 10,000,000,000
 \c       dq    1.e+10                  ; synonymous with 1.e10
 \c       dq    1.e-10                  ; 0.000 000 000 1
 \c       dt    3.141592653589793238462 ; pi
+\c       do    1.e+4000                ; IEEE quad precision
 
 NASM cannot do compile-time arithmetic on floating-point constants.
 This is because NASM is designed to be portable - although it always
@@ -1418,15 +1427,9 @@ size of the assembler for very little benefit.
 
 \H{expr} \i{Expressions}
 
-Expressions in NASM are similar in syntax to those in C.
-
-NASM does not guarantee the size of the integers used to evaluate
-expressions at compile time: since NASM can compile and run on
-64-bit systems quite happily, don't assume that expressions are
-evaluated in 32-bit registers and so try to make deliberate use of
-\i{integer overflow}. It might not always work. The only thing NASM
-will guarantee is what's guaranteed by ANSI C: you always have \e{at
-least} 32 bits to work in.
+Expressions in NASM are similar in syntax to those in C.  Expressions
+are evaluated as 64-bit integers which are then adjusted to the
+appropriate size.
 
 NASM supports two special tokens in expressions, allowing
 calculations to involve the current assembly position: the
@@ -3425,15 +3428,21 @@ using 16-bit data need an 0x66 and those working on 16-bit addresses
 need an 0x67.
 
 When NASM is in \c{BITS 64} mode, most instructions operate the same
-as they do for \c{BITS 32} mode. However, 16-bit addresses are depreciated
-in the x86-64 architecture extension and the 0x67 prefix is used for 32-bit
-addressing. This is due to the default of 64-bit addressing. When the \c{REX}
-prefix is used, the processor does not know how to address the AH, BH, CH or
-DH (high 8-bit legacy) registers. This because the x86-64 has added a new
-set of registers and the capability to address the low 8-bits of the SP, BP
-SI and DI registers as SPL, BPL, SIL and DIL, respectively; but only when
-the REX prefix is used. In summary, the \c{REX} prefix causes the addressing
-of AH, BH, CH and DH to be replaced by SPL, BPL, SIL and DIL.
+as they do for \c{BITS 32} mode. However, there are 8 more general and
+SSE registers, and 16-bit addressing is no longer supported.
+
+The default address size is 64 bits; 32-bit addressing can be selected
+with the 0x67 prefix.  The default operand size is still 32 bits,
+however, and the 0x66 prefix selects 16-bit operand size.  The \c{REX}
+prefix is used both to select 64-bit operand size, and to access the
+new registers. NASM automatically inserts REX prefixes when
+necessary.
+
+When the \c{REX} prefix is used, the processor does not know how to
+address the AH, BH, CH or DH (high 8-bit legacy) registers. Instead,
+it is possible to access the low 8 bits of the SP, BP, SI and DI
+registers as SPL, BPL, SIL and DIL, respectively; but only when the
+REX prefix is used.
 
 The \c{BITS} directive has an exactly equivalent primitive form,
 \c{[BITS 16]}, \c{[BITS 32]} and \c{[BITS 64]}. The user-level form is

From f48bc6fb485de852c128c5756c77acd0611c2b87 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 21:55:56 -0700
Subject: [PATCH 22/29] Support generating NaNs and infinities

Support generating NaNs and infinities as part of floating-point
constants.
---
 float.c        | 184 +++++++++++++++++++++++++++++++++----------------
 insns.h        |   4 +-
 test/float.asm |  30 ++++++++
 tokens.dat     |   6 ++
 4 files changed, 161 insertions(+), 63 deletions(-)

diff --git a/float.c b/float.c
index b87db848..d22aa19c 100644
--- a/float.c
+++ b/float.c
@@ -303,6 +303,12 @@ static int ieee_round(uint16_t *mant, int i)
 
 #define put(a,b) ( (*(a)=(b)), ((a)[1]=(b)>>8) )
 
+/* Set a bit, using *bigendian* bit numbering (0 = MSB) */
+static void set_bit(uint16_t *mant, int bit)
+{
+    mant[bit >> 4] |= 1 << (~bit & 15);
+}
+
 /* Produce standard IEEE formats, with implicit "1" bit; this makes
    the following assumptions:
 
@@ -333,59 +339,84 @@ static int to_float(char *str, int32_t sign, uint8_t *result,
 
     sign = (sign < 0 ? 0x8000L : 0L);
 
-    ieee_flconvert(str, mant, &exponent, error);
-    if (mant[0] & 0x8000) {
-        /*
-         * Non-zero.
-         */
-        exponent--;
-        if (exponent >= 2-expmax && exponent <= expmax) {
-            /*
-             * Normalised.
-             */
-            exponent += expmax;
-            ieee_shr(mant, fmt->exponent);
-            ieee_round(mant, fmt->words);
-	    /* did we scale up by one? */
-            if (mant[0] & (implicit_one << 1)) {
-                ieee_shr(mant, 1);
-		exponent++;
-	    }
+    if (str[0] == '_') {
+	/* NaN or Infinity */
+	int32_t expmask = (1 << fmt->exponent)-1;
 
-            mant[0] &= (implicit_one-1);     /* remove leading one */
-	    mant[0] |= exponent << (15 - fmt->exponent);
-        } else if (exponent < 2-expmax && exponent >= 2-expmax-fmt->mantissa) {
-            /*
-             * Denormal.
-             */
-            int shift = -(exponent + expmax-2-fmt->exponent);
-            int sh = shift % 16, wds = shift / 16;
-            ieee_shr(mant, sh);
-            if (ieee_round(mant, fmt->words - wds)
-                || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
-                ieee_shr(mant, 1);
-                if (sh == 0)
-                    mant[0] |= 0x8000;
-                exponent++;
-            }
+	memset(mant, 0, sizeof mant);
+	mant[0] = expmask << (15-fmt->exponent); /* Exponent: all bits one */
 
-	    if (wds) {
-		for (i = fmt->words-1; i >= wds; i--)
-		    mant[i] = mant[i-wds];
-		for (; i >= 0; i--)
-		    mant[i] = 0;
-	    }
-        } else {
-            if (exponent > 0) {
-                error(ERR_NONFATAL, "overflow in floating-point constant");
-                return 0;
-	    } else {
-		memset(mant, 0, 2*fmt->words);
-	    }
-        }
+	switch (str[2]) {
+	case 'n':		/* __nan__ */
+	case 'N':
+	case 'q':		/* __qnan__ */
+	case 'Q':
+	    set_bit(mant, fmt->exponent+1); /* Highest bit in mantissa */
+	    break;
+	case 's':		/* __snan__ */
+	case 'S':
+	    set_bit(mant, fmt->exponent+fmt->mantissa);	/* Last bit */
+	    break;
+	case 'i':		/* __infinity__ */
+	case 'I':
+	    break;
+	}
     } else {
-	/* Zero */
-        memset(mant, 0, 2*fmt->words);
+	ieee_flconvert(str, mant, &exponent, error);
+	if (mant[0] & 0x8000) {
+	    /*
+	     * Non-zero.
+	     */
+	    exponent--;
+	    if (exponent >= 2-expmax && exponent <= expmax) {
+		/*
+		 * Normalised.
+		 */
+		exponent += expmax;
+		ieee_shr(mant, fmt->exponent);
+		ieee_round(mant, fmt->words);
+		/* did we scale up by one? */
+		if (mant[0] & (implicit_one << 1)) {
+		    ieee_shr(mant, 1);
+		    exponent++;
+		}
+		
+		mant[0] &= (implicit_one-1);     /* remove leading one */
+		mant[0] |= exponent << (15 - fmt->exponent);
+	    } else if (exponent < 2-expmax &&
+		       exponent >= 2-expmax-fmt->mantissa) {
+		/*
+		 * Denormal.
+		 */
+		int shift = -(exponent + expmax-2-fmt->exponent);
+		int sh = shift % 16, wds = shift / 16;
+		ieee_shr(mant, sh);
+		if (ieee_round(mant, fmt->words - wds)
+		    || (sh > 0 && (mant[0] & (0x8000 >> (sh - 1))))) {
+		    ieee_shr(mant, 1);
+		    if (sh == 0)
+			mant[0] |= 0x8000;
+		    exponent++;
+		}
+		
+		if (wds) {
+		    for (i = fmt->words-1; i >= wds; i--)
+			mant[i] = mant[i-wds];
+		    for (; i >= 0; i--)
+			mant[i] = 0;
+		}
+	    } else {
+		if (exponent > 0) {
+		    error(ERR_NONFATAL, "overflow in floating-point constant");
+		    return 0;
+		} else {
+		    memset(mant, 0, 2*fmt->words);
+		}
+	    }
+	} else {
+	    /* Zero */
+	    memset(mant, 0, 2*fmt->words);
+	}
     }
 
     mant[0] |= sign;
@@ -409,6 +440,31 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
 
     sign = (sign < 0 ? 0x8000L : 0L);
 
+    if (str[0] == '_') {
+	uint16_t is_snan = 0, is_qnan = 0x8000;
+	switch (str[2]) {
+	case 'n':
+	case 'N':
+	case 'q':
+	case 'Q':
+	    is_qnan = 0xc000;
+	    break;
+	case 's':
+	case 'S':
+	    is_snan = 1;
+	    break;
+	case 'i':
+	case 'I':
+	    break;
+	}
+	put(result + 0, is_snan);
+	put(result + 2, 0);
+	put(result + 4, 0);
+	put(result + 6, is_qnan);
+	put(result + 8, 0x7fff|sign);
+	return 1;
+    }
+
     ieee_flconvert(str, mant, &exponent, error);
     if (mant[0] & 0x8000) {
         /*
@@ -422,11 +478,11 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
             exponent += 16383;
             if (ieee_round(mant, 4))    /* did we scale up by one? */
                 ieee_shr(mant, 1), mant[0] |= 0x8000, exponent++;
-            put(result + 8, exponent | sign);
-            put(result + 6, mant[0]);
-            put(result + 4, mant[1]);
-            put(result + 2, mant[2]);
             put(result + 0, mant[3]);
+            put(result + 2, mant[2]);
+            put(result + 4, mant[1]);
+            put(result + 6, mant[0]);
+            put(result + 8, exponent | sign);
         } else if (exponent < -16383 && exponent >= -16446) {
             /*
              * Denormal.
@@ -441,23 +497,29 @@ static int to_ldoub(char *str, int32_t sign, uint8_t *result,
                     mant[0] |= 0x8000;
                 exponent++;
             }
-            put(result + 8, sign);
-            put(result + 6, (wds == 0 ? mant[0] : 0));
-            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
-            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
             put(result + 0, (wds <= 3 ? mant[3 - wds] : 0));
+            put(result + 2, (wds <= 2 ? mant[2 - wds] : 0));
+            put(result + 4, (wds <= 1 ? mant[1 - wds] : 0));
+            put(result + 6, (wds == 0 ? mant[0] : 0));
+            put(result + 8, sign);
         } else {
             if (exponent > 0) {
                 error(ERR_NONFATAL, "overflow in floating-point constant");
                 return 0;
-            } else
-                memset(result, 0, 10);
+            } else {
+		goto zero;
+	    }
         }
     } else {
         /*
          * Zero.
          */
-        memset(result, 0, 10);
+    zero:
+	put(result + 0, 0);
+	put(result + 2, 0);
+	put(result + 4, 0);
+	put(result + 6, 0);
+	put(result + 8, sign);
     }
     return 1;
 }
diff --git a/insns.h b/insns.h
index b025c7a5..314737af 100644
--- a/insns.h
+++ b/insns.h
@@ -12,10 +12,10 @@
 #include "nasm.h"
 
 /* max length of any instruction, register name etc. */
-#if MAX_INSLEN > 9              /* MAX_INSLEN defined in insnsi.h */
+#if MAX_INSLEN > 12              /* MAX_INSLEN defined in insnsi.h */
 #define MAX_KEYWORD MAX_INSLEN
 #else
-#define MAX_KEYWORD 9
+#define MAX_KEYWORD 12
 #endif
 
 struct itemplate {
diff --git a/test/float.asm b/test/float.asm
index 30d1f062..bcb2ec28 100644
--- a/test/float.asm
+++ b/test/float.asm
@@ -21,6 +21,12 @@
 	dw 1.83203125e-6		; Denormal!
 	dw +1.83203125e-6		; Denormal!
 	dw -1.83203125e-6		; Denormal!
+	dw __Infinity__
+	dw +__Infinity__
+	dw -__Infinity__
+	dw __NaN__
+	dw __QNaN__
+	dw __SNaN__
 
 ; 32-bit
 	dd 1.0
@@ -41,6 +47,12 @@
 	dd 1.83203125e-40		; Denormal!
 	dd +1.83203125e-40		; Denormal!
 	dd -1.83203125e-40		; Denormal!
+	dd __Infinity__
+	dd +__Infinity__
+	dd -__Infinity__
+	dd __NaN__
+	dd __QNaN__
+	dd __SNaN__
 
 ; 64-bit
 	dq 1.0
@@ -61,6 +73,12 @@
 	dq 1.83203125e-320		; Denormal!
 	dq +1.83203125e-320		; Denormal!
 	dq -1.83203125e-320		; Denormal!
+	dq __Infinity__
+	dq +__Infinity__
+	dq -__Infinity__
+	dq __NaN__
+	dq __QNaN__
+	dq __SNaN__
 
 ; 80-bit
 	dt 1.0
@@ -81,6 +99,12 @@
 	dt 1.83203125e-4940		; Denormal!
 	dt +1.83203125e-4940		; Denormal!
 	dt -1.83203125e-4940		; Denormal!
+	dt __Infinity__
+	dt +__Infinity__
+	dt -__Infinity__
+	dt __NaN__
+	dt __QNaN__
+	dt __SNaN__
 
 ; 128-bit
 	do 1.0
@@ -101,3 +125,9 @@
 	do 1.83203125e-4940		; Denormal!
 	do +1.83203125e-4940		; Denormal!
 	do -1.83203125e-4940		; Denormal!
+	do __Infinity__
+	do +__Infinity__
+	do -__Infinity__
+	do __NaN__
+	do __QNaN__
+	do __SNaN__
diff --git a/tokens.dat b/tokens.dat
index c84b8fb3..e7c1cb29 100644
--- a/tokens.dat
+++ b/tokens.dat
@@ -32,6 +32,12 @@ to
 tword
 word
 
+% TOKEN_FLOAT, 0, 0
+__infinity__
+__nan__
+__qnan__
+__snan__
+
 % TOKEN_*, 0, 0
 seg
 wrt

From 8084f105a251b025176b293a5232295fcd39f1d3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 22:08:04 -0700
Subject: [PATCH 23/29] Document Infinity and NaN

Add __Infinity__, __QNaN__, and __SNaN__ to the documentation.
---
 doc/nasmdoc.src | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src
index 96faefbe..c79cd39b 100644
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -151,6 +151,7 @@ convention
 \IR{ms-dos} MS-DOS
 \IR{ms-dos device drivers} MS-DOS device drivers
 \IR{multipush} \c{multipush} macro
+\IR{nan} NaN
 \IR{nasm version} NASM version
 \IR{netbsd} NetBSD
 \IR{omf} OMF
@@ -1424,6 +1425,15 @@ do floating arithmetic it would have to include its own complete set
 of floating-point routines, which would significantly increase the
 size of the assembler for very little benefit.
 
+The special tokens \i\c{__Infinity__}, \i\c{__QNaN__} (or
+\i\c{__NaN__}) and \i\c{__SNaN__} can be used to generate
+\I{infinity}infinities, quiet \i{NaN}s, and signalling NaNs,
+respectively.  These are normally used as macros:
+
+\c %define Inf __Infinity__
+\c %define NaN __QNaN__
+\c
+\c       dq    +1.5, -Inf, NaN         ; Double-precision constants
 
 \H{expr} \i{Expressions}
 

From bf9a24f46471abad75fa3efba059646a6c4f5026 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 18 Sep 2007 22:54:40 -0700
Subject: [PATCH 24/29] Slightly optimize the interface to nasm_token_hash()

Instead of returning -1 from nasm_token_hash, set tv->t_type to
TOKEN_ID and return TOKEN_ID, since that's what stdscan.c wants to do
with it anyway.  This allows us to simply tailcall nasm_token_hash().
---
 stdscan.c  | 6 +-----
 tokhash.pl | 4 ++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/stdscan.c b/stdscan.c
index b6a4ee8f..aecbd4a7 100644
--- a/stdscan.c
+++ b/stdscan.c
@@ -75,7 +75,6 @@ int stdscan(void *private_data, struct tokenval *tv)
         (*stdscan_bufptr == '$' && isidstart(stdscan_bufptr[1]))) {
         /* now we've got an identifier */
         int is_sym = FALSE;
-	int t;
 
         if (*stdscan_bufptr == '$') {
             is_sym = TRUE;
@@ -99,10 +98,7 @@ int stdscan(void *private_data, struct tokenval *tv)
         *r = '\0';
         /* right, so we have an identifier sitting in temp storage. now,
          * is it actually a register or instruction name, or what? */
-	if ((t = nasm_token_hash(ourcopy, tv)) != -1)
-	    return t;
-	else
-	    return tv->t_type = TOKEN_ID;
+	return nasm_token_hash(ourcopy, tv);
     } else if (*stdscan_bufptr == '$' && !isnumchar(stdscan_bufptr[1])) {
         /*
          * It's a $ sign with no following hex number; this must
diff --git a/tokhash.pl b/tokhash.pl
index 9d5888be..a63e55f3 100755
--- a/tokhash.pl
+++ b/tokhash.pl
@@ -194,14 +194,14 @@ print  "    }\n";
 print  "\n";
 printf "    ix = hash1[k1 & 0x%x] + hash2[k2 & 0x%x];\n", $n-1, $n-1;
 printf "    if (ix >= %d)\n", scalar(@tokendata);
-print  "        return -1;\n";
+print  "        return tv->t_type = TOKEN_ID;\n";
 print  "\n";
 print  "    data = &tokendata[ix];\n";
 
 # print  "    fprintf(stderr, \"Looked for: %s found: %s\\n\", token, data->string);\n\n";
 
 print  "    if (strcmp(data->string, token))\n";
-print  "        return -1;\n";
+print  "        return tv->t_type = TOKEN_ID;\n";
 print  "\n";
 print  "    tv->t_integer = data->num;\n";
 print  "    tv->t_inttwo  = data->aux;\n";

From b4b43178783e963e95fb290e82f1a0c6d6725520 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 19 Sep 2007 16:15:22 -0700
Subject: [PATCH 25/29] test/Makefile: make a bit more useful

---
 test/Makefile | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/test/Makefile b/test/Makefile
index bdb55a62..f48e3d9f 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,2 +1,23 @@
+.SUFFIXES: .bin .o .o64 .obj .obj64 .exe .asm .lst
+
+NASM = ../nasm
+
+.asm.bin:
+	$(NASM) -f bin -o $@ -l $*.lst $<
+
+.asm.o:
+	$(NASM) -f elf32 -o $@ -l $*.lst $<
+
+.asm.o64:
+	$(NASM) -f elf64 -o $@ -l $*.lst $<
+
+.asm.obj:
+	$(NASM) -f win32 -o $@ -l $*.lst $<
+
+.asm.obj64:
+	$(NASM) -f win64 -o $@ -l $*.lst $<
+
+all:
+
 clean:
-	rm -f *test *.com *.o *.obj *so *.exe
+	rm -f *.bin *.com *.o *.o64 *.obj *.obj64 *.exe *.lst

From 4ff711889f0c29165c0b1d523b0ac53cf5e10763 Mon Sep 17 00:00:00 2001
From: Frank Kotler <fbk@Reltok1.comcast.net>
Date: Wed, 19 Sep 2007 21:07:32 -0400
Subject: [PATCH 26/29] Version 0.99.03

---
 version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version b/version
index 10afd2eb..a3ebc112 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.99.02
+0.99.03

From 87f252aaa53f8ce5305d84c95a8751592f647dc2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 19 Sep 2007 21:40:37 -0700
Subject: [PATCH 27/29] Make nasm_malloc() et al available from inside ndisasm

Clean up nasmlib to remove functions irrelevant for ndisasm; make
nasm_malloc() etc usable inside ndisasm.
---
 Makefile.in      |  27 +++++----
 Mkfiles/msvc.mak |  27 +++++----
 exprlib.c        | 154 +++++++++++++++++++++++++++++++++++++++++++++++
 nasm.h           |  55 ++++-------------
 nasmlib.c        | 146 --------------------------------------------
 nasmlib.h        |  58 +++++++++++++-----
 ndisasm.c        |  14 +++++
 7 files changed, 253 insertions(+), 228 deletions(-)
 create mode 100644 exprlib.c

diff --git a/Makefile.in b/Makefile.in
index 0dc6aed7..1a43f551 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -58,7 +58,7 @@ NASM =	nasm.$(O) nasmlib.$(O) float.$(O) insnsa.$(O) assemble.$(O) \
 	output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
 	output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
 	preproc.$(O) pptok.$(O) \
-	listing.$(O) eval.$(O) stdscan.$(O) tokhash.$(O)
+	listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O)
 
 NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) insnsd.$(O)
 
@@ -211,16 +211,19 @@ assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h pptok.h preproc.h regflags.c regs.h regvals.c version.h
 crc64.$(O): crc64.c
 disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h insnsn.c \
- names.c nasm.h regdis.c regs.c regs.h sync.h version.h
+ names.c nasm.h nasmlib.h regdis.c regs.c regs.h sync.h version.h
 eval.$(O): eval.c compiler.h config.h eval.h insnsi.h labels.h nasm.h \
  nasmlib.h regs.h version.h
-float.$(O): float.c compiler.h config.h insnsi.h nasm.h regs.h version.h
+exprlib.$(O): exprlib.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
+float.$(O): float.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 hashtbl.$(O): hashtbl.c compiler.h config.h hashtbl.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
-insnsa.$(O): insnsa.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
-insnsd.$(O): insnsd.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
+insnsa.$(O): insnsa.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
+ regs.h version.h
+insnsd.$(O): insnsd.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
+ regs.h version.h
 insnsn.$(O): insnsn.c
 labels.$(O): labels.c compiler.h config.h hashtbl.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
@@ -235,8 +238,8 @@ nasmlib.$(O): nasmlib.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
 ndisasm.$(O): ndisasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h sync.h version.h
-outform.$(O): outform.c compiler.h config.h insnsi.h nasm.h outform.h regs.h \
- version.h
+outform.$(O): outform.c compiler.h config.h insnsi.h nasm.h nasmlib.h \
+ outform.h regs.h version.h
 output/outaout.$(O): output/outaout.c compiler.h config.h insnsi.h nasm.h \
  nasmlib.h outform.h regs.h stdscan.h version.h
 output/outas86.$(O): output/outas86.c compiler.h config.h insnsi.h nasm.h \
@@ -272,6 +275,6 @@ regs.$(O): regs.c
 regvals.$(O): regvals.c
 stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h stdscan.h version.h
-sync.$(O): sync.c sync.h
-tokhash.$(O): tokhash.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
+sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
+tokhash.$(O): tokhash.c compiler.h config.h insns.h insnsi.h nasm.h \
+ nasmlib.h regs.h version.h
diff --git a/Mkfiles/msvc.mak b/Mkfiles/msvc.mak
index d70973d9..96b5449c 100644
--- a/Mkfiles/msvc.mak
+++ b/Mkfiles/msvc.mak
@@ -38,7 +38,7 @@ NASM =	nasm.$(O) nasmlib.$(O) float.$(O) insnsa.$(O) assemble.$(O) \
 	output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
 	output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
 	preproc.$(O) pptok.$(O) \
-	listing.$(O) eval.$(O) stdscan.$(O) tokhash.$(O)
+	listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O)
 
 NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) insnsd.$(O)
 
@@ -169,16 +169,19 @@ assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h pptok.h preproc.h regflags.c regs.h regvals.c version.h
 crc64.$(O): crc64.c
 disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h insnsn.c \
- names.c nasm.h regdis.c regs.c regs.h sync.h version.h
+ names.c nasm.h nasmlib.h regdis.c regs.c regs.h sync.h version.h
 eval.$(O): eval.c compiler.h config.h eval.h insnsi.h labels.h nasm.h \
  nasmlib.h regs.h version.h
-float.$(O): float.c compiler.h config.h insnsi.h nasm.h regs.h version.h
+exprlib.$(O): exprlib.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
+float.$(O): float.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 hashtbl.$(O): hashtbl.c compiler.h config.h hashtbl.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
-insnsa.$(O): insnsa.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
-insnsd.$(O): insnsd.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
+insnsa.$(O): insnsa.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
+ regs.h version.h
+insnsd.$(O): insnsd.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
+ regs.h version.h
 insnsn.$(O): insnsn.c
 labels.$(O): labels.c compiler.h config.h hashtbl.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
@@ -193,8 +196,8 @@ nasmlib.$(O): nasmlib.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h version.h
 ndisasm.$(O): ndisasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h sync.h version.h
-outform.$(O): outform.c compiler.h config.h insnsi.h nasm.h outform.h regs.h \
- version.h
+outform.$(O): outform.c compiler.h config.h insnsi.h nasm.h nasmlib.h \
+ outform.h regs.h version.h
 output/outaout.$(O): output/outaout.c compiler.h config.h insnsi.h nasm.h \
  nasmlib.h outform.h regs.h stdscan.h version.h
 output/outas86.$(O): output/outas86.c compiler.h config.h insnsi.h nasm.h \
@@ -230,6 +233,6 @@ regs.$(O): regs.c
 regvals.$(O): regvals.c
 stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h stdscan.h version.h
-sync.$(O): sync.c sync.h
-tokhash.$(O): tokhash.c compiler.h config.h insns.h insnsi.h nasm.h regs.h \
- version.h
+sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
+tokhash.$(O): tokhash.c compiler.h config.h insns.h insnsi.h nasm.h \
+ nasmlib.h regs.h version.h
diff --git a/exprlib.c b/exprlib.c
new file mode 100644
index 00000000..2f03ff0a
--- /dev/null
+++ b/exprlib.c
@@ -0,0 +1,154 @@
+/*
+ * exprlib.c
+ *
+ * Library routines to manipulate expression data types.
+ */
+
+#include "nasm.h"
+
+/*
+ * Return TRUE if the argument is a simple scalar. (Or a far-
+ * absolute, which counts.)
+ */
+int is_simple(expr * vect)
+{
+    while (vect->type && !vect->value)
+        vect++;
+    if (!vect->type)
+        return 1;
+    if (vect->type != EXPR_SIMPLE)
+        return 0;
+    do {
+        vect++;
+    } while (vect->type && !vect->value);
+    if (vect->type && vect->type < EXPR_SEGBASE + SEG_ABS)
+        return 0;
+    return 1;
+}
+
+/*
+ * Return TRUE if the argument is a simple scalar, _NOT_ a far-
+ * absolute.
+ */
+int is_really_simple(expr * vect)
+{
+    while (vect->type && !vect->value)
+        vect++;
+    if (!vect->type)
+        return 1;
+    if (vect->type != EXPR_SIMPLE)
+        return 0;
+    do {
+        vect++;
+    } while (vect->type && !vect->value);
+    if (vect->type)
+        return 0;
+    return 1;
+}
+
+/*
+ * Return TRUE if the argument is relocatable (i.e. a simple
+ * scalar, plus at most one segment-base, plus possibly a WRT).
+ */
+int is_reloc(expr * vect)
+{
+    while (vect->type && !vect->value)  /* skip initial value-0 terms */
+        vect++;
+    if (!vect->type)            /* trivially return TRUE if nothing */
+        return 1;               /* is present apart from value-0s */
+    if (vect->type < EXPR_SIMPLE)       /* FALSE if a register is present */
+        return 0;
+    if (vect->type == EXPR_SIMPLE) {    /* skip over a pure number term... */
+        do {
+            vect++;
+        } while (vect->type && !vect->value);
+        if (!vect->type)        /* ...returning TRUE if that's all */
+            return 1;
+    }
+    if (vect->type == EXPR_WRT) {       /* skip over a WRT term... */
+        do {
+            vect++;
+        } while (vect->type && !vect->value);
+        if (!vect->type)        /* ...returning TRUE if that's all */
+            return 1;
+    }
+    if (vect->value != 0 && vect->value != 1)
+        return 0;               /* segment base multiplier non-unity */
+    do {                        /* skip over _one_ seg-base term... */
+        vect++;
+    } while (vect->type && !vect->value);
+    if (!vect->type)            /* ...returning TRUE if that's all */
+        return 1;
+    return 0;                   /* And return FALSE if there's more */
+}
+
+/*
+ * Return TRUE if the argument contains an `unknown' part.
+ */
+int is_unknown(expr * vect)
+{
+    while (vect->type && vect->type < EXPR_UNKNOWN)
+        vect++;
+    return (vect->type == EXPR_UNKNOWN);
+}
+
+/*
+ * Return TRUE if the argument contains nothing but an `unknown'
+ * part.
+ */
+int is_just_unknown(expr * vect)
+{
+    while (vect->type && !vect->value)
+        vect++;
+    return (vect->type == EXPR_UNKNOWN);
+}
+
+/*
+ * Return the scalar part of a relocatable vector. (Including
+ * simple scalar vectors - those qualify as relocatable.)
+ */
+int64_t reloc_value(expr * vect)
+{
+    while (vect->type && !vect->value)
+        vect++;
+    if (!vect->type)
+        return 0;
+    if (vect->type == EXPR_SIMPLE)
+        return vect->value;
+    else
+        return 0;
+}
+
+/*
+ * Return the segment number of a relocatable vector, or NO_SEG for
+ * simple scalars.
+ */
+int32_t reloc_seg(expr * vect)
+{
+    while (vect->type && (vect->type == EXPR_WRT || !vect->value))
+        vect++;
+    if (vect->type == EXPR_SIMPLE) {
+        do {
+            vect++;
+        } while (vect->type && (vect->type == EXPR_WRT || !vect->value));
+    }
+    if (!vect->type)
+        return NO_SEG;
+    else
+        return vect->type - EXPR_SEGBASE;
+}
+
+/*
+ * Return the WRT segment number of a relocatable vector, or NO_SEG
+ * if no WRT part is present.
+ */
+int32_t reloc_wrt(expr * vect)
+{
+    while (vect->type && vect->type < EXPR_WRT)
+        vect++;
+    if (vect->type == EXPR_WRT) {
+        return vect->value;
+    } else
+        return NO_SEG;
+}
+
diff --git a/nasm.h b/nasm.h
index f4afad36..a7c26c88 100644
--- a/nasm.h
+++ b/nasm.h
@@ -15,6 +15,7 @@
 #include <inttypes.h>
 #include "version.h"            /* generated NASM version macros */
 #include "compiler.h"
+#include "nasmlib.h"
 #include "insnsi.h"		/* For enum opcode */
 
 #ifndef NULL
@@ -60,48 +61,6 @@
  */
 struct ofmt;
 
-/*
- * -------------------------
- * Error reporting functions
- * -------------------------
- */
-
-/*
- * An error reporting function should look like this.
- */
-typedef void (*efunc) (int severity, const char *fmt, ...);
-
-/*
- * These are the error severity codes which get passed as the first
- * argument to an efunc.
- */
-
-#define ERR_DEBUG  	0x00000008      /* put out debugging message */
-#define ERR_WARNING	0x00000000      /* warn only: no further action */
-#define ERR_NONFATAL	0x00000001      /* terminate assembly after phase */
-#define ERR_FATAL	0x00000002      /* instantly fatal: exit with error */
-#define ERR_PANIC	0x00000003      /* internal error: panic instantly
-                                         * and dump core for reference */
-#define ERR_MASK	0x0000000F      /* mask off the above codes */
-#define ERR_NOFILE	0x00000010      /* don't give source file name/line */
-#define ERR_USAGE	0x00000020      /* print a usage message */
-#define ERR_PASS1	0x00000040      /* only print this error on pass one */
-
-/*
- * These codes define specific types of suppressible warning.
- */
-
-#define ERR_WARN_MASK	0x0000FF00      /* the mask for this feature */
-#define ERR_WARN_SHR  8         /* how far to shift right */
-
-#define ERR_WARN_MNP	0x00000100      /* macro-num-parameters warning */
-#define ERR_WARN_MSR	0x00000200      /* macro self-reference */
-#define ERR_WARN_OL	0x00000300      /* orphan label (no colon, and
-                                         * alone on line) */
-#define ERR_WARN_NOV	0x00000400      /* numeric overflow */
-#define ERR_WARN_GNUELF	0x00000500      /* using GNU ELF extensions */
-#define ERR_WARN_MAX	5       /* the highest numbered one */
-
 /*
  * -----------------------
  * Other function typedefs
@@ -242,6 +201,18 @@ typedef struct {
     int64_t value;                 /* must be >= 32 bits */
 } expr;
 
+/*
+ * Library routines to manipulate expression data types.
+ */
+int is_reloc(expr *);
+int is_simple(expr *);
+int is_really_simple(expr *);
+int is_unknown(expr *);
+int is_just_unknown(expr *);
+int64_t reloc_value(expr *);
+int32_t reloc_seg(expr *);
+int32_t reloc_wrt(expr *);
+
 /*
  * The evaluator can also return hints about which of two registers
  * used in an expression should be the base register. See also the
diff --git a/nasmlib.c b/nasmlib.c
index b96fe126..59971c9d 100644
--- a/nasmlib.c
+++ b/nasmlib.c
@@ -702,152 +702,6 @@ const char *prefix_name(int token)
     return prefix_names[prefix];
 }
 
-/*
- * Return TRUE if the argument is a simple scalar. (Or a far-
- * absolute, which counts.)
- */
-int is_simple(expr * vect)
-{
-    while (vect->type && !vect->value)
-        vect++;
-    if (!vect->type)
-        return 1;
-    if (vect->type != EXPR_SIMPLE)
-        return 0;
-    do {
-        vect++;
-    } while (vect->type && !vect->value);
-    if (vect->type && vect->type < EXPR_SEGBASE + SEG_ABS)
-        return 0;
-    return 1;
-}
-
-/*
- * Return TRUE if the argument is a simple scalar, _NOT_ a far-
- * absolute.
- */
-int is_really_simple(expr * vect)
-{
-    while (vect->type && !vect->value)
-        vect++;
-    if (!vect->type)
-        return 1;
-    if (vect->type != EXPR_SIMPLE)
-        return 0;
-    do {
-        vect++;
-    } while (vect->type && !vect->value);
-    if (vect->type)
-        return 0;
-    return 1;
-}
-
-/*
- * Return TRUE if the argument is relocatable (i.e. a simple
- * scalar, plus at most one segment-base, plus possibly a WRT).
- */
-int is_reloc(expr * vect)
-{
-    while (vect->type && !vect->value)  /* skip initial value-0 terms */
-        vect++;
-    if (!vect->type)            /* trivially return TRUE if nothing */
-        return 1;               /* is present apart from value-0s */
-    if (vect->type < EXPR_SIMPLE)       /* FALSE if a register is present */
-        return 0;
-    if (vect->type == EXPR_SIMPLE) {    /* skip over a pure number term... */
-        do {
-            vect++;
-        } while (vect->type && !vect->value);
-        if (!vect->type)        /* ...returning TRUE if that's all */
-            return 1;
-    }
-    if (vect->type == EXPR_WRT) {       /* skip over a WRT term... */
-        do {
-            vect++;
-        } while (vect->type && !vect->value);
-        if (!vect->type)        /* ...returning TRUE if that's all */
-            return 1;
-    }
-    if (vect->value != 0 && vect->value != 1)
-        return 0;               /* segment base multiplier non-unity */
-    do {                        /* skip over _one_ seg-base term... */
-        vect++;
-    } while (vect->type && !vect->value);
-    if (!vect->type)            /* ...returning TRUE if that's all */
-        return 1;
-    return 0;                   /* And return FALSE if there's more */
-}
-
-/*
- * Return TRUE if the argument contains an `unknown' part.
- */
-int is_unknown(expr * vect)
-{
-    while (vect->type && vect->type < EXPR_UNKNOWN)
-        vect++;
-    return (vect->type == EXPR_UNKNOWN);
-}
-
-/*
- * Return TRUE if the argument contains nothing but an `unknown'
- * part.
- */
-int is_just_unknown(expr * vect)
-{
-    while (vect->type && !vect->value)
-        vect++;
-    return (vect->type == EXPR_UNKNOWN);
-}
-
-/*
- * Return the scalar part of a relocatable vector. (Including
- * simple scalar vectors - those qualify as relocatable.)
- */
-int64_t reloc_value(expr * vect)
-{
-    while (vect->type && !vect->value)
-        vect++;
-    if (!vect->type)
-        return 0;
-    if (vect->type == EXPR_SIMPLE)
-        return vect->value;
-    else
-        return 0;
-}
-
-/*
- * Return the segment number of a relocatable vector, or NO_SEG for
- * simple scalars.
- */
-int32_t reloc_seg(expr * vect)
-{
-    while (vect->type && (vect->type == EXPR_WRT || !vect->value))
-        vect++;
-    if (vect->type == EXPR_SIMPLE) {
-        do {
-            vect++;
-        } while (vect->type && (vect->type == EXPR_WRT || !vect->value));
-    }
-    if (!vect->type)
-        return NO_SEG;
-    else
-        return vect->type - EXPR_SEGBASE;
-}
-
-/*
- * Return the WRT segment number of a relocatable vector, or NO_SEG
- * if no WRT part is present.
- */
-int32_t reloc_wrt(expr * vect)
-{
-    while (vect->type && vect->type < EXPR_WRT)
-        vect++;
-    if (vect->type == EXPR_WRT) {
-        return vect->value;
-    } else
-        return NO_SEG;
-}
-
 /*
  * Binary search.
  */
diff --git a/nasmlib.h b/nasmlib.h
index 82a35e30..43342096 100644
--- a/nasmlib.h
+++ b/nasmlib.h
@@ -22,6 +22,48 @@
  */
 /* #define LOGALLOC */
 
+/*
+ * -------------------------
+ * Error reporting functions
+ * -------------------------
+ */
+
+/*
+ * An error reporting function should look like this.
+ */
+typedef void (*efunc) (int severity, const char *fmt, ...);
+
+/*
+ * These are the error severity codes which get passed as the first
+ * argument to an efunc.
+ */
+
+#define ERR_DEBUG  	0x00000008      /* put out debugging message */
+#define ERR_WARNING	0x00000000      /* warn only: no further action */
+#define ERR_NONFATAL	0x00000001      /* terminate assembly after phase */
+#define ERR_FATAL	0x00000002      /* instantly fatal: exit with error */
+#define ERR_PANIC	0x00000003      /* internal error: panic instantly
+                                         * and dump core for reference */
+#define ERR_MASK	0x0000000F      /* mask off the above codes */
+#define ERR_NOFILE	0x00000010      /* don't give source file name/line */
+#define ERR_USAGE	0x00000020      /* print a usage message */
+#define ERR_PASS1	0x00000040      /* only print this error on pass one */
+
+/*
+ * These codes define specific types of suppressible warning.
+ */
+
+#define ERR_WARN_MASK	0x0000FF00      /* the mask for this feature */
+#define ERR_WARN_SHR  8         /* how far to shift right */
+
+#define ERR_WARN_MNP	0x00000100      /* macro-num-parameters warning */
+#define ERR_WARN_MSR	0x00000200      /* macro self-reference */
+#define ERR_WARN_OL	0x00000300      /* orphan label (no colon, and
+                                         * alone on line) */
+#define ERR_WARN_NOV	0x00000400      /* numeric overflow */
+#define ERR_WARN_GNUELF	0x00000500      /* using GNU ELF extensions */
+#define ERR_WARN_MAX	5       /* the highest numbered one */
+
 /*
  * Wrappers around malloc, realloc and free. nasm_malloc will
  * fatal-error and die rather than return NULL; nasm_realloc will
@@ -29,7 +71,6 @@
  * passed a NULL pointer; nasm_free will do nothing if it is passed
  * a NULL pointer.
  */
-#ifdef NASM_NASM_H              /* need efunc defined for this */
 void nasm_set_malloc_error(efunc);
 #ifndef LOGALLOC
 void *nasm_malloc(size_t);
@@ -49,7 +90,6 @@ char *nasm_strndup_log(char *, int, char *, size_t);
 #define nasm_strdup(x) nasm_strdup_log(__FILE__,__LINE__,x)
 #define nasm_strndup(x,y) nasm_strndup_log(__FILE__,__LINE__,x,y)
 #endif
-#endif
 
 /*
  * ANSI doesn't guarantee the presence of `stricmp' or
@@ -234,20 +274,6 @@ void saa_fread(struct SAA *s, int32_t posn, void *p, int32_t len);    /* fixup *
 void saa_fwrite(struct SAA *s, int32_t posn, void *p, int32_t len);   /* fixup */
 void saa_fpwrite(struct SAA *, FILE *);
 
-#ifdef NASM_NASM_H
-/*
- * Library routines to manipulate expression data types.
- */
-int is_reloc(expr *);
-int is_simple(expr *);
-int is_really_simple(expr *);
-int is_unknown(expr *);
-int is_just_unknown(expr *);
-int64_t reloc_value(expr *);
-int32_t reloc_seg(expr *);
-int32_t reloc_wrt(expr *);
-#endif
-
 /*
  * Binary search routine. Returns index into `array' of an entry
  * matching `string', or <0 if no match. `array' is taken to
diff --git a/ndisasm.c b/ndisasm.c
index f9793838..ea4dc537 100644
--- a/ndisasm.c
+++ b/ndisasm.c
@@ -7,6 +7,7 @@
  */
 
 #include <stdio.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
@@ -36,6 +37,17 @@ static const char *help =
 static void output_ins(uint32_t, uint8_t *, int, char *);
 static void skip(uint32_t dist, FILE * fp);
 
+/* Handler passed to nasm_set_malloc_error(): print to stderr, exit(1) if fatal. */
+static void ndisasm_error(int severity, const char *fmt, ...)
+{
+    va_list va;
+    va_start(va, fmt);
+    vfprintf(stderr, fmt, va);
+    va_end(va);                 /* every va_start() needs a matching va_end() */
+    if (severity & ERR_FATAL)   /* bit 0x2 is set in ERR_FATAL and ERR_PANIC */
+	exit(1);
+}
+
 int main(int argc, char **argv)
 {
     char buffer[INSN_MAX * 2], *p, *ep, *q;
@@ -53,6 +65,8 @@ int main(int argc, char **argv)
     int32_t offset;
     FILE *fp;
 
+    nasm_set_malloc_error(ndisasm_error);
+
     offset = 0;
     init_sync();
 

From 8d024e7965efb208b0831ee7289329f85cf4433f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 19 Sep 2007 21:41:02 -0700
Subject: [PATCH 28/29] Remove limit on number of sync points

Make it possible for ndisasm to allocate more memory for sync points
as needed.
---
 sync.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/sync.c b/sync.c
index 88d882a2..562c59d6 100644
--- a/sync.c
+++ b/sync.c
@@ -11,9 +11,10 @@
 #include <limits.h>
 #include <inttypes.h>
 
+#include "nasmlib.h"
 #include "sync.h"
 
-#define SYNC_MAX 4096           /* max # of sync points */
+#define SYNC_MAX 4096           /* max # of sync points (initial) */
 
 /*
  * This lot manages the current set of sync points by means of a
@@ -24,29 +25,12 @@ static struct Sync {
     uint32_t pos;
     uint32_t length;
 } *synx;
-static int nsynx;
+static int max_synx, nsynx;
 
 void init_sync(void)
 {
-    /*
-     * I'd like to allocate an array of size SYNC_MAX, then write
-     * `synx--' which would allow numbering the array from one
-     * instead of zero without wasting memory. Sadly I don't trust
-     * this to work in 16-bit Large model, so it's staying the way
-     * it is. Btw, we don't care about freeing this array, since it
-     * has to last for the duration of the program and will then be
-     * auto-freed on exit. And I'm lazy ;-)
-     * 
-     * Speaking of 16-bit Large model, that's also the reason I'm
-     * not declaring this array statically - by doing it
-     * dynamically I avoid problems with the total size of DGROUP
-     * in Borland C.
-     */
-    synx = malloc((SYNC_MAX + 1) * sizeof(*synx));
-    if (!synx) {
-        fprintf(stderr, "ndisasm: not enough memory for sync array\n");
-        exit(1);
-    }
+    max_synx = SYNC_MAX-1;      /* highest usable index; add_sync() fills from synx[1] */
+    synx = nasm_malloc(SYNC_MAX * sizeof(*synx)); /* dies on failure, never NULL */
     nsynx = 0;
 }
 
@@ -54,8 +38,10 @@ void add_sync(uint32_t pos, uint32_t length)
 {
     int i;
 
-    if (nsynx == SYNC_MAX)
-        return;                 /* can't do anything - overflow */
+    if (nsynx >= max_synx) {
+	max_synx = (max_synx << 1)+1;
+	synx = nasm_realloc(synx, (max_synx+1) * sizeof(*synx));
+    }
 
     nsynx++;
     synx[nsynx].pos = pos;

From d9a979559e76028f671891483134251656793d0c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 19 Sep 2007 21:41:27 -0700
Subject: [PATCH 29/29] Update manual pages

Update manual pages to include 64-bit support, and remove section
about sync point limits in ndisasm.
---
 nasm.1    | 10 ++++++----
 ndisasm.1 |  7 +------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/nasm.1 b/nasm.1
index e3284406..7b5d2929 100644
--- a/nasm.1
+++ b/nasm.1
@@ -192,9 +192,10 @@ is reserved using the
 .IR RESB ,
 .IR RESW ,
 .IR RESD ,
-.I RESQ
-and
+.IR RESQ ,
 .I REST
+and
+.I RESO
 pseudo-opcodes, each taking one parameter which gives the number of
-bytes, words, doublewords, quadwords or ten-byte words to reserve.
+bytes, words, doublewords, quadwords, ten-byte words or octowords to reserve.
 .PP
@@ -297,9 +298,10 @@ finished doing absolute assembly, you must issue another
 .I SECTION
 directive to return to normal assembly.
 .PP
-.I BITS 16
-or
+.IR "BITS 16" ,
 .I BITS 32
+or
+.I BITS 64
 switches the default processor mode for which
 .B nasm
 is generating code: it is equivalent to
diff --git a/ndisasm.1 b/ndisasm.1
index d48a1827..622500f9 100644
--- a/ndisasm.1
+++ b/ndisasm.1
@@ -88,7 +88,7 @@ means of examining the target addresses of the relative jumps and
 calls it disassembles.
 .TP
 .BI \-b " bits"
-Specifies either 16-bit or 32-bit mode. The default is 16-bit mode.
+Specifies 16-, 32- or 64-bit mode. The default is 16-bit mode.
 .TP
 .B \-u
 Specifies 32-bit mode, more compactly than using `-b 32'.
@@ -125,10 +125,5 @@ or calls result from disassembling non-machine-code data, sync
 markers may get placed in strange places. Feel free to turn
 auto-sync off and go back to doing it manually if necessary.
 .PP
-.B ndisasm
-can only keep track of 8192 sync markers internally at once: this is
-to do with portability, since DOS machines don't take kindly to more
-than 64K being allocated at a time.
-.PP
 .SH SEE ALSO
 .BR objdump "(" 1 ")."