preprocessor: major cleanups; inline text into Token

Major cleanups of the preprocessor. In particular, the
block-allocation of Token is pretty ridiculous since nearly every
token requires a text allocation anyway. Change the definition of
Token so that only very long tokens (48+ characters on 64-bit systems)
need to be stored out of line.

If malloc() preserves alignment (XXX: glibc doesn't), each Token will
fit in a single cache line.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
This commit is contained in:
H. Peter Anvin 2019-09-23 16:40:03 -07:00
parent f7dbdb2e13
commit 8571f06061
6 changed files with 1065 additions and 706 deletions

File diff suppressed because it is too large Load Diff

View File

@ -83,7 +83,7 @@ void stdscan_cleanup(void)
nasm_free(stdscan_tempstorage);
}
static char *stdscan_copy(char *p, int len)
static char *stdscan_copy(const char *p, int len)
{
char *text;
@ -124,7 +124,7 @@ static int stdscan_handle_brace(struct tokenval *tv)
int stdscan(void *private_data, struct tokenval *tv)
{
char ourcopy[MAX_KEYWORD + 1], *r, *s;
const char *r;
(void)private_data; /* Don't warn that this parameter is unused */
@ -156,13 +156,7 @@ int stdscan(void *private_data, struct tokenval *tv)
if (is_sym || stdscan_bufptr - r > MAX_KEYWORD)
return tv->t_type = TOKEN_ID; /* bypass all other checks */
for (s = tv->t_charptr, r = ourcopy; *s; s++)
*r++ = nasm_tolower(*s);
*r = '\0';
/* right, so we have an identifier sitting in temp storage. now,
* is it actually a register or instruction name, or what? */
token_type = nasm_token_hash(ourcopy, tv);
token_type = nasm_token_hash(tv->t_charptr, tv);
if (unlikely(tv->t_flag & TFLAG_WARN)) {
/*!
*!ptr [on] non-NASM keyword used in other assemblers
@ -293,14 +287,8 @@ int stdscan(void *private_data, struct tokenval *tv)
stdscan_bufptr++; /* skip closing brace */
for (s = tv->t_charptr, r = ourcopy; *s; s++)
*r++ = nasm_tolower(*s);
*r = '\0';
/* right, so we have a decorator sitting in temp storage. */
nasm_token_hash(ourcopy, tv);
/* handle tokens inside braces */
nasm_token_hash(tv->t_charptr, tv);
return stdscan_handle_brace(tv);
} else if (*stdscan_bufptr == ';') {
/* a comment has happened - stay */
@ -332,8 +320,13 @@ int stdscan(void *private_data, struct tokenval *tv)
stdscan_bufptr += 2;
return tv->t_type = TOKEN_NE;
} else if (stdscan_bufptr[0] == '<' && stdscan_bufptr[1] == '=') {
if (stdscan_bufptr[2] == '>') {
stdscan_bufptr += 3;
return tv->t_type = TOKEN_LEG;
} else {
stdscan_bufptr += 2;
return tv->t_type = TOKEN_LE;
}
} else if (stdscan_bufptr[0] == '>' && stdscan_bufptr[1] == '=') {
stdscan_bufptr += 2;
return tv->t_type = TOKEN_GE;

View File

@ -149,3 +149,45 @@ z
evex
vex3
vex2
# Multi-character operators. Used in ppscan().
% TOKEN_SHR, 0, 0, 0
>>
% TOKEN_SAR, 0, 0, 0
>>>
% TOKEN_SHL, 0, 0, 0
<<
<<<
% TOKEN_SDIV, 0, 0, 0
//
% TOKEN_SMOD, 0, 0, 0
%%
% TOKEN_EQ, 0, 0, 0
==
% TOKEN_NE, 0, 0, 0
!=
<>
% TOKEN_LE, 0, 0, 0
<=
% TOKEN_GE, 0, 0, 0
>=
% TOKEN_LEG, 0, 0, 0
<=>
% TOKEN_DBL_AND, 0, 0, 0
&&
% TOKEN_DBL_OR, 0, 0, 0
||
% TOKEN_DBL_XOR, 0, 0, 0
^^

View File

@ -129,9 +129,10 @@ close(RD);
#
open(TD, '<', $tokens_dat) or die "$0: cannot open $tokens_dat: $!\n";
while (defined($line = <TD>)) {
$line =~ s/\s*(|\#.*)$//;
if ($line =~ /^\%\s+(.*)$/) {
$pattern = $1;
} elsif ($line =~ /^([\?\@\.a-z0-9_-]+)/) {
} elsif ($line =~ /^(\S+)/) {
$token = $1;
if (defined($tokens{$token})) {
@ -257,19 +258,23 @@ if ($output eq 'h') {
print " };\n";
print " uint32_t k1, k2;\n";
print " uint64_t crc;\n";
print " size_t len;\n";
# For correct overflow behavior, "ix" should be unsigned of the same
# width as the hash arrays.
print " uint16_t ix;\n";
print " const struct tokendata *data;\n";
printf " char lcbuf[%d];\n", $max_len+1;
print " const char *p = token;\n";
print " char c, *q = lcbuf;\n";
print " size_t len = 0;\n";
printf " uint64_t crc = UINT64_C(0x%08x%08x);\n", $$sv[0], $$sv[1];
print "\n";
print " len = strlen(token);\n";
print " if (unlikely(len > $max_len))\n";
print " while ((c = *p++)) {\n";
printf " if (++len > %d)\n", $max_len;
print " goto notfound;\n";
print " *q++ = c = nasm_tolower(c);\n";
print " crc = crc64_byte(crc, c);\n";
print " };\n";
print "\n";
printf " crc = crc64b(UINT64_C(0x%08x%08x), token, len);\n",
$$sv[0], $$sv[1];
print " k1 = (uint32_t)crc;\n";
print " k2 = (uint32_t)(crc >> 32);\n";
print "\n";
@ -278,7 +283,9 @@ if ($output eq 'h') {
print " goto notfound;\n";
print "\n";
print " data = &tokendata[ix];\n";
print " if (data->len != len || memcmp(data->string, token, len))\n";
print " if (data->len != len)\n";
print " goto notfound;\n";
print " if (memcmp(data->string, lcbuf, len))\n";
print " goto notfound;\n";
print "\n";
print " tv->t_integer = data->num;\n";

View File

@ -73,6 +73,12 @@ uint64_t crc64b(uint64_t crc, const void *data, size_t len);
uint64_t crc64ib(uint64_t crc, const void *data, size_t len);
#define CRC64_INIT UINT64_C(0xffffffffffffffff)
/*
 * Fold a single byte into a running CRC-64 value.
 *
 * The low 8 bits of (v ^ crc) select an entry of crc64_tab, which is
 * then XORed with the running value shifted right by one byte.
 *
 * crc: the CRC accumulated so far
 * v:   the next input byte
 *
 * Returns the updated CRC.
 */
static inline uint64_t crc64_byte(uint64_t crc, uint8_t v)
{
    /* The lookup table is only declared here; its definition lives elsewhere. */
    extern const uint64_t crc64_tab[256];
    const uint8_t slot = (uint8_t)(v ^ crc);
    const uint64_t remainder = crc >> 8;

    return crc64_tab[slot] ^ remainder;
}
void **hash_find(struct hash_table *head, const char *string,
struct hash_insert *insert);
void **hash_findb(struct hash_table *head, const void *key, size_t keylen,

View File

@ -35,7 +35,7 @@
#include "nctype.h"
#include "hashtbl.h"
static const uint64_t crc64_tab[256] = {
const uint64_t crc64_tab[256] = {
UINT64_C(0x0000000000000000), UINT64_C(0x7ad870c830358979),
UINT64_C(0xf5b0e190606b12f2), UINT64_C(0x8f689158505e9b8b),
UINT64_C(0xc038e5739841b68f), UINT64_C(0xbae095bba8743ff6),
@ -170,9 +170,8 @@ uint64_t crc64(uint64_t crc, const char *str)
{
uint8_t c;
while ((c = *str++) != 0) {
crc = crc64_tab[(uint8_t)crc ^ c] ^ (crc >> 8);
}
while ((c = *str++) != 0)
crc = crc64_byte(crc, c);
return crc;
}
@ -181,9 +180,8 @@ uint64_t crc64i(uint64_t crc, const char *str)
{
uint8_t c;
while ((c = *str++) != 0) {
crc = crc64_tab[(uint8_t)crc ^ nasm_tolower(c)] ^ (crc >> 8);
}
while ((c = *str++) != 0)
crc = crc64_byte(crc, nasm_tolower(c));
return crc;
}
@ -192,9 +190,8 @@ uint64_t crc64b(uint64_t crc, const void *data, size_t len)
{
const uint8_t *str = data;
while (len--) {
crc = crc64_tab[(uint8_t)crc ^ *str++] ^ (crc >> 8);
}
while (len--)
crc = crc64_byte(crc, *str++);
return crc;
}
@ -203,9 +200,8 @@ uint64_t crc64ib(uint64_t crc, const void *data, size_t len)
{
const uint8_t *str = data;
while (len--) {
crc = crc64_tab[(uint8_t)crc ^ nasm_tolower(*str++)] ^ (crc >> 8);
}
while (len--)
crc = crc64_byte(crc, nasm_tolower(*str++));
return crc;
}