nasm/quote.c
H. Peter Anvin 88c9e1f88c Fix memory management issues with expanded %include
Ownership of the filename string was a bit fuzzy, with the result that
we were freeing it even though it was retained for use by __FILE__.
Clean up a number of other memory management issues with the new
quoting code, and change the stdscan implementation to one pass over
the string.
2008-06-04 11:26:59 -07:00

439 lines
7.9 KiB
C

/* quote.c library routines for the Netwide Assembler
*
* The Netwide Assembler is copyright (C) 1996 Simon Tatham and
* Julian Hall. All rights reserved. The software is
* redistributable under the license given in the file "LICENSE"
* distributed in the NASM archive.
*/
#include "compiler.h"
#include <assert.h>
#include <stdlib.h>
#include "nasmlib.h"
#include "quote.h"
/*
 * Numeric value of a digit character: '0'..'9' map to 0..9, and
 * 'A'..'F' / 'a'..'f' map to 10..15.  No validation is performed, so
 * the caller must already know that c is a digit in the radix of
 * interest.  The argument is evaluated more than once; do not pass an
 * expression with side effects.
 */
#define numvalue(c)                         \
    ((c) < 'A' ? (c) - '0' :                \
     (c) < 'a' ? (c) - 'A' + 10 :           \
                 (c) - 'a' + 10)
/*
 * Build a quoted copy of the len bytes starting at str.
 *
 * If the data contains only printable ASCII and no single quote, it is
 * wrapped verbatim in '...'; failing that, if it contains no double
 * quote, in "...".  Otherwise it is emitted as a `...` string in which
 * backquote and backslash are backslash-escaped, the common control
 * characters use their letter escapes (\a \b \t \n \v \f \r \e) and
 * every other non-printable byte is written as an octal escape.
 *
 * The input may contain embedded NUL bytes; only len determines its
 * extent.  The result is NUL-terminated and is obtained from
 * nasm_malloc(); this function never frees it.
 */
char *nasm_quote(char *str, size_t len)
{
    /*
     * Bytes are examined as unsigned char.  With a plain (possibly
     * signed) char a byte >= 0x80 is negative, which made the sizing
     * pass below count it as a 2-character escape while the output pass
     * emitted a 4-character "\ooo" escape, overrunning the buffer.
     */
    unsigned char c, c1;
    char *p, *q, *nstr, *ep;
    bool sq_ok, dq_ok;
    size_t qlen;

    sq_ok = dq_ok = true;
    ep = str+len;
    qlen = 0;                   /* Length if we need `...` quotes */

    /* Pass 1: decide which quote style works and size the `...` form */
    for (p = str; p < ep; p++) {
        c = *p;
        switch (c) {
        case '\'':
            sq_ok = false;
            qlen++;
            break;
        case '\"':
            dq_ok = false;
            qlen++;
            break;
        case '`':
        case '\\':
            qlen += 2;
            break;
        default:
            if (c < ' ' || c > '~') {
                /* Non-printable data can only be expressed in `...` */
                sq_ok = dq_ok = false;
                switch (c) {
                case '\a':
                case '\b':
                case '\t':
                case '\n':
                case '\v':
                case '\f':
                case '\r':
                case 27:
                    qlen += 2;
                    break;
                default:
                    /*
                     * Octal escape.  If the next input byte is an octal
                     * digit the escape must be padded to three digits,
                     * otherwise that digit would be read back as part
                     * of the escape.
                     */
                    c1 = (p+1 < ep) ? p[1] : 0;
                    if (c > 077 || (c1 >= '0' && c1 <= '7'))
                        qlen += 4;      /* Must use the full form */
                    else if (c > 07)
                        qlen += 3;
                    else
                        qlen += 2;
                    break;
                }
            } else {
                qlen++;
            }
            break;
        }
    }

    if (sq_ok || dq_ok) {
        /* Use '...' or "..." : two quotes plus the terminating NUL */
        nstr = nasm_malloc(len+3);
        nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
        nstr[len+2] = '\0';
        memcpy(nstr+1, str, len);
    } else {
        /* Need to use `...` quoted syntax */
        nstr = nasm_malloc(qlen+3);
        q = nstr;
        *q++ = '`';

        /* Pass 2: emit; every case must match the size counted above */
        for (p = str; p < ep; p++) {
            c = *p;
            switch (c) {
            case '`':
            case '\\':
                *q++ = '\\';
                *q++ = c;
                break;
            case '\a':
                *q++ = '\\';
                *q++ = 'a';
                break;
            case '\b':
                *q++ = '\\';
                *q++ = 'b';
                break;
            case '\t':
                *q++ = '\\';
                *q++ = 't';
                break;
            case '\n':
                *q++ = '\\';
                *q++ = 'n';
                break;
            case '\v':
                *q++ = '\\';
                *q++ = 'v';
                break;
            case '\f':
                *q++ = '\\';
                *q++ = 'f';
                break;
            case '\r':
                *q++ = '\\';
                *q++ = 'r';
                break;
            case 27:
                *q++ = '\\';
                *q++ = 'e';
                break;
            default:
                if (c < ' ' || c > '~') {
                    c1 = (p+1 < ep) ? p[1] : 0;
                    if (c1 >= '0' && c1 <= '7')
                        q += sprintf(q, "\\%03o", (unsigned int)c);
                    else
                        q += sprintf(q, "\\%o", (unsigned int)c);
                } else {
                    *q++ = c;
                }
                break;
            }
        }
        *q++ = '`';
        *q++ = '\0';
        /* Guards against the two passes ever disagreeing on the size */
        assert((size_t)(q-nstr) == qlen+3);
    }
    return nstr;
}
/*
 * Store the UTF-8 style encoding of the code value v at q and return
 * the pointer just past the last byte written.
 *
 * Values up to 0x7f take one byte; larger values take two to six bytes
 * (the original, pre-RFC 3629 layout that covers the full 31-bit
 * range).  A negative v writes nothing and returns q unchanged.  The
 * caller must provide room for up to six bytes.
 */
static char *emit_utf8(char *q, int32_t v)
{
    int trail;                  /* Number of continuation bytes */
    int lead;                   /* Marker bits of the leading byte */

    if (v < 0)
        return q;               /* Impossible - do nothing */

    if (v <= 0x7f) {
        *q++ = v;               /* Plain ASCII, single byte */
        return q;
    }

    if (v <= 0x000007ff) {
        trail = 1;
        lead = 0xc0;
    } else if (v <= 0x0000ffff) {
        trail = 2;
        lead = 0xe0;
    } else if (v <= 0x001fffff) {
        trail = 3;
        lead = 0xf0;
    } else if (v <= 0x03ffffff) {
        trail = 4;
        lead = 0xf8;
    } else {
        trail = 5;
        lead = 0xfc;
    }

    /* Leading byte carries the bits above the continuation groups */
    *q++ = lead | (v >> (6 * trail));

    /* Continuation bytes, six bits each, most significant group first */
    while (trail-- > 0)
        *q++ = 0x80 | ((v >> (6 * trail)) & 63);

    return q;
}
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls, so the
 * length, not a terminator, defines its extent.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * The first character of str selects the syntax:
 *   '...' or "..."  contents are copied verbatim up to the matching
 *                   quote (or the end of the string) and the result is
 *                   null-terminated;
 *   `...`           backslash escapes are interpreted: \a \b \e \f \n
 *                   \r \t \v, up to three octal digits, \x with up to
 *                   two hex digits, \u / \U with up to four / eight hex
 *                   digits (stored via emit_utf8()); any other escaped
 *                   character stands for itself.  No null terminator
 *                   is written after the result on this path;
 *   anything else   the string is left untouched and its length is
 *                   returned.
 *
 * If ep is non-NULL, *ep is set to point to the final quote, or to the
 * null if improperly quoted (or if str is not a quoted string at all.)
 */
size_t nasm_unquote(char *str, char **ep)
{
    char bq;
    char *p, *q;
    char *escp = NULL;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs,
    } state;
    int ndig = 0;
    int32_t nval = 0;

    p = q = str;                /* p reads, q writes; q never passes p */
    bq = *p++;
    if (!bq) {
        /* Empty string: still give the caller a defined end pointer */
        if (ep)
            *ep = str;
        return 0;
    }

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string */
        while ((c = *p) && c != bq) {
            p++;
            *q++ = c;
        }
        *q = '\0';
        break;

    case '`':
        /* `...` string */
        state = st_start;

        while ((c = *p)) {
            p++;
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                    p--;        /* Leave p on the closing quote */
                    goto out;
                default:
                    *q++ = c;
                    break;
                }
                break;

            case st_backslash:
                state = st_start;
                escp = p;       /* Beginning of argument sequence */
                nval = 0;
                switch (c) {
                case 'a':
                    *q++ = 7;
                    break;
                case 'b':
                    *q++ = 8;
                    break;
                case 'e':
                    *q++ = 27;
                    break;
                case 'f':
                    *q++ = 12;
                    break;
                case 'n':
                    *q++ = 10;
                    break;
                case 'r':
                    *q++ = 13;
                    break;
                case 't':
                    *q++ = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'v':
                    *q++ = 11;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 2;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 2;   /* Up to two more digits */
                    nval = c - '0';
                    break;
                default:
                    /* Unknown escape: the character stands for itself */
                    *q++ = c;
                    break;
                }
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (!--ndig) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    *q++ = nval;
                    state = st_start;
                }
                break;

            case st_hex:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    /*
                     * Emit only once the digit budget is exhausted.
                     * (The test used to be inverted, which ended the
                     * escape after its first digit.)
                     */
                    if (!--ndig) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    /* With no digits at all, \x is just the letter */
                    *q++ = (p > escp) ? nval : escp[-1];
                    state = st_start;
                }
                break;

            case st_ucs:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    /*
                     * Shift as unsigned: eight digits can reach bit 31
                     * and a signed left shift into the sign bit is
                     * undefined behaviour.
                     */
                    nval = (int32_t)(((uint32_t)nval << 4) +
                                     (uint32_t)numvalue(c));
                    if (!--ndig) {
                        q = emit_utf8(q, nval);
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    if (p > escp)
                        q = emit_utf8(q, nval);
                    else
                        *q++ = escp[-1];
                    state = st_start;
                }
                break;
            }
        }

        /* Input ended without a closing quote: flush a pending escape */
        switch (state) {
        case st_start:
        case st_backslash:
            break;
        case st_oct:
            *q++ = nval;
            break;
        case st_hex:
            *q++ = (p > escp) ? nval : escp[-1];
            break;
        case st_ucs:
            if (p > escp)
                q = emit_utf8(q, nval);
            else
                *q++ = escp[-1];
            break;
        }
    out:
        break;

    default:
        /* Not a quoted string, just return the input... */
        p = q = strchr(str, '\0');
        break;
    }

    if (ep)
        *ep = p;
    return q-str;
}
/*
 * Find the end of a quoted string; returns the pointer to the terminating
 * character (either the ending quote or the null character, if unterminated.)
 *
 * The quote style is taken from str[0]: ' and " strings end at the next
 * identical quote, ` strings end at the next backquote that is not
 * preceded by a backslash.  If str does not begin with a quote
 * character, str itself is returned.  The string is not modified.
 */
char *nasm_skip_string(char *str)
{
    char bq;
    char *p;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
    } state;

    bq = str[0];
    if (bq == '\'' || bq == '\"') {
        /* '...' or "..." string */
        for (p = str+1; *p && *p != bq; p++)
            ;
        return p;
    } else if (bq == '`') {
        /* `...` string */
        p = str+1;
        state = st_start;

        while ((c = *p++)) {
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                    return p-1; /* Found the end */
                default:
                    break;
                }
                break;

            case st_backslash:
                /*
                 * Note: for the purpose of finding the end of the string,
                 * all successor states to st_backslash are functionally
                 * equivalent to st_start, since either a backslash or
                 * a backquote will force a return to the st_start state.
                 */
                state = st_start;
                break;
            }
        }
        /*
         * Unterminated string.  The loop's post-increment has already
         * stepped p past the null, so back up to point at the null
         * itself rather than one byte beyond the string.
         */
        return p-1;
    } else {
        return str;             /* Not a string... */
    }
}