quote: disallow control characters in C strings; concatendate; cleanups

In nasm_unquote_cstr(), disallow any control character, not just
NUL. This will matter when allowing quoting symbols.

Merge nasm_unquote() and nasm_unquote_cstr().

Strings can now be concatenated, C style: adjacent quoted strings
(including whitespace-separated) are merged into a single string.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
This commit is contained in:
H. Peter Anvin 2019-04-22 14:29:29 -07:00
parent 982186a1a3
commit bb42d30737
4 changed files with 159 additions and 101 deletions

View File

@ -461,22 +461,6 @@ static Token *delete_Token(Token * t);
#define tok_is_(x,v) (tok_type_((x), TOK_OTHER) && !strcmp((x)->text,(v)))
#define tok_isnt_(x,v) ((x) && ((x)->type!=TOK_OTHER || strcmp((x)->text,(v))))
/*
* nasm_unquote with error if the string contains NUL characters.
* If the string contains NUL characters, issue an error and return
* the C len, i.e. truncate at the NUL.
*/
static size_t nasm_unquote_cstr(char *qstr, enum preproc_token directive)
{
size_t len = nasm_unquote(qstr, NULL);
size_t clen = strlen(qstr);
if (len != clen)
nasm_nonfatal("NUL character in `%s' directive",
pp_directives[directive]);
return clen;
}
/*
* In-place reverse a list of tokens.
*/
@ -1780,7 +1764,7 @@ static bool if_condition(Token * tline, enum preproc_token ct)
if (tline->type == TOK_PREPROC_ID)
p += 2; /* Skip leading %! */
if (nasm_isquote(*p))
nasm_unquote_cstr(p, ct);
nasm_unquote_cstr(p, NULL);
if (getenv(p))
j = true;
tline = tline->next;
@ -2527,7 +2511,7 @@ static int do_directive(Token *tline, char **output)
nasm_warn(WARN_OTHER, "trailing garbage after `%%depend' ignored");
p = t->text;
if (t->type != TOK_INTERNAL_STRING)
nasm_unquote_cstr(p, i);
nasm_unquote_cstr(p, NULL);
strlist_add(deplist, p);
free_tlist(origline);
return DIRECTIVE_FOUND;
@ -2546,7 +2530,7 @@ static int do_directive(Token *tline, char **output)
nasm_warn(WARN_OTHER, "trailing garbage after `%%include' ignored");
p = t->text;
if (t->type != TOK_INTERNAL_STRING)
nasm_unquote_cstr(p, i);
nasm_unquote_cstr(p, NULL);
inc = nasm_malloc(sizeof(Include));
inc->next = istk;
inc->conds = NULL;
@ -2588,7 +2572,7 @@ static int do_directive(Token *tline, char **output)
if (tline->next)
nasm_warn(WARN_OTHER, "trailing garbage after `%%use' ignored");
if (tline->type == TOK_STRING)
nasm_unquote_cstr(tline->text, i);
nasm_unquote_cstr(tline->text, NULL);
use_pkg = nasm_stdmac_find_package(tline->text);
if (!use_pkg)
nasm_nonfatal("unknown `%%use' package: %s", tline->text);
@ -3240,7 +3224,7 @@ issue_error:
* are stored with the token stream reversed, so we have to
* reverse the output of tokenize().
*/
nasm_unquote_cstr(t->text, i);
nasm_unquote_cstr(t->text, NULL);
macro_start = reverse_tokens(tokenize(t->text));
/*

View File

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------- *
*
*
* Copyright 1996-2016 The NASM Authors - All Rights Reserved
* See the file AUTHORS included with the NASM distribution for
* the specific copyright holders.
@ -14,7 +14,7 @@
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ -40,6 +40,8 @@
#include "nasmlib.h"
#include "quote.h"
#include "nctype.h"
#include "error.h"
char *nasm_quote(const char *str, size_t len)
{
@ -180,39 +182,55 @@ char *nasm_quote(const char *str, size_t len)
return nstr;
}
static char *emit_utf8(char *q, int32_t v)
static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
{
if (v < 0) {
/* Impossible - do nothing */
} else if (v <= 0x7f) {
uint32_t vb1, vb2, vb3, vb4, vb5;
if (v <= 0x7f) {
*q++ = v;
} else if (v <= 0x000007ff) {
*q++ = 0xc0 | (v >> 6);
*q++ = 0x80 | (v & 63);
} else if (v <= 0x0000ffff) {
*q++ = 0xe0 | (v >> 12);
*q++ = 0x80 | ((v >> 6) & 63);
*q++ = 0x80 | (v & 63);
} else if (v <= 0x001fffff) {
*q++ = 0xf0 | (v >> 18);
*q++ = 0x80 | ((v >> 12) & 63);
*q++ = 0x80 | ((v >> 6) & 63);
*q++ = 0x80 | (v & 63);
} else if (v <= 0x03ffffff) {
*q++ = 0xf8 | (v >> 24);
*q++ = 0x80 | ((v >> 18) & 63);
*q++ = 0x80 | ((v >> 12) & 63);
*q++ = 0x80 | ((v >> 6) & 63);
*q++ = 0x80 | (v & 63);
} else {
*q++ = 0xfc | (v >> 30);
*q++ = 0x80 | ((v >> 24) & 63);
*q++ = 0x80 | ((v >> 18) & 63);
*q++ = 0x80 | ((v >> 12) & 63);
*q++ = 0x80 | ((v >> 6) & 63);
*q++ = 0x80 | (v & 63);
goto out0;
}
return q;
vb1 = v >> 6;
if (vb1 <= 0x3f) {
*q++ = 0xc0 + vb1;
goto out1;
}
vb2 = vb1 >> 6;
if (vb2 <= 0x1f) {
*q++ = 0xe0 + vb2;
goto out2;
}
vb3 = vb2 >> 6;
if (vb3 <= 0x0f) {
*q++ = 0xf0 + vb3;
goto out3;
}
vb4 = vb3 >> 6;
if (vb4 <= 0x07) {
*q++ = 0xf8 + vb4;
goto out4;
}
vb5 = vb4 >> 6;
if (vb5 <= 0x03) {
*q++ = 0xfc + vb5;
goto out5;
}
/* Otherwise invalid, even with 31-bit "extended Unicode" (pre-UTF-16) */
goto out0;
/* Emit extension bytes as appropriate */
out5: *q++ = 0x80 + (vb4 & 63);
out4: *q++ = 0x80 + (vb3 & 63);
out3: *q++ = 0x80 + (vb2 & 63);
out2: *q++ = 0x80 + (vb1 & 63);
out1: *q++ = 0x80 + (v & 63);
out0: return q;
}
/*
@ -223,13 +241,27 @@ static char *emit_utf8(char *q, int32_t v)
* shorter than or equal to the quoted length.
*
* *ep points to the final quote, or to the null if improperly quoted.
*
* Issue an error if the string contains characters less than cerr; in
* that case, the output string, but not *ep, is truncated before the
* first invalid character.
*/
size_t nasm_unquote(char *str, char **ep)
#define EMIT(c) \
do { \
unsigned char ec = (c); \
err |= ec < cerr; \
if (!err) \
*q++ = (c); \
} while (0)
static size_t nasm_unquote_common(char *str, char **ep,
const unsigned char cerr)
{
char bq;
char *p, *q;
char *escp = NULL;
char c;
unsigned char *p, *q;
unsigned char *escp = NULL;
unsigned char c;
bool err = false;
enum unq_state {
st_start,
st_backslash,
@ -238,10 +270,10 @@ size_t nasm_unquote(char *str, char **ep)
st_ucs
} state;
int ndig = 0;
int32_t nval = 0;
uint32_t nval = 0;
p = q = (unsigned char *)str;
p = q = str;
bq = *p++;
if (!bq)
return 0;
@ -250,11 +282,21 @@ size_t nasm_unquote(char *str, char **ep)
case '\'':
case '\"':
/* '...' or "..." string */
while ((c = *p) && c != bq) {
p++;
*q++ = c;
}
*q = '\0';
while (1) {
c = *p;
if (!c) {
break;
} else if (c == bq) {
/* Doubled quote = escaped quote */
c = p[1];
if (c != bq)
break;
p++;
}
p++;
EMIT(c);
}
*q = '\0';
break;
case '`':
@ -273,7 +315,7 @@ size_t nasm_unquote(char *str, char **ep)
p--;
goto out;
default:
*q++ = c;
EMIT(c);
break;
}
break;
@ -284,25 +326,25 @@ size_t nasm_unquote(char *str, char **ep)
nval = 0;
switch (c) {
case 'a':
*q++ = 7;
nval = 7;
break;
case 'b':
*q++ = 8;
nval = 8;
break;
case 'e':
*q++ = 27;
nval = 27;
break;
case 'f':
*q++ = 12;
nval = 12;
break;
case 'n':
*q++ = 10;
nval = 10;
break;
case 'r':
*q++ = 13;
nval = 13;
break;
case 't':
*q++ = 9;
nval = 9;
break;
case 'u':
state = st_ucs;
@ -313,7 +355,7 @@ size_t nasm_unquote(char *str, char **ep)
ndig = 8;
break;
case 'v':
*q++ = 11;
nval = 11;
break;
case 'x':
case 'X':
@ -333,9 +375,11 @@ size_t nasm_unquote(char *str, char **ep)
nval = c - '0';
break;
default:
*q++ = c;
nval = c;
break;
}
if (state == st_start)
EMIT(nval);
break;
case st_oct:
@ -347,15 +391,13 @@ size_t nasm_unquote(char *str, char **ep)
}
} else {
p--; /* Process this character again */
*q++ = nval;
EMIT(nval);
state = st_start;
}
break;
case st_hex:
if ((c >= '0' && c <= '9') ||
(c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f')) {
if (nasm_isxdigit(c)) {
nval = (nval << 4) + numvalue(c);
if (!--ndig) {
*q++ = nval;
@ -363,26 +405,29 @@ size_t nasm_unquote(char *str, char **ep)
}
} else {
p--; /* Process this character again */
*q++ = (p > escp) ? nval : escp[-1];
EMIT((p > escp) ? nval : escp[-1]);
state = st_start;
}
break;
case st_ucs:
if ((c >= '0' && c <= '9') ||
(c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f')) {
if (nasm_isxdigit(c)) {
nval = (nval << 4) + numvalue(c);
if (!--ndig) {
q = emit_utf8(q, nval);
err |= nval < cerr;
if (!err)
q = emit_utf8(q, nval);
state = st_start;
}
} else {
p--; /* Process this character again */
if (p > escp)
q = emit_utf8(q, nval);
else
*q++ = escp[-1];
if (p > escp) {
err |= nval < cerr;
if (!err)
q = emit_utf8(q, nval);
} else {
EMIT(escp[-1]);
}
state = st_start;
}
break;
@ -393,16 +438,19 @@ size_t nasm_unquote(char *str, char **ep)
case st_backslash:
break;
case st_oct:
*q++ = nval;
EMIT(nval);
break;
case st_hex:
*q++ = (p > escp) ? nval : escp[-1];
EMIT((p > escp) ? nval : escp[-1]);
break;
case st_ucs:
if (p > escp)
q = emit_utf8(q, nval);
else
*q++ = escp[-1];
if (p > escp) {
err |= nval < cerr;
if (!err)
q = emit_utf8(q, nval);
} else {
EMIT(escp[-1]);
}
break;
}
out:
@ -410,13 +458,32 @@ size_t nasm_unquote(char *str, char **ep)
default:
/* Not a quoted string, just return the input... */
p = q = strchr(str, '\0');
while ((c = *p++)) {
if (!c)
break;
EMIT(c);
}
break;
}
*q = '\0';
if (err)
nasm_nonfatal("control character in string not allowed here");
if (ep)
*ep = p;
return q-str;
*ep = (char *)p;
return (char *)q - str;
}
#undef EMIT
size_t nasm_unquote(char *str, char **ep)
{
return nasm_unquote_common(str, ep, 0);
}
size_t nasm_unquote_cstr(char *str, char **ep)
{
return nasm_unquote_common(str, ep, ' ');
}
/*
@ -436,8 +503,10 @@ char *nasm_skip_string(char *str)
bq = str[0];
if (bq == '\'' || bq == '\"') {
/* '...' or "..." string */
for (p = str+1; *p && *p != bq; p++)
;
for (p = str+1; *p; p++) {
if (p[0] == bq && p[1] != bq)
break;
}
return p;
} else if (bq == '`') {
/* `...` string */

View File

@ -38,6 +38,7 @@
char *nasm_quote(const char *str, size_t len);
size_t nasm_unquote(char *str, char **endptr);
size_t nasm_unquote_cstr(char *str, char **endptr);
char *nasm_skip_string(char *str);
#endif /* NASM_QUOTE_H */

View File

@ -199,8 +199,12 @@ char *nasm_strsep(char **stringp, const char *delim);
size_t pure_func strnlen(const char *, size_t);
#endif
/* This returns the numeric value of a given 'digit'. */
#define numvalue(c) ((c) >= 'a' ? (c) - 'a' + 10 : (c) >= 'A' ? (c) - 'A' + 10 : (c) - '0')
/* This returns the numeric value of a given 'digit'; no check for validity */
static inline unsigned int numvalue(unsigned char c)
{
c |= 0x20;
return c >= 'a' ? c - 'a' + 10 : c - '0';
}
/*
* Convert a string into a number, using NASM number rules. Sets