mirror of
https://github.com/netwide-assembler/nasm.git
synced 2024-12-15 09:09:58 +08:00
db6960c3fa
Explain why 0xfc + vb5 cannot overflow a byte value. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
557 lines
12 KiB
C
557 lines
12 KiB
C
/* ----------------------------------------------------------------------- *
|
|
*
|
|
* Copyright 1996-2019 The NASM Authors - All Rights Reserved
|
|
* See the file AUTHORS included with the NASM distribution for
|
|
* the specific copyright holders.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following
|
|
* conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer in the documentation and/or other materials provided
|
|
* with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
|
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* ----------------------------------------------------------------------- */
|
|
|
|
/*
|
|
* quote.c
|
|
*/
|
|
|
|
#include "compiler.h"
|
|
#include "nasmlib.h"
|
|
#include "quote.h"
|
|
#include "nctype.h"
|
|
#include "error.h"
|
|
|
|
/*
 * Number of octal digits (1..3) needed to write the byte "ch" as a
 * backslash-octal escape.  If the character that follows it in the
 * string ("next") is itself an octal digit, the full three-digit form
 * is required so that the following digit is not absorbed into the
 * escape when the string is read back.
 */
static int quote_octal_digits(unsigned char ch, char next)
{
    const unsigned char key = (next >= '0' && next <= '7') ? 0377 : ch;

    if (key > 077)
        return 3;
    if (key > 07)
        return 2;
    return 1;
}

/*
 * Return the letter used after the backslash for control characters
 * that have a single-letter escape, or 0 if there is none.
 */
static char quote_escape_letter(char ch)
{
    switch (ch) {
    case 7:
        return 'a';
    case 8:
        return 'b';
    case 9:
        return 't';
    case 10:
        return 'n';
    case 11:
        return 'v';
    case 12:
        return 'f';
    case 13:
        return 'r';
    case 27:
        return 'e';
    default:
        return 0;
    }
}

/*
 * Build a NASM-quoted copy of a string in memory obtained from
 * nasm_malloc().
 *
 * str   - the bytes to quote; need not be NUL-terminated
 * lenp  - on entry, the number of bytes in str; on return, the
 *         length of the quoted result, not counting the final NUL
 *
 * The quoting style is chosen as follows:
 *  - '...' if the input has no single quote and only printable
 *    characters (' ' through '~');
 *  - otherwise "..." if it has no double quote and only printable
 *    characters;
 *  - otherwise `...`, with backslash escapes for backquote,
 *    backslash and every non-printable byte.
 *
 * Returns the newly allocated, NUL-terminated quoted string.
 */
char *nasm_quote(const char *str, size_t *lenp)
{
    const size_t len = *lenp;
    const char * const end = str + len;
    const char *s;
    bool sq_ok = true;
    bool dq_ok = true;
    size_t qlen = 0;            /* Body length if `...` turns out to be needed */
    char *nstr, *q;

    /* Pass 1: find out which quote styles are usable, and size `...` */
    for (s = str; s < end; s++) {
        const char c = *s;

        if (c == '\'') {
            sq_ok = false;
            qlen++;
        } else if (c == '\"') {
            dq_ok = false;
            qlen++;
        } else if (c == '`' || c == '\\') {
            qlen += 2;
        } else if (c >= ' ' && c <= '~') {
            qlen++;
        } else {
            /* Non-printable: only the backquoted form can express it */
            sq_ok = dq_ok = false;
            if (quote_escape_letter(c)) {
                qlen += 2;
            } else {
                const char next = (s + 1 < end) ? s[1] : 0;
                qlen += 1 + quote_octal_digits(c, next);
            }
        }
    }

    if (sq_ok || dq_ok) {
        /* The text can be copied verbatim between '...' or "..." */
        const char quote = sq_ok ? '\'' : '\"';

        nstr = nasm_malloc(len+3);
        nstr[0] = quote;
        if (len > 0)
            memcpy(nstr+1, str, len);
        nstr[len+1] = quote;
        q = &nstr[len+2];
    } else {
        /* Pass 2: emit the escaped `...` form */
        nstr = nasm_malloc(qlen+3);
        q = nstr;
        *q++ = '`';

        for (s = str; s < end; s++) {
            const char c = *s;
            const char letter = quote_escape_letter(c);

            if (c == '`' || c == '\\') {
                *q++ = '\\';
                *q++ = c;
            } else if (letter) {
                *q++ = '\\';
                *q++ = letter;
            } else if (c < ' ' || c > '~') {
                const char next = (s + 1 < end) ? s[1] : 0;
                const unsigned char byte = (unsigned char)c;
                int shift = 3 * (quote_octal_digits(byte, next) - 1);

                *q++ = '\\';
                for (; shift >= 0; shift -= 3)
                    *q++ = ((byte >> shift) & 7) + '0';
            } else {
                *q++ = c;
            }
        }

        *q++ = '`';
        /* The two passes must agree on the size */
        nasm_assert((size_t)(q-nstr) == qlen+2);
    }

    *q = '\0';
    *lenp = q - nstr;
    return nstr;
}
|
|
|
|
/*
 * Store the value v at q using the "classic" variable-length UTF-8
 * scheme (1 to 6 bytes) and return a pointer just past the last byte
 * written.  No bounds checking is done here; up to 6 bytes are
 * written starting at q.
 *
 * Values of 0x80000000 and above do not fit even the classic 31-bit
 * encoding.  They are still emitted in the 6-byte layout, with a lead
 * byte of 0xfe or 0xff, as a best effort.  The lead byte cannot
 * overflow: v >> 30 is at most 3 for a 32-bit value, so 0xfc + 3 is
 * the largest lead byte possible.
 */
static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
{
    unsigned char lead;         /* Marker bits of the first byte */
    int ntrail;                 /* Number of 10xxxxxx bytes that follow */

    if (v <= 0x7f) {
        /* Plain ASCII is a single byte */
        *q++ = (unsigned char)v;
        return q;
    }

    if (v <= 0x7ff) {
        lead = 0xc0;
        ntrail = 1;
    } else if (v <= 0xffff) {
        lead = 0xe0;
        ntrail = 2;
    } else if (v <= 0x1fffff) {
        lead = 0xf0;
        ntrail = 3;
    } else if (v <= 0x3ffffff) {
        lead = 0xf8;
        ntrail = 4;
    } else {
        lead = 0xfc;
        ntrail = 5;
    }

    /* The lead byte carries whatever bits are left above the trailers */
    *q++ = (unsigned char)(lead + (v >> (6 * ntrail)));

    /* Extension bytes, most significant 6-bit group first */
    while (ntrail-- > 0)
        *q++ = (unsigned char)(0x80 + ((v >> (6 * ntrail)) & 63));

    return q;
}
|
|
|
|
/*
 * Map a character value to its bit in a 32-bit control-character
 * mask: bit v for v in 0..31, and no bit at all (0) for anything
 * larger.  Control characters are treated as the uncommon case.
 */
static inline uint32_t ctlbit(uint32_t v)
{
    if (unlikely(v < 32))
        return UINT32_C(1) << v;

    return 0;
}
|
|
|
|
/*
 * Output helpers for the dequoting code.  They expect the following
 * names to exist in the scope where they are expanded:
 *   badctl  - mask of control characters that are not permitted
 *   ctlmask - running mask of control characters seen so far
 *   q       - output pointer, advanced as bytes are stored
 */

/*
 * Record character c in ctlmask and evaluate to nonzero if any
 * forbidden control character has been seen up to and including c.
 * Since ctlmask only accumulates, this stays nonzero once tripped.
 */
#define CTL_ERR(c)                              \
    (badctl & (ctlmask |= ctlbit(c)))

/*
 * Append the value c to the output in UTF-8 form, unless a forbidden
 * control character has been seen.
 */
#define EMIT_UTF8(c)                            \
    do {                                        \
        const uint32_t cp_ = (c);               \
        if (CTL_ERR(cp_) == 0)                  \
            q = emit_utf8(q, cp_);              \
    } while (0)

/*
 * Append the single byte c (truncated to unsigned char) to the
 * output, unless a forbidden control character has been seen.
 */
#define EMIT(c)                                 \
    do {                                        \
        const unsigned char byte_ = (c);        \
        if (CTL_ERR(byte_) == 0)                \
            *q++ = byte_;                       \
    } while (0)
|
|
|
|
/*
 * Convenience wrapper around nasm_quote() for NUL-terminated input:
 * the input length is obtained with strlen().  If lenp is non-NULL
 * it receives the length of the quoted result; passing NULL is
 * allowed when the caller does not need it.
 */
char *nasm_quote_cstr(const char *str, size_t *lenp)
{
    size_t n = strlen(str);
    char *quoted = nasm_quote(str, &n);

    if (lenp != NULL)
        *lenp = n;

    return quoted;
}
|
|
|
|
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * str     - the string to dequote; overwritten with the result, which
 *           is always zero-terminated
 * ep      - if non-NULL, *ep is set to point to the final quote, or to
 *           the null if the string is improperly quoted, empty, or not
 *           a quoted string at all
 * badctl  - mask of control characters (bit N for character N, N < 32)
 *           which are not permitted in the string
 *
 * The opening character selects the syntax:
 *   '...' or "..."  contents are taken verbatim
 *   `...`           backslash escapes are processed: \a \b \e \f \n
 *                   \r \t \v, octal (up to 3 digits), \x (up to 2 hex
 *                   digits), \u (up to 4) and \U (up to 8 hex digits,
 *                   both emitted as UTF-8); any other escaped character
 *                   stands for itself
 *   anything else   the input is not a quoted string and is returned
 *                   as-is, including its first character
 *
 * Issue an error if the string contains control characters
 * corresponding to bits set in badctl; in that case, the output
 * string, but not *ep, is truncated before the first invalid
 * character.
 */
static size_t nasm_unquote_common(char *str, char **ep,
                                  const uint32_t badctl)
{
    unsigned char bq;
    const unsigned char *p;
    const unsigned char *escp = NULL;
    unsigned char *q;
    unsigned char c;
    uint32_t ctlmask = 0;       /* Mask of control characters seen */
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs,
        st_done
    } state = st_start;
    int ndig = 0;
    uint32_t nval = 0;

    p = q = (unsigned char *)str;

    bq = *p++;
    if (!bq) {
        /* Empty input: still honor the *ep contract (points to the null) */
        if (ep)
            *ep = str;
        return 0;
    }

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string */
        while ((c = *p++) && (c != bq))
            EMIT(c);
        break;

    case '`':
        /* `...` string */
        state = st_start;

        while (state != st_done) {
            c = *p++;
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                case '\0':
                    state = st_done;
                    break;
                default:
                    EMIT(c);
                    break;
                }
                break;

            case st_backslash:
                state = st_start;
                escp = p;       /* Beginning of argument sequence */
                nval = 0;
                switch (c) {
                case 'a':
                    nval = 7;
                    break;
                case 'b':
                    nval = 8;
                    break;
                case 'e':
                    nval = 27;
                    break;
                case 'f':
                    nval = 12;
                    break;
                case 'n':
                    nval = 10;
                    break;
                case 'r':
                    nval = 13;
                    break;
                case 't':
                    nval = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'v':
                    nval = 11;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 2;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 2;   /* Up to two more digits */
                    nval = c - '0';
                    break;
                case '\0':
                    /* Backslash at end of input: keep it literally */
                    nval = '\\';
                    p--;        /* Reprocess; terminates string */
                    break;
                default:
                    nval = c;
                    break;
                }
                /* Single-character escapes are complete at this point */
                if (state == st_start)
                    EMIT(nval);
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }
                EMIT(nval);
                state = st_start;
                break;

            case st_hex:
            case st_ucs:
                if (nasm_isxdigit(c)) {
                    nval = (nval << 4) + numvalue(c);
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }

                /*
                 * If no digit at all followed the escape letter,
                 * emit the letter itself (x, X, u or U).
                 */
                if (unlikely(p <= escp))
                    EMIT(escp[-1]);
                else if (state == st_ucs)
                    EMIT_UTF8(nval);
                else
                    EMIT(nval);

                state = st_start;
                break;

            default:
                panic();
            }
        }
        break;

    default:
        /*
         * Not a quoted string, just return the input.  The first
         * character was consumed above as a potential opening quote,
         * but here it is part of the data, so step back to include it.
         */
        p--;
        while ((c = *p++))
            EMIT(c);
        break;
    }

    /* Zero-terminate the output */
    *q = '\0';

    if (ctlmask & badctl)
        nasm_nonfatal("control character in string not allowed here");

    /* p has been advanced one past the terminating character */
    if (ep)
        *ep = (char *)p - 1;
    return (char *)q - str;
}

/* The emission helpers rely on locals of the function above */
#undef EMIT
#undef EMIT_UTF8
#undef CTL_ERR
|
|
|
|
/*
 * Dequote a string in place with no restriction on control
 * characters (an empty "forbidden" mask is passed down).
 * See nasm_unquote_common() for the meaning of str, ep and the
 * return value.
 */
size_t nasm_unquote(char *str, char **ep)
{
    const uint32_t forbid_nothing = 0;

    return nasm_unquote_common(str, ep, forbid_nothing);
}
|
|
/*
 * Dequote a string in place for use as a C string.  Every control
 * character (values 0 through 31, which includes NUL) is forbidden
 * except for the ones listed below; see nasm_unquote_common() for
 * how forbidden characters are handled and for the meaning of
 * str, ep and the return value.
 */
size_t nasm_unquote_cstr(char *str, char **ep)
{
    /* The only control characters permitted: BEL BS TAB ESC */
    static const unsigned char allowed[] = { '\a', '\b', '\t', 27 };
    uint32_t permitted = 0;
    size_t i;

    for (i = 0; i < sizeof allowed; i++)
        permitted |= UINT32_C(1) << allowed[i];

    return nasm_unquote_common(str, ep, ~permitted);
}
|
|
|
|
/*
 * Locate the end of a quoted string without modifying it.
 *
 * str must begin with the opening quote.  The return value points to
 * the terminating character: the matching closing quote, or the null
 * character if the string is unterminated.  If str does not begin
 * with one of the quote characters ' " or `, NULL is returned.
 *
 * Inside '...' and "..." no escapes are recognized.  Inside `...` a
 * backslash hides the character after it, so an escaped backquote
 * does not end the string.  That is all that matters for finding the
 * end: whatever kind of escape sequence follows, only another
 * backslash or a backquote can be significant afterwards.
 */
char *nasm_skip_string(const char *str)
{
    const char opener = str[0];
    const char *cur = str + 1;
    char ch;

    if (opener == '\'' || opener == '\"') {
        /* '...' or "..." string: scan for the same quote or the null */
        do {
            ch = *cur++;
        } while (ch && ch != opener);
    } else if (opener == '`') {
        /* `...` string */
        for (;;) {
            ch = *cur++;
            if (ch == '\\') {
                /* Skip the escaped character; a null still terminates */
                if (!*cur++)
                    break;
            } else if (ch == '`' || ch == '\0') {
                break;
            }
        }
    } else {
        /* Not a string at all... */
        return NULL;
    }

    /* cur is one past the terminating character */
    return (char *)cur - 1;
}
|