quote: disallow control characters in C strings; concatenate; cleanups

In nasm_unquote_cstr(), disallow any control character, not just NUL. This will matter when allowing quoting symbols. Merge nasm_unquote() and nasm_unquote_cstr(). Strings can now be concatenated, C style: adjacent quoted strings (including whitespace-separated) are merged into a single string. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2025-01-30 16:41:05 +08:00 · 2019-04-22 14:29:29 -07:00 · 2019-04-22 14:29:29 -07:00 · bb42d30737
commit bb42d30737
parent 982186a1a3
4 changed files with 159 additions and 101 deletions
--- a/asm/preproc.c
+++ b/asm/preproc.c
@ -461,22 +461,6 @@ static Token *delete_Token(Token * t);
 #define tok_is_(x,v)    (tok_type_((x), TOK_OTHER) && !strcmp((x)->text,(v)))
 #define tok_isnt_(x,v)  ((x) && ((x)->type!=TOK_OTHER || strcmp((x)->text,(v))))

-/*
- * nasm_unquote with error if the string contains NUL characters.
- * If the string contains NUL characters, issue an error and return
- * the C len, i.e. truncate at the NUL.
- */
-static size_t nasm_unquote_cstr(char *qstr, enum preproc_token directive)
-{
-    size_t len = nasm_unquote(qstr, NULL);
-    size_t clen = strlen(qstr);
-
-    if (len != clen)
-        nasm_nonfatal("NUL character in `%s' directive",
-                      pp_directives[directive]);
-    return clen;
-}
-
 /*
 * In-place reverse a list of tokens.
 */
@ -1780,7 +1764,7 @@ static bool if_condition(Token * tline, enum preproc_token ct)
            if (tline->type == TOK_PREPROC_ID)
                p += 2;         /* Skip leading %! */
            if (nasm_isquote(*p))
-                nasm_unquote_cstr(p, ct);
+                nasm_unquote_cstr(p, NULL);
            if (getenv(p))
                j = true;
            tline = tline->next;
@ -2527,7 +2511,7 @@ static int do_directive(Token *tline, char **output)
            nasm_warn(WARN_OTHER, "trailing garbage after `%%depend' ignored");
        p = t->text;
        if (t->type != TOK_INTERNAL_STRING)
-            nasm_unquote_cstr(p, i);
+            nasm_unquote_cstr(p, NULL);
        strlist_add(deplist, p);
        free_tlist(origline);
        return DIRECTIVE_FOUND;
@ -2546,7 +2530,7 @@ static int do_directive(Token *tline, char **output)
            nasm_warn(WARN_OTHER, "trailing garbage after `%%include' ignored");
        p = t->text;
        if (t->type != TOK_INTERNAL_STRING)
-            nasm_unquote_cstr(p, i);
+            nasm_unquote_cstr(p, NULL);
        inc = nasm_malloc(sizeof(Include));
        inc->next = istk;
        inc->conds = NULL;
@ -2588,7 +2572,7 @@ static int do_directive(Token *tline, char **output)
        if (tline->next)
            nasm_warn(WARN_OTHER, "trailing garbage after `%%use' ignored");
        if (tline->type == TOK_STRING)
-            nasm_unquote_cstr(tline->text, i);
+            nasm_unquote_cstr(tline->text, NULL);
        use_pkg = nasm_stdmac_find_package(tline->text);
        if (!use_pkg)
            nasm_nonfatal("unknown `%%use' package: %s", tline->text);
@ -3240,7 +3224,7 @@ issue_error:
         * are stored with the token stream reversed, so we have to
         * reverse the output of tokenize().
         */
-        nasm_unquote_cstr(t->text, i);
+        nasm_unquote_cstr(t->text, NULL);
        macro_start = reverse_tokens(tokenize(t->text));

        /*
--- a/asm/quote.c
+++ b/asm/quote.c
@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------- *
- *   
+ *
 *   Copyright 1996-2016 The NASM Authors - All Rights Reserved
 *   See the file AUTHORS included with the NASM distribution for
 *   the specific copyright holders.
@ -14,7 +14,7 @@
 *     copyright notice, this list of conditions and the following
 *     disclaimer in the documentation and/or other materials provided
 *     with the distribution.
- *     
+ *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ -40,6 +40,8 @@

 #include "nasmlib.h"
 #include "quote.h"
+#include "nctype.h"
+#include "error.h"

 char *nasm_quote(const char *str, size_t len)
 {
@ -180,39 +182,55 @@ char *nasm_quote(const char *str, size_t len)
    return nstr;
 }

-static char *emit_utf8(char *q, int32_t v)
+static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
 {
-    if (v < 0) {
-	/* Impossible - do nothing */
-    } else if (v <= 0x7f) {
+    uint32_t vb1, vb2, vb3, vb4, vb5; /* vbN = v >> (6 * N) */
+    /* Store v at q as UTF-8 (1-6 bytes); returns the advanced q, or q itself if v >= 2^31 */
+    if (v <= 0x7f) {                /* 1 byte: 0xxxxxxx */
 	*q++ = v;
-    } else if (v <= 0x000007ff) {
-	*q++ = 0xc0 | (v >> 6);
-	*q++ = 0x80 | (v & 63);
-    } else if (v <= 0x0000ffff) {
-	*q++ = 0xe0 | (v >> 12);
-	*q++ = 0x80 | ((v >> 6) & 63);
-	*q++ = 0x80 | (v & 63);
-    } else if (v <= 0x001fffff) {
-	*q++ = 0xf0 | (v >> 18);
-	*q++ = 0x80 | ((v >> 12) & 63);
-	*q++ = 0x80 | ((v >> 6) & 63);
-	*q++ = 0x80 | (v & 63);
-    } else if (v <= 0x03ffffff) {
-	*q++ = 0xf8 | (v >> 24);
-	*q++ = 0x80 | ((v >> 18) & 63);
-	*q++ = 0x80 | ((v >> 12) & 63);
-	*q++ = 0x80 | ((v >> 6) & 63);
-	*q++ = 0x80 | (v & 63);
-    } else {
-	*q++ = 0xfc | (v >> 30);
-	*q++ = 0x80 | ((v >> 24) & 63);
-	*q++ = 0x80 | ((v >> 18) & 63);
-	*q++ = 0x80 | ((v >> 12) & 63);
-	*q++ = 0x80 | ((v >> 6) & 63);
-	*q++ = 0x80 | (v & 63);
+        goto out0;
     }
-    return q;
+
+    vb1 = v >> 6;
+    if (vb1 <= 0x1f) {              /* 2 bytes: 110xxxxx, v <= 0x7ff */
+	*q++ = 0xc0 + vb1;
+        goto out1;
+    }
+
+    vb2 = vb1 >> 6;
+    if (vb2 <= 0x0f) {              /* 3 bytes: 1110xxxx, v <= 0xffff */
+        *q++ = 0xe0 + vb2;
+        goto out2;
+    }
+
+    vb3 = vb2 >> 6;
+    if (vb3 <= 0x07) {              /* 4 bytes: 11110xxx, v <= 0x1fffff */
+        *q++ = 0xf0 + vb3;
+        goto out3;
+    }
+
+    vb4 = vb3 >> 6;
+    if (vb4 <= 0x03) {              /* 5 bytes: 111110xx, v <= 0x3ffffff */
+        *q++ = 0xf8 + vb4;
+        goto out4;
+    }
+
+    vb5 = vb4 >> 6;
+    if (vb5 <= 0x01) {              /* 6 bytes: 1111110x, v <= 0x7fffffff */
+        *q++ = 0xfc + vb5;
+        goto out5;
+    }
+
+    /* Otherwise invalid, even with 31-bit "extended Unicode" (pre-UTF-16) */
+    goto out0;
+
+    /* Emit extension bytes as appropriate; each label falls through to the next */
+out5: *q++ = 0x80 + (vb4 & 63);
+out4: *q++ = 0x80 + (vb3 & 63);
+out3: *q++ = 0x80 + (vb2 & 63);
+out2: *q++ = 0x80 + (vb1 & 63);
+out1: *q++ = 0x80 + (v & 63);
+out0: return q;
 }

 /*
@ -223,13 +241,27 @@ static char *emit_utf8(char *q, int32_t v)
 * shorter than or equal to the quoted length.
 *
 * *ep points to the final quote, or to the null if improperly quoted.
+ *
+ * Issue an error if the string contains characters less than cerr; in
+ * that case, the output string, but not *ep, is truncated before the
+ * first invalid character.
 */
-size_t nasm_unquote(char *str, char **ep)
+#define EMIT(c) /* append c at q; sticky error once any byte < cerr */ \
+    do {                                                        \
+        unsigned char ec = (c); /* evaluate the argument once */ \
+        err |= ec < cerr;                                       \
+        if (!err)                                               \
+            *q++ = ec;                                          \
+    } while (0)
+
+static size_t nasm_unquote_common(char *str, char **ep,
+                                  const unsigned char cerr)
 {
    char bq;
-    char *p, *q;
-    char *escp = NULL;
-    char c;
+    unsigned char *p, *q;
+    unsigned char *escp = NULL;
+    unsigned char c;
+    bool err = false;
    enum unq_state {
 	st_start,
 	st_backslash,
@ -238,10 +270,10 @@ size_t nasm_unquote(char *str, char **ep)
 	st_ucs
    } state;
    int ndig = 0;
-    int32_t nval = 0;
+    uint32_t nval = 0;
+
+    p = q = (unsigned char *)str;

-    p = q = str;
-    
    bq = *p++;
    if (!bq)
 	return 0;
@ -250,11 +282,21 @@ size_t nasm_unquote(char *str, char **ep)
    case '\'':
    case '\"':
 	/* '...' or "..." string */
-	while ((c = *p) && c != bq) {
-	    p++;
-	    *q++ = c;
-	}
-	*q = '\0';
+        while (1) {
+            c = *p;
+            if (!c) {
+                break;
+            } else if (c == bq) {
+                /* Doubled quote = escaped quote */
+                c = p[1];
+                if (c != bq)
+                    break;
+                p++;
+            }
+            p++;
+            EMIT(c);
+        }
+        *q = '\0';
 	break;

    case '`':
@ -273,7 +315,7 @@ size_t nasm_unquote(char *str, char **ep)
 		    p--;
 		    goto out;
 		default:
-		    *q++ = c;
+                    EMIT(c);
 		    break;
 		}
 		break;
@ -284,25 +326,25 @@ size_t nasm_unquote(char *str, char **ep)
 		nval = 0;
 		switch (c) {
 		case 'a':
-		    *q++ = 7;
+		    nval = 7;
 		    break;
 		case 'b':
-		    *q++ = 8;
+		    nval = 8;
 		    break;
 		case 'e':
-		    *q++ = 27;
+		    nval = 27;
 		    break;
 		case 'f':
-		    *q++ = 12;
+		    nval = 12;
 		    break;
 		case 'n':
-		    *q++ = 10;
+		    nval = 10;
 		    break;
 		case 'r':
-		    *q++ = 13;
+		    nval = 13;
 		    break;
 		case 't':
-		    *q++ = 9;
+		    nval = 9;
 		    break;
 		case 'u':
 		    state = st_ucs;
@ -313,7 +355,7 @@ size_t nasm_unquote(char *str, char **ep)
 		    ndig = 8;
 		    break;
 		case 'v':
-		    *q++ = 11;
+		    nval = 11;
 		    break;
 		case 'x':
 		case 'X':
@ -333,9 +375,11 @@ size_t nasm_unquote(char *str, char **ep)
 		    nval = c - '0';
 		    break;
 		default:
-		    *q++ = c;
+		    nval = c;
 		    break;
 		}
+                if (state == st_start)
+                    EMIT(nval);
 		break;

 	    case st_oct:
@ -347,15 +391,13 @@ size_t nasm_unquote(char *str, char **ep)
 		    }
 		} else {
 		    p--;	/* Process this character again */
-		    *q++ = nval;
+		    EMIT(nval);
 		    state = st_start;
 		}
 		break;

 	    case st_hex:
-		if ((c >= '0' && c <= '9') ||
-		    (c >= 'A' && c <= 'F') ||
-		    (c >= 'a' && c <= 'f')) {
+		if (nasm_isxdigit(c)) {
 		    nval = (nval << 4) + numvalue(c);
 		    if (!--ndig) {
 			*q++ = nval;
@ -363,26 +405,29 @@ size_t nasm_unquote(char *str, char **ep)
 		    }
 		} else {
 		    p--;	/* Process this character again */
-		    *q++ = (p > escp) ? nval : escp[-1];
+		    EMIT((p > escp) ? nval : escp[-1]);
 		    state = st_start;
 		}
 		break;

 	    case st_ucs:
-		if ((c >= '0' && c <= '9') ||
-		    (c >= 'A' && c <= 'F') ||
-		    (c >= 'a' && c <= 'f')) {
+		if (nasm_isxdigit(c)) {
 		    nval = (nval << 4) + numvalue(c);
 		    if (!--ndig) {
-			q = emit_utf8(q, nval);
+                        err |= nval < cerr;
+                        if (!err)
+                            q = emit_utf8(q, nval);
 			state = st_start;
 		    }
 		} else {
 		    p--;	/* Process this character again */
-		    if (p > escp)
-			q = emit_utf8(q, nval);
-		    else
-			*q++ = escp[-1];
+		    if (p > escp) {
+                        err |= nval < cerr;
+                        if (!err)
+                            q = emit_utf8(q, nval);
+                    } else {
+			EMIT(escp[-1]);
+                    }
 		    state = st_start;
 		}
 		break;
@ -393,16 +438,19 @@ size_t nasm_unquote(char *str, char **ep)
 	case st_backslash:
 	    break;
 	case st_oct:
-	    *q++ = nval;
+	    EMIT(nval);
 	    break;
 	case st_hex:
-	    *q++ = (p > escp) ? nval : escp[-1];
+	    EMIT((p > escp) ? nval : escp[-1]);
 	    break;
 	case st_ucs:
-	    if (p > escp)
-		q = emit_utf8(q, nval);
-	    else
-		*q++ = escp[-1];
+	    if (p > escp) {
+                err |= nval < cerr;
+                if (!err)
+                    q = emit_utf8(q, nval);
+            } else {
+		EMIT(escp[-1]);
+            }
 	    break;
 	}
    out:
@ -410,13 +458,32 @@ size_t nasm_unquote(char *str, char **ep)

    default:
 	/* Not a quoted string, just return the input... */
-	p = q = strchr(str, '\0');
+        while ((c = *p++)) {
+            if (!c)
+                break;
+            EMIT(c);
+        }
 	break;
    }

+    *q = '\0';
+
+    if (err)
+        nasm_nonfatal("control character in string not allowed here");
+
    if (ep)
-	*ep = p;
-    return q-str;
+	*ep = (char *)p;
+    return (char *)q - str;
+}
+#undef EMIT
+
+size_t nasm_unquote(char *str, char **ep)
+{
+    return nasm_unquote_common(str, ep, 0); /* cerr 0: no byte value is rejected */
+}
+size_t nasm_unquote_cstr(char *str, char **ep)
+{
+    return nasm_unquote_common(str, ep, ' '); /* cerr ' ': any character below space is an error */
+}

 /*
@ -436,8 +503,10 @@ char *nasm_skip_string(char *str)
    bq = str[0];
    if (bq == '\'' || bq == '\"') {
 	/* '...' or "..." string */
-	for (p = str+1; *p && *p != bq; p++)
-	    ;
+	for (p = str+1; *p; p++) {
+            if (p[0] == bq && p[1] != bq)
+                break;
+        }
 	return p;
    } else if (bq == '`') {
 	/* `...` string */
--- a/asm/quote.h
+++ b/asm/quote.h
@ -38,6 +38,7 @@

 char *nasm_quote(const char *str, size_t len);
 size_t nasm_unquote(char *str, char **endptr);
+size_t nasm_unquote_cstr(char *str, char **endptr);
 char *nasm_skip_string(char *str);

 #endif /* NASM_QUOTE_H */
--- a/include/nasmlib.h
+++ b/include/nasmlib.h
@ -199,8 +199,12 @@ char *nasm_strsep(char **stringp, const char *delim);
 size_t pure_func strnlen(const char *, size_t);
 #endif

-/* This returns the numeric value of a given 'digit'. */
-#define numvalue(c)         ((c) >= 'a' ? (c) - 'a' + 10 : (c) >= 'A' ? (c) - 'A' + 10 : (c) - '0')
+/* Returns the numeric value of a 'digit' character (0-9, a-f, A-F); no check for validity */
+static inline unsigned int numvalue(unsigned char c)
+{
+    c |= 0x20; /* in ASCII this maps 'A'-'F' onto 'a'-'f' and leaves '0'-'9' unchanged */
+    return c >= 'a' ? c - 'a' + 10 : c - '0';
+}

 /*
 * Convert a string into a number, using NASM number rules. Sets