mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-03-08 10:47:29 +08:00
Add initial support for '\uNNNN' specifier.
* lex.c (read_ucs): New fn. (readescape, skip_white_space): Call it. (is_extended_char, is_extended_char_1): New fns. (utf8_extend_token): New fn, #if 0'd out. (real_yylex): Treat extended chars like letters. From-SVN: r32414
This commit is contained in:
parent
f48d005079
commit
1c08915645
@ -1,5 +1,12 @@
|
||||
2000-03-07 Jason Merrill <jason@casey.cygnus.com>
|
||||
|
||||
Add initial support for '\uNNNN' specifier.
|
||||
* lex.c (read_ucs): New fn.
|
||||
(readescape, skip_white_space): Call it.
|
||||
(is_extended_char, is_extended_char_1): New fns.
|
||||
(utf8_extend_token): New fn, #if 0'd out.
|
||||
(real_yylex): Treat extended chars like letters.
|
||||
|
||||
* search.c (note_debug_info_needed): Walk the bases even if we
|
||||
weren't deferring the type itself.
|
||||
|
||||
|
417
gcc/cp/lex.c
417
gcc/cp/lex.c
@ -90,6 +90,9 @@ static int read_line_number PARAMS ((int *));
|
||||
static int token_getch PARAMS ((void));
|
||||
static void token_put_back PARAMS ((int));
|
||||
static void mark_impl_file_chain PARAMS ((void *));
|
||||
static int read_ucs PARAMS ((int));
|
||||
static int is_extended_char PARAMS ((int));
|
||||
static int is_extended_char_1 PARAMS ((int));
|
||||
|
||||
/* Given a file name X, return the nondirectory portion.
|
||||
Keep in mind that X can be computed more than once. */
|
||||
@ -2236,10 +2239,16 @@ skip_white_space (c)
|
||||
case '\\':
|
||||
c = getch ();
|
||||
if (c == '\n')
|
||||
lineno++;
|
||||
{
|
||||
lineno++;
|
||||
c = getch ();
|
||||
}
|
||||
else if (c == 'u')
|
||||
c = read_ucs (4);
|
||||
else if (c == 'U')
|
||||
c = read_ucs (8);
|
||||
else
|
||||
error ("stray '\\' in program");
|
||||
c = getch ();
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -2799,6 +2808,376 @@ do_pending_lang_change ()
|
||||
pop_lang_context ();
|
||||
}
|
||||
|
||||
/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.
|
||||
|
||||
[lex.charset]: The character designated by the universal-character-name
|
||||
\UNNNNNNNN is that character whose character short name in ISO/IEC 10646
|
||||
is NNNNNNNN; the character designated by the universal-character-name
|
||||
\uNNNN is that character whose character short name in ISO/IEC 10646 is
|
||||
0000NNNN. If the hexadecimal value for a universal character name is
|
||||
less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the
|
||||
universal character name designates a character in the basic source
|
||||
character set, then the program is ill-formed.
|
||||
|
||||
We assume that wchar_t is Unicode, so we don't need to do any
|
||||
mapping. Is this ever wrong? */
|
||||
|
||||
static int
|
||||
read_ucs (length)
|
||||
int length;
|
||||
{
|
||||
unsigned int code = 0;
|
||||
int c;
|
||||
|
||||
for (; length; --length)
|
||||
{
|
||||
c = getch ();
|
||||
if (! ISXDIGIT (c))
|
||||
{
|
||||
error ("non hex digit '%c' in universal-character-name", c);
|
||||
put_back (c);
|
||||
break;
|
||||
}
|
||||
code <<= 4;
|
||||
if (c >= 'a' && c <= 'f')
|
||||
code += c - 'a' + 10;
|
||||
if (c >= 'A' && c <= 'F')
|
||||
code += c - 'A' + 10;
|
||||
if (c >= '0' && c <= '9')
|
||||
code += c - '0';
|
||||
}
|
||||
|
||||
#ifdef TARGET_EBCDIC
|
||||
sorry ("universal-character-name on EBCDIC target");
|
||||
return 0x3F;
|
||||
#endif
|
||||
|
||||
if (code > 0x9f && !(code & 0x80000000))
|
||||
/* True extended character, OK. */;
|
||||
else if (code >= 0x20 && code < 0x7f)
|
||||
{
|
||||
/* ASCII printable character. The C character set consists of all of
|
||||
these except $, @ and `. We use hex escapes so that this also
|
||||
works with EBCDIC hosts. */
|
||||
if (code != 0x24 && code != 0x40 && code != 0x60)
|
||||
error ("universal-character-name designates `%c', part of the basic source character set", code);
|
||||
}
|
||||
else
|
||||
error ("invalid universal-character-name");
|
||||
return code;
|
||||
}
|
||||
|
||||
/* Returns nonzero if C is a universal-character-name. Give an error if it
|
||||
is not one which may appear in an identifier, as per [extendid]. */
|
||||
|
||||
static inline int
|
||||
is_extended_char (c)
|
||||
int c;
|
||||
{
|
||||
#ifdef TARGET_EBCDIC
|
||||
return 0;
|
||||
#else
|
||||
/* ASCII. */
|
||||
if (c < 0x7f)
|
||||
return 0;
|
||||
|
||||
return is_extended_char_1 (c);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
is_extended_char_1 (c)
|
||||
int c;
|
||||
{
|
||||
/* None of the valid chars are outside the Basic Multilingual Plane (the
|
||||
low 16 bits). */
|
||||
if (c > 0xffff)
|
||||
{
|
||||
error ("universal-character-name `\\U%08x' not valid in identifier", c);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Latin */
|
||||
if ((c >= 0x00c0 && c <= 0x00d6)
|
||||
|| (c >= 0x00d8 && c <= 0x00f6)
|
||||
|| (c >= 0x00f8 && c <= 0x01f5)
|
||||
|| (c >= 0x01fa && c <= 0x0217)
|
||||
|| (c >= 0x0250 && c <= 0x02a8)
|
||||
|| (c >= 0x1e00 && c <= 0x1e9a)
|
||||
|| (c >= 0x1ea0 && c <= 0x1ef9))
|
||||
return 1;
|
||||
|
||||
/* Greek */
|
||||
if ((c == 0x0384)
|
||||
|| (c >= 0x0388 && c <= 0x038a)
|
||||
|| (c == 0x038c)
|
||||
|| (c >= 0x038e && c <= 0x03a1)
|
||||
|| (c >= 0x03a3 && c <= 0x03ce)
|
||||
|| (c >= 0x03d0 && c <= 0x03d6)
|
||||
|| (c == 0x03da)
|
||||
|| (c == 0x03dc)
|
||||
|| (c == 0x03de)
|
||||
|| (c == 0x03e0)
|
||||
|| (c >= 0x03e2 && c <= 0x03f3)
|
||||
|| (c >= 0x1f00 && c <= 0x1f15)
|
||||
|| (c >= 0x1f18 && c <= 0x1f1d)
|
||||
|| (c >= 0x1f20 && c <= 0x1f45)
|
||||
|| (c >= 0x1f48 && c <= 0x1f4d)
|
||||
|| (c >= 0x1f50 && c <= 0x1f57)
|
||||
|| (c == 0x1f59)
|
||||
|| (c == 0x1f5b)
|
||||
|| (c == 0x1f5d)
|
||||
|| (c >= 0x1f5f && c <= 0x1f7d)
|
||||
|| (c >= 0x1f80 && c <= 0x1fb4)
|
||||
|| (c >= 0x1fb6 && c <= 0x1fbc)
|
||||
|| (c >= 0x1fc2 && c <= 0x1fc4)
|
||||
|| (c >= 0x1fc6 && c <= 0x1fcc)
|
||||
|| (c >= 0x1fd0 && c <= 0x1fd3)
|
||||
|| (c >= 0x1fd6 && c <= 0x1fdb)
|
||||
|| (c >= 0x1fe0 && c <= 0x1fec)
|
||||
|| (c >= 0x1ff2 && c <= 0x1ff4)
|
||||
|| (c >= 0x1ff6 && c <= 0x1ffc))
|
||||
return 1;
|
||||
|
||||
/* Cyrillic */
|
||||
if ((c >= 0x0401 && c <= 0x040d)
|
||||
|| (c >= 0x040f && c <= 0x044f)
|
||||
|| (c >= 0x0451 && c <= 0x045c)
|
||||
|| (c >= 0x045e && c <= 0x0481)
|
||||
|| (c >= 0x0490 && c <= 0x04c4)
|
||||
|| (c >= 0x04c7 && c <= 0x04c8)
|
||||
|| (c >= 0x04cb && c <= 0x04cc)
|
||||
|| (c >= 0x04d0 && c <= 0x04eb)
|
||||
|| (c >= 0x04ee && c <= 0x04f5)
|
||||
|| (c >= 0x04f8 && c <= 0x04f9))
|
||||
return 1;
|
||||
|
||||
/* Armenian */
|
||||
if ((c >= 0x0531 && c <= 0x0556)
|
||||
|| (c >= 0x0561 && c <= 0x0587))
|
||||
return 1;
|
||||
|
||||
/* Hebrew */
|
||||
if ((c >= 0x05d0 && c <= 0x05ea)
|
||||
|| (c >= 0x05f0 && c <= 0x05f4))
|
||||
return 1;
|
||||
|
||||
/* Arabic */
|
||||
if ((c >= 0x0621 && c <= 0x063a)
|
||||
|| (c >= 0x0640 && c <= 0x0652)
|
||||
|| (c >= 0x0670 && c <= 0x06b7)
|
||||
|| (c >= 0x06ba && c <= 0x06be)
|
||||
|| (c >= 0x06c0 && c <= 0x06ce)
|
||||
|| (c >= 0x06e5 && c <= 0x06e7))
|
||||
return 1;
|
||||
|
||||
/* Devanagari */
|
||||
if ((c >= 0x0905 && c <= 0x0939)
|
||||
|| (c >= 0x0958 && c <= 0x0962))
|
||||
return 1;
|
||||
|
||||
/* Bengali */
|
||||
if ((c >= 0x0985 && c <= 0x098c)
|
||||
|| (c >= 0x098f && c <= 0x0990)
|
||||
|| (c >= 0x0993 && c <= 0x09a8)
|
||||
|| (c >= 0x09aa && c <= 0x09b0)
|
||||
|| (c == 0x09b2)
|
||||
|| (c >= 0x09b6 && c <= 0x09b9)
|
||||
|| (c >= 0x09dc && c <= 0x09dd)
|
||||
|| (c >= 0x09df && c <= 0x09e1)
|
||||
|| (c >= 0x09f0 && c <= 0x09f1))
|
||||
return 1;
|
||||
|
||||
/* Gurmukhi */
|
||||
if ((c >= 0x0a05 && c <= 0x0a0a)
|
||||
|| (c >= 0x0a0f && c <= 0x0a10)
|
||||
|| (c >= 0x0a13 && c <= 0x0a28)
|
||||
|| (c >= 0x0a2a && c <= 0x0a30)
|
||||
|| (c >= 0x0a32 && c <= 0x0a33)
|
||||
|| (c >= 0x0a35 && c <= 0x0a36)
|
||||
|| (c >= 0x0a38 && c <= 0x0a39)
|
||||
|| (c >= 0x0a59 && c <= 0x0a5c)
|
||||
|| (c == 0x0a5e))
|
||||
return 1;
|
||||
|
||||
/* Gujarati */
|
||||
if ((c >= 0x0a85 && c <= 0x0a8b)
|
||||
|| (c == 0x0a8d)
|
||||
|| (c >= 0x0a8f && c <= 0x0a91)
|
||||
|| (c >= 0x0a93 && c <= 0x0aa8)
|
||||
|| (c >= 0x0aaa && c <= 0x0ab0)
|
||||
|| (c >= 0x0ab2 && c <= 0x0ab3)
|
||||
|| (c >= 0x0ab5 && c <= 0x0ab9)
|
||||
|| (c == 0x0ae0))
|
||||
return 1;
|
||||
|
||||
/* Oriya */
|
||||
if ((c >= 0x0b05 && c <= 0x0b0c)
|
||||
|| (c >= 0x0b0f && c <= 0x0b10)
|
||||
|| (c >= 0x0b13 && c <= 0x0b28)
|
||||
|| (c >= 0x0b2a && c <= 0x0b30)
|
||||
|| (c >= 0x0b32 && c <= 0x0b33)
|
||||
|| (c >= 0x0b36 && c <= 0x0b39)
|
||||
|| (c >= 0x0b5c && c <= 0x0b5d)
|
||||
|| (c >= 0x0b5f && c <= 0x0b61))
|
||||
return 1;
|
||||
|
||||
/* Tamil */
|
||||
if ((c >= 0x0b85 && c <= 0x0b8a)
|
||||
|| (c >= 0x0b8e && c <= 0x0b90)
|
||||
|| (c >= 0x0b92 && c <= 0x0b95)
|
||||
|| (c >= 0x0b99 && c <= 0x0b9a)
|
||||
|| (c == 0x0b9c)
|
||||
|| (c >= 0x0b9e && c <= 0x0b9f)
|
||||
|| (c >= 0x0ba3 && c <= 0x0ba4)
|
||||
|| (c >= 0x0ba8 && c <= 0x0baa)
|
||||
|| (c >= 0x0bae && c <= 0x0bb5)
|
||||
|| (c >= 0x0bb7 && c <= 0x0bb9))
|
||||
return 1;
|
||||
|
||||
/* Telugu */
|
||||
if ((c >= 0x0c05 && c <= 0x0c0c)
|
||||
|| (c >= 0x0c0e && c <= 0x0c10)
|
||||
|| (c >= 0x0c12 && c <= 0x0c28)
|
||||
|| (c >= 0x0c2a && c <= 0x0c33)
|
||||
|| (c >= 0x0c35 && c <= 0x0c39)
|
||||
|| (c >= 0x0c60 && c <= 0x0c61))
|
||||
return 1;
|
||||
|
||||
/* Kannada */
|
||||
if ((c >= 0x0c85 && c <= 0x0c8c)
|
||||
|| (c >= 0x0c8e && c <= 0x0c90)
|
||||
|| (c >= 0x0c92 && c <= 0x0ca8)
|
||||
|| (c >= 0x0caa && c <= 0x0cb3)
|
||||
|| (c >= 0x0cb5 && c <= 0x0cb9)
|
||||
|| (c >= 0x0ce0 && c <= 0x0ce1))
|
||||
return 1;
|
||||
|
||||
/* Malayalam */
|
||||
if ((c >= 0x0d05 && c <= 0x0d0c)
|
||||
|| (c >= 0x0d0e && c <= 0x0d10)
|
||||
|| (c >= 0x0d12 && c <= 0x0d28)
|
||||
|| (c >= 0x0d2a && c <= 0x0d39)
|
||||
|| (c >= 0x0d60 && c <= 0x0d61))
|
||||
return 1;
|
||||
|
||||
/* Thai */
|
||||
if ((c >= 0x0e01 && c <= 0x0e30)
|
||||
|| (c >= 0x0e32 && c <= 0x0e33)
|
||||
|| (c >= 0x0e40 && c <= 0x0e46)
|
||||
|| (c >= 0x0e4f && c <= 0x0e5b))
|
||||
return 1;
|
||||
|
||||
/* Lao */
|
||||
if ((c >= 0x0e81 && c <= 0x0e82)
|
||||
|| (c == 0x0e84)
|
||||
|| (c == 0x0e87)
|
||||
|| (c == 0x0e88)
|
||||
|| (c == 0x0e8a)
|
||||
|| (c == 0x0e0d)
|
||||
|| (c >= 0x0e94 && c <= 0x0e97)
|
||||
|| (c >= 0x0e99 && c <= 0x0e9f)
|
||||
|| (c >= 0x0ea1 && c <= 0x0ea3)
|
||||
|| (c == 0x0ea5)
|
||||
|| (c == 0x0ea7)
|
||||
|| (c == 0x0eaa)
|
||||
|| (c == 0x0eab)
|
||||
|| (c >= 0x0ead && c <= 0x0eb0)
|
||||
|| (c == 0x0eb2)
|
||||
|| (c == 0x0eb3)
|
||||
|| (c == 0x0ebd)
|
||||
|| (c >= 0x0ec0 && c <= 0x0ec4)
|
||||
|| (c == 0x0ec6))
|
||||
return 1;
|
||||
|
||||
/* Georgian */
|
||||
if ((c >= 0x10a0 && c <= 0x10c5)
|
||||
|| (c >= 0x10d0 && c <= 0x10f6))
|
||||
return 1;
|
||||
|
||||
/* Hiragana */
|
||||
if ((c >= 0x3041 && c <= 0x3094)
|
||||
|| (c >= 0x309b && c <= 0x309e))
|
||||
return 1;
|
||||
|
||||
/* Katakana */
|
||||
if ((c >= 0x30a1 && c <= 0x30fe))
|
||||
return 1;
|
||||
|
||||
/* Bopmofo */
|
||||
if ((c >= 0x3105 && c <= 0x312c))
|
||||
return 1;
|
||||
|
||||
/* Hangul */
|
||||
if ((c >= 0x1100 && c <= 0x1159)
|
||||
|| (c >= 0x1161 && c <= 0x11a2)
|
||||
|| (c >= 0x11a8 && c <= 0x11f9))
|
||||
return 1;
|
||||
|
||||
/* CJK Unified Ideographs */
|
||||
if ((c >= 0xf900 && c <= 0xfa2d)
|
||||
|| (c >= 0xfb1f && c <= 0xfb36)
|
||||
|| (c >= 0xfb38 && c <= 0xfb3c)
|
||||
|| (c == 0xfb3e)
|
||||
|| (c >= 0xfb40 && c <= 0xfb41)
|
||||
|| (c >= 0xfb42 && c <= 0xfb44)
|
||||
|| (c >= 0xfb46 && c <= 0xfbb1)
|
||||
|| (c >= 0xfbd3 && c <= 0xfd3f)
|
||||
|| (c >= 0xfd50 && c <= 0xfd8f)
|
||||
|| (c >= 0xfd92 && c <= 0xfdc7)
|
||||
|| (c >= 0xfdf0 && c <= 0xfdfb)
|
||||
|| (c >= 0xfe70 && c <= 0xfe72)
|
||||
|| (c == 0xfe74)
|
||||
|| (c >= 0xfe76 && c <= 0xfefc)
|
||||
|| (c >= 0xff21 && c <= 0xff3a)
|
||||
|| (c >= 0xff41 && c <= 0xff5a)
|
||||
|| (c >= 0xff66 && c <= 0xffbe)
|
||||
|| (c >= 0xffc2 && c <= 0xffc7)
|
||||
|| (c >= 0xffca && c <= 0xffcf)
|
||||
|| (c >= 0xffd2 && c <= 0xffd7)
|
||||
|| (c >= 0xffda && c <= 0xffdc)
|
||||
|| (c >= 0x4e00 && c <= 0x9fa5))
|
||||
return 1;
|
||||
|
||||
error ("universal-character-name `\\u%04x' not valid in identifier", c);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* Add the UTF-8 representation of C to the token_buffer. */
|
||||
|
||||
static void
|
||||
utf8_extend_token (c)
|
||||
int c;
|
||||
{
|
||||
int shift, mask;
|
||||
|
||||
if (c <= 0x0000007f)
|
||||
{
|
||||
extend_token (c);
|
||||
return;
|
||||
}
|
||||
else if (c <= 0x000007ff)
|
||||
shift = 6, mask = 0xc0;
|
||||
else if (c <= 0x0000ffff)
|
||||
shift = 12, mask = 0xe0;
|
||||
else if (c <= 0x001fffff)
|
||||
shift = 18, mask = 0xf0;
|
||||
else if (c <= 0x03ffffff)
|
||||
shift = 24, mask = 0xf8;
|
||||
else
|
||||
shift = 30, mask = 0xfc;
|
||||
|
||||
extend_token (mask | (c >> shift));
|
||||
do
|
||||
{
|
||||
shift -= 6;
|
||||
extend_token ((unsigned char) (0x80 | (c >> shift)));
|
||||
}
|
||||
while (shift);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define ENDFILE -1 /* token that represents end-of-file */
|
||||
|
||||
/* Read an escape sequence, returning its equivalent as a character,
|
||||
@ -2869,6 +3248,11 @@ readescape (ignore_ptr)
|
||||
put_back (c);
|
||||
return code;
|
||||
|
||||
case 'U':
|
||||
return read_ucs (8);
|
||||
case 'u':
|
||||
return read_ucs (4);
|
||||
|
||||
case '\\': case '\'': case '"':
|
||||
return c;
|
||||
|
||||
@ -3542,8 +3926,8 @@ real_yylex ()
|
||||
case 'z':
|
||||
case '_':
|
||||
case '$':
|
||||
#if USE_CPPLIB
|
||||
letter:
|
||||
#if USE_CPPLIB
|
||||
if (cpp_token == CPP_NAME)
|
||||
{
|
||||
/* Note that one character has already been read from
|
||||
@ -3561,22 +3945,43 @@ real_yylex ()
|
||||
#endif
|
||||
{
|
||||
p = token_buffer;
|
||||
while (ISALNUM (c) || (c == '_') || c == '$')
|
||||
while (1)
|
||||
{
|
||||
/* Make sure this char really belongs in an identifier. */
|
||||
if (c == '$')
|
||||
if (ISALNUM (c) || c == '_')
|
||||
/* OK */;
|
||||
else if (c == '$')
|
||||
{
|
||||
if (! dollars_in_ident)
|
||||
error ("`$' in identifier");
|
||||
else if (pedantic)
|
||||
pedwarn ("`$' in identifier");
|
||||
}
|
||||
/* FIXME we should use some sort of multibyte character
|
||||
encoding. Locale-dependent? Always UTF-8? */
|
||||
else if (is_extended_char (c))
|
||||
{
|
||||
sorry ("universal characters in identifiers");
|
||||
c = '_';
|
||||
}
|
||||
else
|
||||
break;
|
||||
|
||||
if (p >= token_buffer + maxtoken)
|
||||
p = extend_token_buffer (p);
|
||||
|
||||
*p++ = c;
|
||||
|
||||
idtryagain:
|
||||
c = token_getch ();
|
||||
|
||||
if (c == '\\')
|
||||
{
|
||||
int ignore = 0;
|
||||
c = readescape (&ignore);
|
||||
if (ignore)
|
||||
goto idtryagain;
|
||||
}
|
||||
}
|
||||
|
||||
*p = 0;
|
||||
@ -4634,6 +5039,8 @@ real_yylex ()
|
||||
break;
|
||||
|
||||
default:
|
||||
if (is_extended_char (c))
|
||||
goto letter;
|
||||
value = c;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user