gcc/libcpp/ucnid.pl

#! /usr/bin/perl -w
use strict;

# Convert cppucnid.tab to cppucnid.h.  We use two arrays of length
# 65536 to represent the table, since this is nice and simple.  The
# first array holds the tags indicating which ranges are valid in
# which contexts.  The second array holds the language name associated
# with each element.

# Per-code-point tables, indexed by character value 0x0000..0xffff.
# Every slot starts out as the empty string; an empty tag means the
# code point is not a valid identifier character.  process_range fills
# both arrays in and print_table reads them back out.
our(@tags, @names);
@tags = ("") x 65536;
@names = ("") x 65536;


# Array mapping tag numbers to standard #defines
# NOTE(review): @stds is declared here but never referenced in the code
# visible in this file -- confirm whether it can be dropped.
our @stds;

# Current standard and language
# Set from the "[std]" and "; language" header lines of the table
# section, and read by process_range for every range that follows.
our($curstd, $curlang);

# First block of the file is a template to be saved for later.
# One chomped input line per element; it is printed back out once the
# tables have been read.
our @template;

# Read the template: every input line up to (but not including) the
# "%%" separator is saved verbatim, minus its newline, in @template.
#
# The separator is mandatory.  Without this check a missing "%%" would
# run the loop to end-of-file, and the next read from <> would then
# start over on STDIN instead of continuing with the table section.
my $saw_separator = 0;
while (defined(my $line = <>)) {
    chomp $line;
    if ($line eq '%%') {
        $saw_separator = 1;
        last;
    }
    push @template, $line;
}
die "$0: no '%%' separator found between template and UCN tables\n"
    unless $saw_separator;

# Second block of the file is the UCN tables.
# The format looks like this:
#
# [std]
#
# ; language
# xxxx-xxxx xxxx xxxx-xxxx ....
#
# with comment lines starting with #.

# Consume the table section one line at a time.  Comment lines and
# blank lines are dropped, "[std]" and "; language" headers update the
# current standard / language, and every other line is split on
# whitespace and handed to process_range.
LINE:
while (defined(my $entry = <>)) {
    chomp $entry;
    next LINE if $entry =~ /^#/;
    next LINE if $entry =~ /^\s*$/;

    if ($entry =~ /^\[(.+)\]$/) {
        $curstd = $1;
        next LINE;
    }
    if ($entry =~ /^; (.+)$/) {
        $curlang = $1;
        next LINE;
    }

    process_range(split ' ', $entry);
}

# Print out the template, inserting as requested.
# Replay the saved template.  A "[dne]" line is replaced by the
# do-not-edit banner, a "[table]" line by the generated range table,
# and every other line is copied through unchanged.  Setting $\ makes
# each print below end its line with a newline.
$\ = "\n";
foreach my $tline (@template) {
    if ($tline eq "[dne]") {
        print "/* Automatically generated from cppucnid.tab, do not edit */";
    }
    elsif ($tline eq "[table]") {
        print_table();
    }
    else {
        print $tline;
    }
}

# Write the range table to standard output.  Consecutive code points
# that share both tag and language name are collapsed into a single
# "{ first, last, tag }," initializer line.  Code points with an empty
# tag are not valid identifier characters and produce no output.  The
# language name is appended as a C comment only on the first entry of
# each run of entries with the same name.
sub print_table {
    # File-level tables built by process_range.
    our (@tags, @names);

    my $last_name = "";
    my $first = 0;

    while ($first <= $#tags) {
        my ($tag, $name) = ($tags[$first], $names[$first]);

        # Find the end of the run: $past is one beyond its last element.
        my $past = $first + 1;
        $past++
            while $past <= $#tags
                && $tags[$past] eq $tag
                && $names[$past] eq $name;

        if ($tag ne "") {
            # Tags starting with C99 are indented within their column.
            my $label = ($tag =~ /^C99/) ? "    " . $tag : $tag;
            my $note = ($name eq $last_name) ? "" : "  /* " . $name . " */";

            printf("  { 0x%04x, 0x%04x, %-11s },%s\n",
                   $first, $past - 1, $label, $note);
            $last_name = $name;
        }

        $first = $past;
    }
}

# The line is a list of four-digit hexadecimal numbers or
# pairs of such numbers.  Each is a valid identifier character
# from the given language, under the given standard.
#
# Arguments: the whitespace-separated fields of one table line, each
# either "xxxx" or "xxxx-xxxx" (hex digits in either case).
#
# For every code point covered, the current standard is recorded in
# @tags (prepended with "|" to any tag already present) and the current
# language in @names.  Problems are reported with warn and do not stop
# processing:
#   - a field that is not a valid range expression is skipped;
#   - a range whose start exceeds its end is skipped;
#   - a code point already claimed by a different language keeps its
#     original language name (its tag is still extended);
#   - a range list seen before any "[std]" / "; language" header is
#     ignored entirely.
sub process_range {
    # File-level tables and parser state owned by the main program.
    our (@tags, @names, $curstd, $curlang);

    unless (defined $curstd && defined $curlang) {
        warn "range list seen before a [std] and '; language' header";
        return;
    }

    for my $range (@_) {
        # A single code point is treated as a one-element range.
        unless ($range =~ /^ ([0-9a-f]{4}) (?: - ([0-9a-f]{4}) )? $/xi) {
            warn "malformed range expression $range";
            next;
        }
        my $first = hex($1);
        my $last  = defined $2 ? hex($2) : $first;

        if ($first > $last) {
            warn "reversed range expression $range";
            next;
        }

        for my $i ($first .. $last) {
            $tags[$i] = $tags[$i] eq ""
                ? $curstd
                : $curstd . "|" . $tags[$i];

            if ($names[$i] ne "" && $names[$i] ne $curlang) {
                # The tag is a string such as "CXX|C99", so it must be
                # formatted with %s, not %d.
                warn sprintf("language overlap: %s/%s at %x (tag %s)",
                             $names[$i], $curlang, $i, $tags[$i]);
                next;
            }
            $names[$i] = $curlang;
        }
    }
}
cpplib.h (CPP_AT_NAME, [...]): New token types. * cpplib.h (CPP_AT_NAME, CPP_OBJC_STRING): New token types. (struct cpp_options): Add narrow_charset, wide_charset, bytes_big_endian fields. Remove EBCDIC field. (cpp_init_iconv, cpp_interpret_string): New external interfaces. * cpphash.h: Include <iconv.h> if we have it, otherwise provide a dummy definition of iconv_t. (struct cpp_reader): Add narrow_cset_desc and wide_cset_desc fields. (_cpp_valid_ucn): Update prototype. (_cpp_destroy_iconv): New prototype. * doc/cpp.texi: Document character set handling. * doc/cppopts.texi: Document -fexec-charset= and -fexec-wide-charset=. * doc/extend.texi: Delete entire section on multiline strings. Rewrite section on __FUNCTION__ etc now that these are variables in C. * cppucnid.tab, cppucnid.pl: New files. * cppucnid.h: New generated file. * cppcharset.c: Include cppucnid.h. Lots of commentary added. (iconv_open, iconv, iconv_close): Provide dummy definitions if !HAVE_ICONV. (SOURCE_CHARSET, struct strbuf, init_iconv_desc, cpp_init_iconv, _cpp_destroy_iconv, convert_cset, width_to_mask, convert_ucn, emit_numeric_escape, convert_hex, convert_oct, convert_escape, cpp_interpret_string, narrow_str_to_charconst, wide_str_to_charconst): New. (ucn_valid_in_identifier): Use a binary search through the ucnranges table defined in cppucnid.h, not a long chain of if statements. (_cpp_valid_ucn): Add a limit pointer. Downgrade "universal character names are only valid in C++ and C99" to a warning. Issue the "meaning of \[uU] is different in traditional C" warning here. Take care not to let iconv see an invalid UCS value if we get a malformed UCN. Issue an error if we don't have iconv. (cpp_interpret_charconst): Moved here from cpplex.c. Use cpp_interpret_string to do the heavy lifting. * cppinit.c (cpp_create_reader): Initialize bytes_big_endian, narrow_charset, wide_charset fields of options structure. (cpp_destroy): Call _cpp_destroy_iconv. 
* cpplex.c (forms_identifier_p): Adjust call to _cpp_valid_ucn. (maybe_read_ucn, hex_digit_value, cpp_parse_escape): Delete. (cpp_interpret_charconst): Moved to cppcharset.c. * cpplib.c (dequote_string): Delete. (interpret_string_notranslate): New. (do_line, do_linemarker): Use interpret_string_notranslate. * Makefile.in (cppcharset.o): Depend on cppucnid.h. * c-common.c (fname_string, combine_strings): Delete. * c-common.h (fname_string, combine_strings): Delete prototypes. * c-lex.c (ignore_escape_flag): Delete. (cb_ident): Use cpp_interpret_string, not lex_string. (get_nonpadding_token): New function. (c_lex): Handle Objective-C @-prefixed identifiers and strings here. Adjust calls to lex_string. Don't write value twice. (lex_string): Now handles string constant concatenation. Most of the work handed off to cpp_interpret_string. Call fix_string_type here. c-parse.in (STRING_FUNC_NAME, VAR_FUNC_NAME): Replace with FUNC_NAME, throughout. (OBJC_STRING): New token type. (primary:STRING): No need to call fix_string_type here. (primary:objc_string): Make that OBJC_STRING. (objc_string nonterminal): Delete. (yylexname): Delete code to handle fake string constants. (yylexstring): Delete entirely. (_yylex): Handle CPP_AT_NAME and CPP_OBJC_STRING. No need to handle CPP_ATSIGN. * c.opt (-fexec-charset=, -fwide-exec-charset=): New options. * c-opts.c (missing_arg, c_common_handle_option): Handle OPT_fexec_charset_ and OPT_fwide_exec_charset_. (c_common_init): Set cpp_opts->bytes_big_endian, not cpp_opts->EBCDIC. Call cpp_init_iconv. (print_help): Document -fexec-charset= and -fexec-wide-charset=. (TARGET_EBCDIC): Delete default definition. * objc/objc-act.c (build_objc_string_object): No need to handle string constant concatenation. cp: * parser.c (cp_lexer_read_token): No need to handle string constant concatenation. testsuite: * gcc.c-torture/execute/wchar_t-1.x: New file; XFAIL wchar_t-1.c everywhere. 
* gcc.dg/concat.c: Concatenation of string constants with __FUNCTION__ / __PRETTY_FUNCTION__ is now a hard error. * gcc.dg/wtr-strcat-1.c: Loosen dg-warning regexp. * gcc.dg/cpp/escape-2.c: Use wide character constants where necessary to avoid multi-character character constant warning. * gcc.dg/cpp/escape.c: Likewise. * gcc.dg/cpp/ucs.c: Likewise. Remove backslashes from dg-bogus comments, as they confuse Tcl. Fix a typo. libstdc++-v3: * testsuite/22_locale/collate/compare/wchar_t/2.cc * testsuite/22_locale/collate/compare/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/compare/wchar_t/wrapped_locale.cc * testsuite/22_locale/collate/hash/wchar_t/2.cc * testsuite/22_locale/collate/hash/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/hash/wchar_t/wrapped_locale.cc * testsuite/22_locale/collate/transform/wchar_t/2.cc * testsuite/22_locale/collate/transform/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/transform/wchar_t/wrapped_locale.cc: XFAIL on all targets. From-SVN: r68952 2003-07-05 08:24:00 +08:00			`#! /usr/bin/perl -w`
			`use strict;`

			`# Convert cppucnid.tab to cppucnid.h. We use two arrays of length`
			`# 65536 to represent the table, since this is nice and simple. The`
			`# first array holds the tags indicating which ranges are valid in`
			`# which contexts. The second array holds the language name associated`
			`# with each element.`

			`our(@tags, @names);`
			`@tags = ("") x 65536;`
			`@names = ("") x 65536;`


			`# Array mapping tag numbers to standard #defines`
			`our @stds;`

			`# Current standard and language`
			`our($curstd, $curlang);`

			`# First block of the file is a template to be saved for later.`
			`our @template;`

			`while (<>) {`
			`chomp;`
			`last if $_ eq '%%';`
			`push @template, $_;`
			`};`

			`# Second block of the file is the UCN tables.`
			`# The format looks like this:`
			`#`
			`# [std]`
			`#`
			`# ; language`
			`# xxxx-xxxx xxxx xxxx-xxxx ....`
			`#`
			`# with comment lines starting with #.`

			`while (<>) {`
			`chomp;`
			`/^#/ and next;`
			`/^\s*$/ and next;`
			`/^\[(.+)\]$/ and do {`
			`$curstd = $1;`
			`next;`
			`};`
			`/^; (.+)$/ and do {`
			`$curlang = $1;`
			`next;`
			`};`

			`process_range(split);`
			`}`

			`# Print out the template, inserting as requested.`
			`$\ = "\n";`
			`for (@template) {`
			`print("/* Automatically generated from cppucnid.tab, do not edit */"),`
			`next if $_ eq "[dne]";`
			`print_table(), next if $_ eq "[table]";`
			`print;`
			`}`

			`sub print_table {`
			`my($lo, $hi);`
			`my $prevname = "";`

			`for ($lo = 0; $lo <= $#tags; $lo = $hi) {`
			`$hi = $lo;`
			`$hi++ while $hi <= $#tags`
			`&& $tags[$hi] eq $tags[$lo]`
			`&& $names[$hi] eq $names[$lo];`

			`# Range from $lo to $hi-1.`
			`# Don't make entries for ranges that are not valid idchars.`
			`next if ($tags[$lo] eq "");`
			`my $tag = $tags[$lo];`
			`$tag = " ".$tag if $tag =~ /^C99/;`

			`if ($names[$lo] eq $prevname) {`
			`printf(" { 0x%04x, 0x%04x, %-11s },\n",`
			`$lo, $hi-1, $tag);`
			`} else {`
			`printf(" { 0x%04x, 0x%04x, %-11s }, /* %s */\n",`
			`$lo, $hi-1, $tag, $names[$lo]);`
			`}`
			`$prevname = $names[$lo];`
			`}`
			`}`

			`# The line is a list of four-digit hexadecimal numbers or`
			`# pairs of such numbers. Each is a valid identifier character`
			`# from the given language, under the given standard.`
			`sub process_range {`
			`for my $range (@_) {`
			`if ($range =~ /^[0-9a-f]{4}$/) {`
			`my $i = hex($range);`
			`if ($tags[$i] eq "") {`
			`$tags[$i] = $curstd;`
			`} else {`
			`$tags[$i] = $curstd . "\|" . $tags[$i];`
			`}`
			`if ($names[$i] ne "" && $names[$i] ne $curlang) {`
			`warn sprintf ("language overlap: %s/%s at %x (tag %d)",`
			`$names[$i], $curlang, $i, $tags[$i]);`
			`next;`
			`}`
			`$names[$i] = $curlang;`
			`} elsif ($range =~ /^ ([0-9a-f]{4}) - ([0-9a-f]{4}) $/x) {`
			`my ($start, $end) = (hex($1), hex($2));`
			`my $i;`
			`for ($i = $start; $i <= $end; $i++) {`
			`if ($tags[$i] eq "") {`
			`$tags[$i] = $curstd;`
			`} else {`
			`$tags[$i] = $curstd . "\|" . $tags[$i];`
			`}`
			`if ($names[$i] ne "" && $names[$i] ne $curlang) {`
			`warn sprintf ("language overlap: %s/%s at %x (tag %d)",`
			`$names[$i], $curlang, $i, $tags[$i]);`
			`next;`
			`}`
			`$names[$i] = $curlang;`
			`}`
			`} else {`
			`warn "malformed range expression $range";`
			`}`
			`}`
			`}`