From 36d20fa4a83d1a294462c2622ca76eac93465c2c Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Fri, 14 Oct 2022 23:07:50 +0000 Subject: [PATCH] preprocessor: C2x identifier rules C2x has, like C++, adopted rules for identifiers based directly on an unversioned normative reference to Unicode. Make libcpp follow those rules for c2x / gnu2x standards (this involves bringing back a flag separate from the C++ one for whether to use these identifier rules, but this time enabled for all C++ language versions since that was the conclusion adopted for C++ identifier handling). There is one change here that affects C++. I believe the new normative requirement for NFC only applies to identifiers, not to the use of identifier-continue characters in pp-numbers, where there is no such requirement and so the diagnostic ought to be a warning not a pedwarn in pp-numbers, and that this is the case for both C and C++. Bootstrapped with no regressions for x86_64-pc-linux-gnu. libcpp/ * charset.cc (ucn_valid_in_identifier): Check xid_identifiers not cplusplus to determine whether to use CXX23 and NXX23 flags. * include/cpplib.h (struct cpp_options): Add xid_identifiers. * init.cc (struct lang_flags, lang_defaults): Add xid_identifiers. (cpp_set_lang): Set xid_identifiers. * lex.cc (warn_about_normalization): Add parameter identifier. Only pedwarn about non-NFC for identifiers, not pp-numbers. (_cpp_lex_direct): Update calls to warn_about_normalization. gcc/testsuite/ * gcc.dg/cpp/c2x-ucnid-1-utf8.c, gcc.dg/cpp/c2x-ucnid-1.c: New tests. --- gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c | 13 ++++++ gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c | 13 ++++++ libcpp/charset.cc | 4 +- libcpp/include/cpplib.h | 4 ++ libcpp/init.cc | 52 +++++++++++---------- libcpp/lex.cc | 13 +++--- 6 files changed, 66 insertions(+), 33 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c create mode 100644 gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c diff --git a/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c new file mode 100644 index 000000000000..55d228195632 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c @@ -0,0 +1,13 @@ +/* Test C2x (= Unicode) rules for characters in identifiers. */ +/* { dg-do preprocess } */ +/* { dg-options "-std=c2x -pedantic-errors" } */ + +¨ + +/* The requirement for NFC only applies in identifiers, not pp-numbers. */ + +À /* { dg-error "not in NFC" } */ +ÿÀ /* { dg-error "not in NFC" } */ + +0À /* { dg-warning "not in NFC" } */ +.1À /* { dg-warning "not in NFC" } */ diff --git a/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c new file mode 100644 index 000000000000..f9fdbea6eced --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c @@ -0,0 +1,13 @@ +/* Test C2x (= Unicode) rules for characters in identifiers. */ +/* { dg-do preprocess } */ +/* { dg-options "-std=c2x -pedantic-errors" } */ + +\u00A8 /* { dg-error "is not valid in an identifier" } */ + +/* The requirement for NFC only applies in identifiers, not pp-numbers. */ + +A\u0300 /* { dg-error "not in NFC" } */ +\u00ffA\u0300 /* { dg-error "not in NFC" } */ + +0A\u0300 /* { dg-warning "not in NFC" } */ +.1A\u0300 /* { dg-warning "not in NFC" } */ diff --git a/libcpp/charset.cc b/libcpp/charset.cc index 6834969a919f..12a398e7527d 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1291,7 +1291,7 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, valid_flags = C99 | CXX | C11 | CXX23; if (CPP_PEDANTIC (pfile)) { - if (CPP_OPTION (pfile, cplusplus)) + if (CPP_OPTION (pfile, xid_identifiers)) valid_flags = CXX23; else if (CPP_OPTION (pfile, c11_identifiers)) valid_flags = C11; @@ -1355,7 +1355,7 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, return 2; } - if (CPP_OPTION (pfile, cplusplus)) + if (CPP_OPTION (pfile, xid_identifiers)) invalid_start_flags = NXX23; else if (CPP_OPTION (pfile, c11_identifiers)) invalid_start_flags = N11; diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index e97993e04bcb..d5ef12a30ead 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -496,6 +496,10 @@ struct cpp_options in C11. */ unsigned char c11_identifiers; + /* Nonzero means extended identifiers allow the characters specified + by Unicode XID_Start and XID_Continue properties. */ + unsigned char xid_identifiers; + /* Nonzero for C++ 2014 Standard binary constants. */ unsigned char binary_constants; diff --git a/libcpp/init.cc b/libcpp/init.cc index d3b4f00994b5..5f34e3515d2f 100644 --- a/libcpp/init.cc +++ b/libcpp/init.cc @@ -82,6 +82,7 @@ struct lang_flags char extended_numbers; char extended_identifiers; char c11_identifiers; + char xid_identifiers; char std; char digraphs; char uliterals; @@ -102,31 +103,31 @@ struct lang_flags }; static const struct lang_flags lang_defaults[] = -{ /* c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef warndir delim trufal */ - /* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, - /* GNUC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, - /* GNUC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, - /* GNUC17 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, - /* GNUC2X */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1 }, - /* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - /* STDC94 */ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - /* STDC99 */ { 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - /* STDC11 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - /* STDC17 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - /* STDC2X */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1 }, - /* GNUCXX */ { 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, - /* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* CXX11 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, - /* GNUCXX14 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* CXX14 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, - /* GNUCXX17 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* CXX17 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1 }, - /* GNUCXX20 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* CXX20 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, - /* GNUCXX23 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 }, - /* CXX23 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 }, - /* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +{ /* c99 c++ xnum xid c11 xidid std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef warndir delim trufal */ + /* GNUC89 */ { 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, + /* GNUC99 */ { 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, + /* GNUC11 */ { 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, + /* GNUC17 */ { 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, + /* GNUC2X */ { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1 }, + /* STDC89 */ { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + /* STDC94 */ { 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + /* STDC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + /* STDC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + /* STDC17 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + /* STDC2X */ { 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1 }, + /* GNUCXX */ { 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, + /* GNUCXX11 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* CXX11 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, + /* GNUCXX14 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* CXX14 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1 }, + /* GNUCXX17 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* CXX17 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1 }, + /* GNUCXX20 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* CXX20 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, + /* GNUCXX23 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 }, + /* CXX23 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 }, + /* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; /* Sets internal flags correctly for a given language. */ @@ -142,6 +143,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang) CPP_OPTION (pfile, extended_numbers) = l->extended_numbers; CPP_OPTION (pfile, extended_identifiers) = l->extended_identifiers; CPP_OPTION (pfile, c11_identifiers) = l->c11_identifiers; + CPP_OPTION (pfile, xid_identifiers) = l->xid_identifiers; CPP_OPTION (pfile, std) = l->std; CPP_OPTION (pfile, digraphs) = l->digraphs; CPP_OPTION (pfile, uliterals) = l->uliterals; diff --git a/libcpp/lex.cc b/libcpp/lex.cc index a429a3d44cee..cc12a52d2829 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -2007,7 +2007,8 @@ name_p (cpp_reader *pfile, const cpp_string *string) static void warn_about_normalization (cpp_reader *pfile, const cpp_token *token, - const struct normalize_state *s) + const struct normalize_state *s, + bool identifier) { if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) && !pfile->state.skipping) @@ -2043,7 +2044,7 @@ warn_about_normalization (cpp_reader *pfile, if (NORMALIZE_STATE_RESULT (s) == normalized_C) cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, "`%.*s' is not in NFKC", (int) sz, buf); - else if (CPP_OPTION (pfile, cplusplus)) + else if (identifier && CPP_OPTION (pfile, xid_identifiers)) cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc, "`%.*s' is not in NFC", (int) sz, buf); else @@ -3839,7 +3840,7 @@ _cpp_lex_direct (cpp_reader *pfile) struct normalize_state nst = INITIAL_NORMALIZE_STATE; result->type = CPP_NUMBER; lex_number (pfile, &result->val.str, &nst); - warn_about_normalization (pfile, result, &nst); + warn_about_normalization (pfile, result, &nst, false); break; } @@ -3888,7 +3889,7 @@ _cpp_lex_direct (cpp_reader *pfile) result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false, &nst, &result->val.node.spelling); - warn_about_normalization (pfile, result, &nst); + warn_about_normalization (pfile, result, &nst, true); } /* Convert named operators to their proper types. */ @@ -4101,7 +4102,7 @@ _cpp_lex_direct (cpp_reader *pfile) struct normalize_state nst = INITIAL_NORMALIZE_STATE; result->type = CPP_NUMBER; lex_number (pfile, &result->val.str, &nst); - warn_about_normalization (pfile, result, &nst); + warn_about_normalization (pfile, result, &nst, false); } else if (*buffer->cur == '.' && buffer->cur[1] == '.') buffer->cur += 2, result->type = CPP_ELLIPSIS; @@ -4192,7 +4193,7 @@ _cpp_lex_direct (cpp_reader *pfile) result->type = CPP_NAME; result->val.node.node = lex_identifier (pfile, base, true, &nst, &result->val.node.spelling); - warn_about_normalization (pfile, result, &nst); + warn_about_normalization (pfile, result, &nst, true); break; }