diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt index 4515664aa590..1c7f89eeb94e 100644 --- a/gcc/c-family/c.opt +++ b/gcc/c-family/c.opt @@ -822,8 +822,8 @@ C ObjC C++ ObjC++ CPP(warn_invalid_pch) CppReason(CPP_W_INVALID_PCH) Var(cpp_war Warn about PCH files that are found but not used. Winvalid-utf8 -C objC C++ ObjC++ CPP(cpp_warn_invalid_utf8) CppReason(CPP_W_INVALID_UTF8) Var(warn_invalid_utf8) Init(0) Warning -Warn about invalid UTF-8 characters in comments. +C ObjC C++ ObjC++ CPP(cpp_warn_invalid_utf8) CppReason(CPP_W_INVALID_UTF8) Var(warn_invalid_utf8) Init(0) Warning +Warn about invalid UTF-8 characters. Wjump-misses-init C ObjC Var(warn_jump_misses_init) Warning LangEnabledby(C ObjC,Wc++-compat) @@ -1345,6 +1345,10 @@ Wundef C ObjC C++ ObjC++ CPP(warn_undef) CppReason(CPP_W_UNDEF) Var(cpp_warn_undef) Init(0) Warning Warn if an undefined macro is used in an #if directive. +Wunicode +C ObjC C++ ObjC++ CPP(cpp_warn_unicode) CppReason(CPP_W_UNICODE) Var(warn_unicode) Init(1) Warning +Warn about invalid forms of delimited or named escape sequences. + Wuninitialized C ObjC C++ ObjC++ LTO LangEnabledBy(C ObjC C++ ObjC++ LTO,Wall) ; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9d662e353163..cc631dfa2617 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -365,7 +365,7 @@ Objective-C and Objective-C++ Dialects}. -Winfinite-recursion @gol -Winit-self -Winline -Wno-int-conversion -Wint-in-bool-context @gol -Wno-int-to-pointer-cast -Wno-invalid-memory-model @gol --Winvalid-pch -Winvalid-utf8 -Wjump-misses-init @gol +-Winvalid-pch -Winvalid-utf8 -Wno-unicode -Wjump-misses-init @gol -Wlarger-than=@var{byte-size} -Wlogical-not-parentheses -Wlogical-op @gol -Wlong-long -Wno-lto-type-mismatch -Wmain -Wmaybe-uninitialized @gol -Wmemset-elt-size -Wmemset-transposed-args @gol @@ -9578,6 +9578,12 @@ Warn if an invalid UTF-8 character is found. 
This warning is on by default for C++23 if @option{-finput-charset=UTF-8} is used and turned into error with @option{-pedantic-errors}. +@item -Wno-unicode +@opindex Wunicode +@opindex Wno-unicode +Don't diagnose invalid forms of delimited or named escape sequences which are +treated as separate tokens. @option{-Wunicode} is enabled by default. + @item -Wlong-long @opindex Wlong-long @opindex Wno-long-long diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c new file mode 100644 index 000000000000..107051fcce38 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-4.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=gnu++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-warning "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c new file mode 100644 index 000000000000..e04f519674ed --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-5.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-warning "empty delimited escape sequence; treating it as separate tokens" "" { target c++23 } 
} */ +int c = a\u{); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" "" { target c++23 } } */ +int d = a\u{12XYZ}); /* { dg-warning "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" "" { target c++23 } } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c new file mode 100644 index 000000000000..f2a4e9390940 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-6.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=gnu++20 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-bogus "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c new file mode 100644 index 000000000000..e2f0da4e4d4d --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-7.c @@ -0,0 +1,13 @@ +/* P2290R3 - Delimited escape sequences */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=c++23 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\u{}); /* { dg-bogus "empty delimited escape sequence; treating it as separate tokens" } */ +int c = a\u{); /* { dg-bogus 
"'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{; treating it as separate tokens" } */ +int d = a\u{12XYZ}); /* { dg-bogus "'\\\\u\\\{' not terminated with '\\\}' after \\\\u\\\{12; treating it as separate tokens" } */ +int e = a\u123); +int f = a\U1234567); diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c new file mode 100644 index 000000000000..a1c53c7f649a --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-5.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=gnu++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); /* { dg-warning "empty named universal character escape sequence; treating it as separate tokens" } */ +int c = a\N{); /* { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } */ +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); /* { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } */ +int g = a\N{ABC.123}); /* { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" } */ +int h = a\N{NON-EXISTENT CHAR}); /* { dg-warning "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); /* { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } */ + /* { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" 
"" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c new file mode 100644 index 000000000000..a6a5a102aac4 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-6.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=c17 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++20" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); +int c = a\N{); +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); +int g = a\N{ABC.123}); +int h = a\N{NON-EXISTENT CHAR}); /* { dg-bogus "is not a valid universal character" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); +int j = a\N{LATIN SMALL LETTER A WITH ACUTE}); diff --git a/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c new file mode 100644 index 000000000000..e6142bff63cd --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-7.c @@ -0,0 +1,17 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -Wno-unicode" { target c } } */ +/* { dg-options "-std=gnu++20 -Wno-unicode" { target c++ } } */ + +#define z(x) 0 +#define a z( +int b = a\N{}); /* { dg-bogus "empty named universal character escape sequence; treating it as separate tokens" } */ +int c = a\N{); /* { dg-bogus "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } */ +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); /* { dg-bogus "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } */ +int g = a\N{ABC.123}); /* { dg-bogus "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate 
tokens" } */ +int h = a\N{NON-EXISTENT CHAR}); /* { dg-bogus "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" } */ +int i = a\N{Latin_Small_Letter_A_With_Acute}); /* { dg-bogus "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } */ + /* { dg-bogus "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" "" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C new file mode 100644 index 000000000000..fe494824ed48 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape1.C @@ -0,0 +1,16 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile } +// { dg-require-effective-target wchar } + +#define z(x) 0 +#define a z( +int b = a\N{}); // { dg-warning "empty named universal character escape sequence; treating it as separate tokens" "" { target c++23 } } +int c = a\N{); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" "" { target c++23 } } +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); // { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" "" { target c++23 } } +int g = a\N{ABC.123}); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" "" { target c++23 } } +int h = a\N{NON-EXISTENT CHAR}); // { dg-error "is not a valid universal character" "" { target c++23 } } + // { dg-error "was not declared in this scope" "" { target c++23 } .-1 } +int i = a\N{Latin_Small_Letter_A_With_Acute}); // { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" "" { target c++23 } } + // { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" 
"" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C new file mode 100644 index 000000000000..8699e098c887 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/named-universal-char-escape2.C @@ -0,0 +1,18 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile } +// { dg-require-effective-target wchar } +// { dg-options "" } + +#define z(x) 0 +#define a z( +int b = a\N{}); // { dg-warning "empty named universal character escape sequence; treating it as separate tokens" } +int c = a\N{); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{; treating it as separate tokens" } +int d = a\N); +int e = a\NARG); +int f = a\N{abc}); // { dg-warning "\\\\N\\\{abc\\\} is not a valid universal character; treating it as separate tokens" } +int g = a\N{ABC.123}); // { dg-warning "'\\\\N\\\{' not terminated with '\\\}' after \\\\N\\\{ABC; treating it as separate tokens" } +int h = a\N{NON-EXISTENT CHAR}); // { dg-error "is not a valid universal character" "" { target c++23 } } + // { dg-error "was not declared in this scope" "" { target c++23 } .-1 } + // { dg-warning "\\\\N\\\{NON-EXISTENT CHAR\\\} is not a valid universal character; treating it as separate tokens" "" { target c++20_down } .-2 } +int i = a\N{Latin_Small_Letter_A_With_Acute}); // { dg-warning "\\\\N\\\{Latin_Small_Letter_A_With_Acute\\\} is not a valid universal character; treating it as separate tokens" } + // { dg-message "did you mean \\\\N\\\{LATIN SMALL LETTER A WITH ACUTE\\\}\\?" 
"" { target *-*-* } .-1 } diff --git a/libcpp/charset.cc b/libcpp/charset.cc index c9656dbbe156..6834969a919f 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1448,7 +1448,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str[-1] == 'u') { length = 4; - if (str < limit && *str == '{') + if (str < limit + && *str == '{' + && (!identifier_pos + || CPP_OPTION (pfile, delimited_escape_seqs) + || !CPP_OPTION (pfile, std))) { str++; /* Magic value to indicate no digits seen. */ @@ -1462,8 +1466,22 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, else if (str[-1] == 'N') { length = 4; + if (identifier_pos + && !CPP_OPTION (pfile, delimited_escape_seqs) + && CPP_OPTION (pfile, std)) + { + *cp = 0; + return false; + } if (str == limit || *str != '{') - cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + { + if (identifier_pos) + { + *cp = 0; + return false; + } + cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + } else { str++; @@ -1489,15 +1507,19 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, if (str < limit && *str == '}') { - if (name == str && identifier_pos) + if (identifier_pos && name == str) { + cpp_warning (pfile, CPP_W_UNICODE, + "empty named universal character escape " + "sequence; treating it as separate tokens"); *cp = 0; return false; } if (name == str) cpp_error (pfile, CPP_DL_ERROR, "empty named universal character escape sequence"); - else if (!CPP_OPTION (pfile, delimited_escape_seqs) + else if ((!identifier_pos || strict) + && !CPP_OPTION (pfile, delimited_escape_seqs) && CPP_OPTION (pfile, cpp_pedantic)) cpp_error (pfile, CPP_DL_PEDWARN, "named universal character escapes are only valid " @@ -1515,27 +1537,51 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, uname2c_tree, NULL); if (result == (cppchar_t) -1) { - cpp_error (pfile, CPP_DL_ERROR, - "\\N{%.*s} is not a valid universal " - "character", (int) (str - name), name); + bool ret = true; + if (identifier_pos + && 
(!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + ret = cpp_warning (pfile, CPP_W_UNICODE, + "\\N{%.*s} is not a valid " + "universal character; treating it " + "as separate tokens", + (int) (str - name), name); + else + cpp_error (pfile, CPP_DL_ERROR, + "\\N{%.*s} is not a valid universal " + "character", (int) (str - name), name); /* Try to do a loose name lookup according to Unicode loose matching rule UAX44-LM2. */ char canon_name[uname2c_max_name_len + 1]; result = _cpp_uname2c_uax44_lm2 ((const char *) name, str - name, canon_name); - if (result != (cppchar_t) -1) + if (result != (cppchar_t) -1 && ret) cpp_error (pfile, CPP_DL_NOTE, "did you mean \\N{%s}?", canon_name); else - result = 0x40; + result = 0xC0; + if (identifier_pos + && (!CPP_OPTION (pfile, delimited_escape_seqs) + || !strict)) + { + *cp = 0; + return false; + } } } str++; extend_char_range (char_range, loc_reader); } else if (identifier_pos) - length = 1; + { + cpp_warning (pfile, CPP_W_UNICODE, + "'\\N{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); + *cp = 0; + return false; + } else { cpp_error (pfile, CPP_DL_ERROR, @@ -1584,12 +1630,17 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, } while (--length); - if (delimited - && str < limit - && *str == '}' - && (length != 32 || !identifier_pos)) + if (delimited && str < limit && *str == '}') { - if (length == 32) + if (length == 32 && identifier_pos) + { + cpp_warning (pfile, CPP_W_UNICODE, + "empty delimited escape sequence; " + "treating it as separate tokens"); + *cp = 0; + return false; + } + else if (length == 32) cpp_error (pfile, CPP_DL_ERROR, "empty delimited escape sequence"); else if (!CPP_OPTION (pfile, delimited_escape_seqs) @@ -1607,6 +1658,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, error message in that case. 
*/ if (length && identifier_pos) { + if (delimited) + cpp_warning (pfile, CPP_W_UNICODE, + "'\\u{' not terminated with '}' after %.*s; " + "treating it as separate tokens", + (int) (str - base), base); *cp = 0; return false; } diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 1a3fb19d7979..c25bcf215e02 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -565,6 +565,10 @@ struct cpp_options 2 if it should be a pedwarn. */ unsigned char cpp_warn_invalid_utf8; + /* True if libcpp should warn about invalid forms of delimited or named + escape sequences. */ + bool cpp_warn_unicode; + /* True if -finput-charset= option has been used explicitly. */ bool cpp_input_charset_explicit; @@ -675,7 +679,8 @@ enum cpp_warning_reason { CPP_W_CXX20_COMPAT, CPP_W_EXPANSION_TO_DEFINED, CPP_W_BIDIRECTIONAL, - CPP_W_INVALID_UTF8 + CPP_W_INVALID_UTF8, + CPP_W_UNICODE }; /* Callback for header lookup for HEADER, which is the name of a diff --git a/libcpp/init.cc b/libcpp/init.cc index 3e5601a5d962..629252445e12 100644 --- a/libcpp/init.cc +++ b/libcpp/init.cc @@ -228,6 +228,7 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table, CPP_OPTION (pfile, warn_date_time) = 0; CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired; CPP_OPTION (pfile, cpp_warn_invalid_utf8) = 0; + CPP_OPTION (pfile, cpp_warn_unicode) = 1; CPP_OPTION (pfile, cpp_input_charset_explicit) = 0; /* Default CPP arithmetic to something sensible for the host for the