mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-01-25 19:44:52 +08:00
libstdc++: Fix Unicode codecvt and add tests [PR86419]
Fixes the conversion from UTF-8 to UTF-16 to properly return partial instead of ok. Fixes the conversion from UTF-16 to UTF-8 to properly return partial instead of ok. Fixes the conversion from UTF-8 to UCS-2 to properly return partial instead of error. Fixes the conversion from UTF-8 to UCS-2 to treat 4-byte UTF-8 sequences as an error just by seeing the leading byte. Fixes UTF-8 decoding for all codecvts so they detect an error at the end of the input range when the last code point is also incomplete. libstdc++-v3/ChangeLog: PR libstdc++/86419 * src/c++11/codecvt.cc (read_utf8_code_point): Correctly detect errors in incomplete multibyte sequences. (utf16_in): Remove surrogates parameter. Fix conditions for returning partial. (utf16_out): Fix condition for returning partial. (ucs2_in): Do not pass surrogates argument to utf16_in. * testsuite/22_locale/codecvt/codecvt_unicode.cc: New test. * testsuite/22_locale/codecvt/codecvt_unicode.h: New header for tests. * testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc: New test.
This commit is contained in:
parent
e2fc12a5da
commit
02dab99866
@ -277,13 +277,15 @@ namespace
|
||||
}
|
||||
else if (c1 < 0xF0) // 3-byte sequence
|
||||
{
|
||||
if (avail < 3)
|
||||
if (avail < 2)
|
||||
return incomplete_mb_character;
|
||||
char32_t c2 = (unsigned char) from[1];
|
||||
if ((c2 & 0xC0) != 0x80)
|
||||
return invalid_mb_sequence;
|
||||
if (c1 == 0xE0 && c2 < 0xA0) // overlong
|
||||
return invalid_mb_sequence;
|
||||
if (avail < 3)
|
||||
return incomplete_mb_character;
|
||||
char32_t c3 = (unsigned char) from[2];
|
||||
if ((c3 & 0xC0) != 0x80)
|
||||
return invalid_mb_sequence;
|
||||
@ -292,9 +294,9 @@ namespace
|
||||
from += 3;
|
||||
return c;
|
||||
}
|
||||
else if (c1 < 0xF5) // 4-byte sequence
|
||||
else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence
|
||||
{
|
||||
if (avail < 4)
|
||||
if (avail < 2)
|
||||
return incomplete_mb_character;
|
||||
char32_t c2 = (unsigned char) from[1];
|
||||
if ((c2 & 0xC0) != 0x80)
|
||||
@ -302,10 +304,14 @@ namespace
|
||||
if (c1 == 0xF0 && c2 < 0x90) // overlong
|
||||
return invalid_mb_sequence;
|
||||
if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
|
||||
return invalid_mb_sequence;
|
||||
return invalid_mb_sequence;
|
||||
if (avail < 3)
|
||||
return incomplete_mb_character;
|
||||
char32_t c3 = (unsigned char) from[2];
|
||||
if ((c3 & 0xC0) != 0x80)
|
||||
return invalid_mb_sequence;
|
||||
if (avail < 4)
|
||||
return incomplete_mb_character;
|
||||
char32_t c4 = (unsigned char) from[3];
|
||||
if ((c4 & 0xC0) != 0x80)
|
||||
return invalid_mb_sequence;
|
||||
@ -527,12 +533,11 @@ namespace
|
||||
// Flag indicating whether to process UTF-16 or UCS2
|
||||
enum class surrogates { allowed, disallowed };
|
||||
|
||||
// utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
|
||||
template<typename C8, typename C16>
|
||||
// utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
|
||||
template <typename C8, typename C16>
|
||||
codecvt_base::result
|
||||
utf16_in(range<const C8>& from, range<C16>& to,
|
||||
unsigned long maxcode = max_code_point, codecvt_mode mode = {},
|
||||
surrogates s = surrogates::allowed)
|
||||
utf16_in(range<const C8> &from, range<C16> &to,
|
||||
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
|
||||
{
|
||||
read_utf8_bom(from, mode);
|
||||
while (from.size() && to.size())
|
||||
@ -540,12 +545,7 @@ namespace
|
||||
auto orig = from;
|
||||
const char32_t codepoint = read_utf8_code_point(from, maxcode);
|
||||
if (codepoint == incomplete_mb_character)
|
||||
{
|
||||
if (s == surrogates::allowed)
|
||||
return codecvt_base::partial;
|
||||
else
|
||||
return codecvt_base::error; // No surrogates in UCS2
|
||||
}
|
||||
return codecvt_base::partial;
|
||||
if (codepoint > maxcode)
|
||||
return codecvt_base::error;
|
||||
if (!write_utf16_code_point(to, codepoint, mode))
|
||||
@ -554,7 +554,7 @@ namespace
|
||||
return codecvt_base::partial;
|
||||
}
|
||||
}
|
||||
return codecvt_base::ok;
|
||||
return from.size() ? codecvt_base::partial : codecvt_base::ok;
|
||||
}
|
||||
|
||||
// utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
|
||||
@ -576,7 +576,7 @@ namespace
|
||||
return codecvt_base::error; // No surrogates in UCS-2
|
||||
|
||||
if (from.size() < 2)
|
||||
return codecvt_base::ok; // stop converting at this point
|
||||
return codecvt_base::partial; // stop converting at this point
|
||||
|
||||
const char32_t c2 = from[1];
|
||||
if (is_low_surrogate(c2))
|
||||
@ -629,7 +629,7 @@ namespace
|
||||
{
|
||||
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
|
||||
maxcode = std::min(max_single_utf16_unit, maxcode);
|
||||
return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
|
||||
return utf16_in(from, to, maxcode, mode);
|
||||
}
|
||||
|
||||
// ucs2 -> utf8
|
||||
|
68
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc
Normal file
68
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc
Normal file
@ -0,0 +1,68 @@
|
||||
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
|
||||
//
|
||||
// This file is part of the GNU ISO C++ Library. This library is free
|
||||
// software; you can redistribute it and/or modify it under the
|
||||
// terms of the GNU General Public License as published by the
|
||||
// Free Software Foundation; either version 3, or (at your option)
|
||||
// any later version.
|
||||
|
||||
// This library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License along
|
||||
// with this library; see the file COPYING3. If not see
|
||||
// <http://www.gnu.org/licenses/>.
|
||||
|
||||
// { dg-do run { target c++11 } }
|
||||
|
||||
#include "codecvt_unicode.h"
|
||||
|
||||
#include <codecvt>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Driver for the UTF-8 <-> UTF-32 checks.  Runs the one-argument overload
// of test_utf8_utf32_codecvts (not visible here; presumably declared in
// codecvt_unicode.h) against two facets:
//   1. the codecvt<char32_t, char, mbstate_t> facet of the classic locale;
//   2. a standalone codecvt_utf8<char32_t> instance.
void
|
||||
test_utf8_utf32_codecvts ()
|
||||
{
|
||||
using codecvt_c32 = codecvt<char32_t, char, mbstate_t>;
|
||||
auto loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c32> (loc_c)); // check the facet exists before fetching it
|
||||
auto &cvt = use_facet<codecvt_c32> (loc_c);
|
||||
test_utf8_utf32_codecvts (cvt);
|
||||
|
||||
// to_unique_ptr is a helper defined outside this file; presumably it takes
// ownership of the new'd facet so it is released at scope exit -- confirm.
auto cvt_ptr = to_unique_ptr (new codecvt_utf8<char32_t> ());
|
||||
test_utf8_utf32_codecvts (*cvt_ptr);
|
||||
}
|
||||
|
||||
// Driver for the UTF-8 <-> UTF-16 checks.  Runs test_utf8_utf16_cvts (not
// visible here; presumably declared in codecvt_unicode.h) against three
// facets:
//   1. the codecvt<char16_t, char, mbstate_t> facet of the classic locale;
//   2. a standalone codecvt_utf8_utf16<char16_t> instance;
//   3. a standalone codecvt_utf8_utf16<char32_t> instance.
void
|
||||
test_utf8_utf16_codecvts ()
|
||||
{
|
||||
using codecvt_c16 = codecvt<char16_t, char, mbstate_t>;
|
||||
auto loc_c = locale::classic ();
|
||||
VERIFY (has_facet<codecvt_c16> (loc_c)); // check the facet exists before fetching it
|
||||
auto &cvt = use_facet<codecvt_c16> (loc_c);
|
||||
test_utf8_utf16_cvts (cvt);
|
||||
|
||||
// Same checks with UTF-16 code units stored in char16_t elements.
auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16<char16_t> ());
|
||||
test_utf8_utf16_cvts (*cvt_ptr);
|
||||
|
||||
// Same checks with char32_t as the internal element type of the
// codecvt_utf8_utf16 facet.
auto cvt_ptr2 = to_unique_ptr (new codecvt_utf8_utf16<char32_t> ());
|
||||
test_utf8_utf16_cvts (*cvt_ptr2);
|
||||
}
|
||||
|
||||
// Driver for the UTF-8 <-> UCS-2 checks.  Runs test_utf8_ucs2_cvts (not
// visible here; presumably declared in codecvt_unicode.h) against a
// standalone codecvt_utf8<char16_t> instance.
void
|
||||
test_utf8_ucs2_codecvts ()
|
||||
{
|
||||
auto cvt_ptr = to_unique_ptr (new codecvt_utf8<char16_t> ());
|
||||
test_utf8_ucs2_cvts (*cvt_ptr);
|
||||
}
|
||||
|
||||
// Test entry point: runs the UTF-32, UTF-16 and UCS-2 drivers in that order.
// There is no explicit return statement, so main returns 0 when control
// reaches the end.
int
|
||||
main ()
|
||||
{
|
||||
test_utf8_utf32_codecvts ();
|
||||
test_utf8_utf16_codecvts ();
|
||||
test_utf8_ucs2_codecvts ();
|
||||
}
|
1269
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h
Normal file
1269
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,59 @@
|
||||
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
|
||||
//
|
||||
// This file is part of the GNU ISO C++ Library. This library is free
|
||||
// software; you can redistribute it and/or modify it under the
|
||||
// terms of the GNU General Public License as published by the
|
||||
// Free Software Foundation; either version 3, or (at your option)
|
||||
// any later version.
|
||||
|
||||
// This library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License along
|
||||
// with this library; see the file COPYING3. If not see
|
||||
// <http://www.gnu.org/licenses/>.
|
||||
|
||||
// { dg-do run { target c++11 } }
|
||||
|
||||
#include "codecvt_unicode.h"
|
||||
|
||||
#include <codecvt>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// wchar_t driver for the UTF-8 <-> UTF-32 checks.  The body is compiled
// only when __SIZEOF_WCHAR_T__ is 4; otherwise this function does nothing.
// It runs the one-argument overload of test_utf8_utf32_codecvts (not
// visible here; presumably declared in codecvt_unicode.h) against a
// standalone codecvt_utf8<wchar_t> instance.
void
|
||||
test_utf8_utf32_codecvts ()
|
||||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 4
|
||||
auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ());
|
||||
test_utf8_utf32_codecvts (*cvt_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
// wchar_t driver for the UTF-8 <-> UTF-16 checks.  The body is compiled
// when __SIZEOF_WCHAR_T__ is at least 2; otherwise this function does
// nothing.  It runs test_utf8_utf16_cvts (not visible here; presumably
// declared in codecvt_unicode.h) against a standalone
// codecvt_utf8_utf16<wchar_t> instance.
void
|
||||
test_utf8_utf16_codecvts ()
|
||||
{
|
||||
#if __SIZEOF_WCHAR_T__ >= 2
|
||||
auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16<wchar_t> ());
|
||||
test_utf8_utf16_cvts (*cvt_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
// wchar_t driver for the UTF-8 <-> UCS-2 checks.  The body is compiled
// only when __SIZEOF_WCHAR_T__ is 2; otherwise this function does nothing.
// It runs test_utf8_ucs2_cvts (not visible here; presumably declared in
// codecvt_unicode.h) against a standalone codecvt_utf8<wchar_t> instance.
void
|
||||
test_utf8_ucs2_codecvts ()
|
||||
{
|
||||
#if __SIZEOF_WCHAR_T__ == 2
|
||||
auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ());
|
||||
test_utf8_ucs2_cvts (*cvt_ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Test entry point for the wchar_t variants: runs the UTF-32, UTF-16 and
// UCS-2 drivers in that order.  Each driver guards its body with a
// __SIZEOF_WCHAR_T__ check, so some of them may be no-ops on a given target.
int
|
||||
main ()
|
||||
{
|
||||
test_utf8_utf32_codecvts ();
|
||||
test_utf8_utf16_codecvts ();
|
||||
test_utf8_ucs2_codecvts ();
|
||||
}
|
Loading…
Reference in New Issue
Block a user