libstdc++: Fix Unicode codecvt and add tests [PR86419]

Fixes the conversion from UTF-8 to UTF-16 to properly return partial
instead of ok.
Fixes the conversion from UTF-16 to UTF-8 to properly return partial
instead of ok.
Fixes the conversion from UTF-8 to UCS-2 to properly return partial
instead of error.
Fixes the conversion from UTF-8 to UCS-2 to treat 4-byte UTF-8 sequences
as error just by seeing the leading byte.
Fixes UTF-8 decoding for all codecvts so they detect an error at the end
of the input range even when the last code point is also incomplete.

libstdc++-v3/ChangeLog:

	PR libstdc++/86419
	* src/c++11/codecvt.cc (read_utf8_code_point): Correctly detect
	errors in incomplete multibyte sequences.
	(utf16_in): Remove surrogates parameter. Fix conditions for
	returning partial.
	(utf16_out): Fix condition for returning partial.
	(ucs2_in): Do not pass surrogates argument to utf16_in.
	* testsuite/22_locale/codecvt/codecvt_unicode.cc: New test.
	* testsuite/22_locale/codecvt/codecvt_unicode.h: New header for
	tests.
	* testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc: New
	test.
This commit is contained in:
Dimitrij Mijoski 2023-01-10 13:58:59 +01:00 committed by Jonathan Wakely
parent e2fc12a5da
commit 02dab99866
4 changed files with 1414 additions and 18 deletions

View File

@ -277,13 +277,15 @@ namespace
}
else if (c1 < 0xF0) // 3-byte sequence
{
if (avail < 3)
if (avail < 2)
return incomplete_mb_character;
char32_t c2 = (unsigned char) from[1];
if ((c2 & 0xC0) != 0x80)
return invalid_mb_sequence;
if (c1 == 0xE0 && c2 < 0xA0) // overlong
return invalid_mb_sequence;
if (avail < 3)
return incomplete_mb_character;
char32_t c3 = (unsigned char) from[2];
if ((c3 & 0xC0) != 0x80)
return invalid_mb_sequence;
@ -292,9 +294,9 @@ namespace
from += 3;
return c;
}
else if (c1 < 0xF5) // 4-byte sequence
else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence
{
if (avail < 4)
if (avail < 2)
return incomplete_mb_character;
char32_t c2 = (unsigned char) from[1];
if ((c2 & 0xC0) != 0x80)
@ -302,10 +304,14 @@ namespace
if (c1 == 0xF0 && c2 < 0x90) // overlong
return invalid_mb_sequence;
if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
return invalid_mb_sequence;
return invalid_mb_sequence;
if (avail < 3)
return incomplete_mb_character;
char32_t c3 = (unsigned char) from[2];
if ((c3 & 0xC0) != 0x80)
return invalid_mb_sequence;
if (avail < 4)
return incomplete_mb_character;
char32_t c4 = (unsigned char) from[3];
if ((c4 & 0xC0) != 0x80)
return invalid_mb_sequence;
@ -527,12 +533,11 @@ namespace
// Flag indicating whether to process UTF-16 or UCS2
enum class surrogates { allowed, disallowed };
// utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
template<typename C8, typename C16>
// utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
template <typename C8, typename C16>
codecvt_base::result
utf16_in(range<const C8>& from, range<C16>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {},
surrogates s = surrogates::allowed)
utf16_in(range<const C8> &from, range<C16> &to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
read_utf8_bom(from, mode);
while (from.size() && to.size())
@ -540,12 +545,7 @@ namespace
auto orig = from;
const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == incomplete_mb_character)
{
if (s == surrogates::allowed)
return codecvt_base::partial;
else
return codecvt_base::error; // No surrogates in UCS2
}
return codecvt_base::partial;
if (codepoint > maxcode)
return codecvt_base::error;
if (!write_utf16_code_point(to, codepoint, mode))
@ -554,7 +554,7 @@ namespace
return codecvt_base::partial;
}
}
return codecvt_base::ok;
return from.size() ? codecvt_base::partial : codecvt_base::ok;
}
// utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
@ -576,7 +576,7 @@ namespace
return codecvt_base::error; // No surrogates in UCS-2
if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point
return codecvt_base::partial; // stop converting at this point
const char32_t c2 = from[1];
if (is_low_surrogate(c2))
@ -629,7 +629,7 @@ namespace
{
// UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
maxcode = std::min(max_single_utf16_unit, maxcode);
return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
return utf16_in(from, to, maxcode, mode);
}
// ucs2 -> utf8

View File

@ -0,0 +1,68 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include "codecvt_unicode.h"
#include <codecvt>
using namespace std;
void
test_utf8_utf32_codecvts ()
{
using codecvt_c32 = codecvt<char32_t, char, mbstate_t>;
auto loc_c = locale::classic ();
VERIFY (has_facet<codecvt_c32> (loc_c));
auto &cvt = use_facet<codecvt_c32> (loc_c);
test_utf8_utf32_codecvts (cvt);
auto cvt_ptr = to_unique_ptr (new codecvt_utf8<char32_t> ());
test_utf8_utf32_codecvts (*cvt_ptr);
}
// Pass three facets to test_utf8_utf16_cvts (declared outside this file;
// presumably in codecvt_unicode.h): the codecvt<char16_t, char, mbstate_t>
// facet of the classic locale, a codecvt_utf8_utf16<char16_t> and a
// codecvt_utf8_utf16<char32_t>.
void
test_utf8_utf16_codecvts ()
{
  // The alias keeps the template-argument commas out of the VERIFY argument.
  using facet_type = std::codecvt<char16_t, char, std::mbstate_t>;

  // Check that the classic locale has the facet before fetching it.
  const auto classic_loc = std::locale::classic ();
  VERIFY (std::has_facet<facet_type> (classic_loc));
  const auto &locale_facet = std::use_facet<facet_type> (classic_loc);
  test_utf8_utf16_cvts (locale_facet);

  // Same conversion facet, instantiated with a 16-bit internal character type.
  auto facet_c16 = to_unique_ptr (new std::codecvt_utf8_utf16<char16_t> ());
  test_utf8_utf16_cvts (*facet_c16);

  // ... and with a 32-bit internal character type.
  auto facet_c32 = to_unique_ptr (new std::codecvt_utf8_utf16<char32_t> ());
  test_utf8_utf16_cvts (*facet_c32);
}
// Pass a freshly allocated codecvt_utf8<char16_t> to test_utf8_ucs2_cvts
// (declared outside this file; presumably in codecvt_unicode.h).
void
test_utf8_ucs2_codecvts ()
{
  auto ucs2_facet = to_unique_ptr (new std::codecvt_utf8<char16_t> ());
  test_utf8_ucs2_cvts (*ucs2_facet);
}
// Test driver: runs the UTF-32, UTF-16 and UCS-2 groups in that order.
int
main ()
{
  test_utf8_utf32_codecvts ();
  test_utf8_utf16_codecvts ();
  test_utf8_ucs2_codecvts ();
  return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,59 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// { dg-do run { target c++11 } }
#include "codecvt_unicode.h"
#include <codecvt>
using namespace std;
// When wchar_t is 4 bytes wide, pass a codecvt_utf8<wchar_t> to the
// one-argument test_utf8_utf32_codecvts overload (declared outside this
// file; presumably in codecvt_unicode.h).  Otherwise this is a no-op.
void
test_utf8_utf32_codecvts ()
{
#if __SIZEOF_WCHAR_T__ == 4
  auto wide_facet = to_unique_ptr (new std::codecvt_utf8<wchar_t> ());
  test_utf8_utf32_codecvts (*wide_facet);
#endif
}
// When wchar_t is at least 2 bytes wide, pass a codecvt_utf8_utf16<wchar_t>
// to test_utf8_utf16_cvts (declared outside this file; presumably in
// codecvt_unicode.h).  Otherwise this is a no-op.
void
test_utf8_utf16_codecvts ()
{
#if __SIZEOF_WCHAR_T__ >= 2
  auto wide_facet = to_unique_ptr (new std::codecvt_utf8_utf16<wchar_t> ());
  test_utf8_utf16_cvts (*wide_facet);
#endif
}
// When wchar_t is exactly 2 bytes wide, pass a codecvt_utf8<wchar_t> to
// test_utf8_ucs2_cvts (declared outside this file; presumably in
// codecvt_unicode.h).  Otherwise this is a no-op.
void
test_utf8_ucs2_codecvts ()
{
#if __SIZEOF_WCHAR_T__ == 2
  auto wide_facet = to_unique_ptr (new std::codecvt_utf8<wchar_t> ());
  test_utf8_ucs2_cvts (*wide_facet);
#endif
}
// Test driver: runs the wchar_t groups in UTF-32, UTF-16, UCS-2 order.
// Each group compiles to an empty body when its __SIZEOF_WCHAR_T__
// condition does not hold.
int
main ()
{
  test_utf8_utf32_codecvts ();
  test_utf8_utf16_codecvts ();
  test_utf8_ucs2_codecvts ();
  return 0;
}