mirror of
git://sourceware.org/git/glibc.git
synced 2024-11-27 03:41:23 +08:00
locale: localdef input files are now encoded in UTF-8
Previously, they were assumed to be in ISO-8859-1, and the output charset was assumed to overlap with ISO-8859-1 for the characters actually used. However, this did not work as intended on many architectures even for an ISO-8859-1 output encoding because of the char signedness bug in lr_getc. Therefore, this commit switches to UTF-8 without making provisions for backwards compatibility.

The following Elisp code can be used to convert locale definition files to UTF-8:

(defun glibc/convert-localedef (from to)
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region from to)
      (goto-char (point-min))
      (save-match-data
        (while (re-search-forward "<U\\([0-9a-fA-F]+\\)>" nil t)
          (let* ((codepoint (string-to-number (match-string 1) 16))
                 (converted
                  (cond
                   ((memq codepoint '(?/ ?\ ?< ?>))
                    (string ?/ codepoint))
                   ((= codepoint ?\") "<U0022>")
                   (t (string codepoint)))))
            (replace-match converted t)))))))

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
parent
7dcaabb94c
commit
b15538d77c
4
NEWS
4
NEWS
@ -46,6 +46,10 @@ Major new features:
|
||||
to more flexibly configure and operate on filesystem mounts. The new
|
||||
mount APIs are specifically designed to work with namespaces.
|
||||
|
||||
* localedef now accepts locale definition files encoded in UTF-8.
|
||||
Previously, input bytes not within the ASCII range resulted in
|
||||
unpredictable output.
|
||||
|
||||
Deprecated and removed features, and other changes affecting compatibility:
|
||||
|
||||
* Support for prelink will be removed in the next release; this includes
|
||||
|
@ -42,6 +42,7 @@ static struct token *get_string (struct linereader *lr,
|
||||
struct localedef_t *locale,
|
||||
const struct repertoire_t *repertoire,
|
||||
int verbose);
|
||||
static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
|
||||
|
||||
|
||||
struct linereader *
|
||||
@ -327,6 +328,17 @@ lr_token (struct linereader *lr, const struct charmap_t *charmap,
|
||||
}
|
||||
lr_ungetn (lr, 2);
|
||||
break;
|
||||
|
||||
case 0x80 ... 0xff: /* UTF-8 sequence. */
|
||||
uint32_t wch;
|
||||
if (!utf8_decode (lr, ch, &wch))
|
||||
{
|
||||
lr->token.tok = tok_error;
|
||||
return &lr->token;
|
||||
}
|
||||
lr->token.tok = tok_ucs4;
|
||||
lr->token.val.ucs4 = wch;
|
||||
return &lr->token;
|
||||
}
|
||||
|
||||
return get_ident (lr);
|
||||
@ -673,6 +685,87 @@ translate_unicode_codepoint (struct localedef_t *locale,
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Returns true if CH is not EOF (that is, non-negative) and is a valid
   UTF-8 trailing byte, i.e. its two top bits are 10 (0x80..0xbf).  */
static bool
utf8_valid_trailing (int ch)
{
  /* The sign check is required: a negative value such as -128 would
     otherwise still match the 0x80 pattern after masking.  */
  return ch >= 0 && (ch & 0xc0) == 0x80;
}
|
||||
|
||||
/* Reports an error via lr_error for a broken UTF-8 sequence whose
   leading byte is CH1.  CH2 to CH4 are the following bytes as far as
   they were read; they may be EOF (negative), and the first negative
   one ends the list of bytes shown in the message.  Always returns
   false, so callers can write "return utf8_sequence_error (...)".  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
                     int ch4)
{
  /* The longest text is four "0xNN" items separated by three spaces,
     which needs 20 bytes including the terminating NUL.  */
  char buf[30];

  /* Every value formatted below is known to be non-negative in its
     branch; the casts only make the arguments match %x exactly.  */
  if (ch2 < 0)
    snprintf (buf, sizeof (buf), "0x%02x", (unsigned int) ch1);
  else if (ch3 < 0)
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x",
              (unsigned int) ch1, (unsigned int) ch2);
  else if (ch4 < 0)
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x",
              (unsigned int) ch1, (unsigned int) ch2, (unsigned int) ch3);
  else
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
              (unsigned int) ch1, (unsigned int) ch2, (unsigned int) ch3,
              (unsigned int) ch4);

  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
  return false;
}
|
||||
|
||||
/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
   stores the decoded codepoint in *WCH.  Returns false on failure and
   reports an error; *WCH is left untouched in that case.

   Besides truncated sequences, bad trailing bytes and overlong forms,
   this also rejects encoded UTF-16 surrogates (U+D800..U+DFFF) and
   values above U+10FFFF, which RFC 3629 excludes from UTF-8.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.
     0x80..0xbf are trailing bytes and 0xc0/0xc1 could only start an
     overlong two-byte form, so none of them may lead a sequence.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 <= 0xdf)
    {
      uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      /* Overlong encoding of an ASCII character.  */
      if (result < 0x80)
        return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = result;
      return true;
    }

  /* From here on CH1 is at least 0xe0.  */
  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3))
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  /* Three-byte sequence.  */
  if (ch1 <= 0xef)
    {
      uint32_t result = (((ch1 & 0x0f) << 12)
                         | ((ch2 & 0x3f) << 6)
                         | (ch3 & 0x3f));
      /* Reject overlong encodings and UTF-16 surrogates.  */
      if (result < 0x800 || (result >= 0xd800 && result <= 0xdfff))
        return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = result;
      return true;
    }

  /* Four-byte sequence; CH1 is at least 0xf0 here.  Leading bytes
     above 0xf4 can only encode values beyond U+10FFFF.  */
  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  uint32_t result = (((ch1 & 0x07) << 18)
                     | ((ch2 & 0x3f) << 12)
                     | ((ch3 & 0x3f) << 6)
                     | (ch4 & 0x3f));
  /* Reject overlong encodings and values past the end of the Unicode
     code space (possible with a leading 0xf4 byte).  */
  if (result < 0x10000 || result > 0x10ffff)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = result;
  return true;
}
|
||||
|
||||
static struct token *
|
||||
get_string (struct linereader *lr, const struct charmap_t *charmap,
|
||||
@ -696,7 +789,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
|
||||
|
||||
buf2 = NULL;
|
||||
while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
|
||||
addc (&lrb, ch);
|
||||
{
|
||||
if (ch >= 0x80)
|
||||
lr_error (lr, _("illegal 8-bit character in untranslated string"));
|
||||
addc (&lrb, ch);
|
||||
}
|
||||
|
||||
/* Catch errors with trailing escape character. */
|
||||
if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
|
||||
@ -730,24 +827,49 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
|
||||
|
||||
if (ch != '<')
|
||||
{
|
||||
/* The standards leave it up to the implementation to decide
|
||||
what to do with character which stand for themself. We
|
||||
could jump through hoops to find out the value relative to
|
||||
the charmap and the repertoire map, but instead we leave
|
||||
it up to the locale definition author to write a better
|
||||
definition. We assume here that every character which
|
||||
stands for itself is encoded using ISO 8859-1. Using the
|
||||
escape character is allowed. */
|
||||
/* The standards leave it up to the implementation to
|
||||
decide what to do with characters which stand for
|
||||
themselves. This implementation treats the input
|
||||
file as encoded in UTF-8. */
|
||||
if (ch == lr->escape_char)
|
||||
{
|
||||
ch = lr_getc (lr);
|
||||
if (ch >= 0x80)
|
||||
{
|
||||
lr_error (lr, _("illegal 8-bit escape sequence"));
|
||||
illegal_string = true;
|
||||
break;
|
||||
}
|
||||
if (ch == '\n' || ch == EOF)
|
||||
break;
|
||||
addc (&lrb, ch);
|
||||
wch = ch;
|
||||
}
|
||||
else if (ch < 0x80)
|
||||
{
|
||||
wch = ch;
|
||||
addc (&lrb, ch);
|
||||
}
|
||||
else /* UTF-8 sequence. */
|
||||
{
|
||||
if (!utf8_decode (lr, ch, &wch))
|
||||
{
|
||||
illegal_string = true;
|
||||
break;
|
||||
}
|
||||
if (!translate_unicode_codepoint (locale, charmap,
|
||||
repertoire, wch, &lrb))
|
||||
{
|
||||
/* Ignore the rest of the string. Callers may
|
||||
skip this string because it cannot be encoded
|
||||
in the output character set. */
|
||||
illegal_string = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
addc (&lrb, ch);
|
||||
if (return_widestr)
|
||||
ADDWC ((uint32_t) ch);
|
||||
ADDWC (wch);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user