Support 3- and 4-byte Unicode (UTF-8) characters.

John Hansen
This commit is contained in:
Bruce Momjian 2005-06-15 00:15:08 +00:00
parent f4c4f1ce52
commit 5955945828
3 changed files with 76 additions and 40 deletions

View File

@ -6,7 +6,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $ * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -361,12 +361,19 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
iutf = *utf++ << 8; iutf = *utf++ << 8;
iutf |= *utf++; iutf |= *utf++;
} }
else else if (l == 3)
{ {
iutf = *utf++ << 16; iutf = *utf++ << 16;
iutf |= *utf++ << 8; iutf |= *utf++ << 8;
iutf |= *utf++; iutf |= *utf++;
} }
else if (l == 4)
{
iutf = *utf++ << 24;
iutf |= *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
p = bsearch(&iutf, map, size, p = bsearch(&iutf, map, size,
sizeof(pg_utf_to_local), compare1); sizeof(pg_utf_to_local), compare1);
if (p == NULL) if (p == NULL)

View File

@ -1,7 +1,7 @@
/* /*
* conversion functions between pg_wchar and multibyte streams. * conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii * Tatsuo Ishii
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
* *
* WIN1250 client encoding updated by Pavel Behal * WIN1250 client encoding updated by Pavel Behal
* *
@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
len = 1; len = 1;
else if ((*s & 0xe0) == 0xc0) else if ((*s & 0xe0) == 0xc0)
len = 2; len = 2;
else if ((*s & 0xe0) == 0xe0) else if ((*s & 0xf0) == 0xe0)
len = 3; len = 3;
else if ((*s & 0xf8) == 0xf0)
len = 4;
else if ((*s & 0xfc) == 0xf8)
len = 5;
else if ((*s & 0xfe) == 0xfc)
len = 6;
return (len); return (len);
} }
@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UTF8 */ {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
#ifndef FRONTEND #ifndef FRONTEND
/*
 * pg_utf8_islegal
 *
 * Report whether the "length" bytes starting at "source" form exactly one
 * well-formed UTF-8 encoded character.
 *
 * source: pointer to the first (lead) byte of the candidate sequence.  The
 *		   caller must guarantee that at least "length" bytes are readable.
 * length: number of bytes the caller believes the character occupies.
 *
 * Returns true only if:
 *	- length is 1..4 (5- and 6-byte forms are not well-formed UTF-8),
 *	- the lead byte is one that introduces a sequence of that length,
 *	- every trailing byte is a continuation byte (0x80..0xBF), and
 *	- the second byte is in the narrower range required after the lead
 *	  bytes 0xE0, 0xED, 0xF0 and 0xF4, which excludes overlong encodings,
 *	  UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char second;
	unsigned char lo = 0x80;	/* lowest acceptable second byte */
	unsigned char hi = 0xBF;	/* highest acceptable second byte */
	int			i;

	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* The lead byte must agree with the claimed sequence length. */
	switch (length)
	{
		case 1:
			/* single bytes are plain ASCII; nothing else to check */
			return lead < 0x80;
		case 2:
			/* 0xC0 and 0xC1 could only start overlong encodings */
			if (lead < 0xC2 || lead > 0xDF)
				return false;
			break;
		case 3:
			if (lead < 0xE0 || lead > 0xEF)
				return false;
			break;
		default:				/* length == 4 */
			/* leads above 0xF4 would encode values beyond U+10FFFF */
			if (lead < 0xF0 || lead > 0xF4)
				return false;
			break;
	}

	/*
	 * Some lead bytes narrow the range of the second byte.  Both bounds are
	 * always enforced below, so a non-continuation second byte (< 0x80) is
	 * rejected for every lead byte, including 0xED and 0xF4.
	 */
	switch (lead)
	{
		case 0xE0:
			lo = 0xA0;			/* below this would be overlong */
			break;
		case 0xED:
			hi = 0x9F;			/* above this would be a surrogate */
			break;
		case 0xF0:
			lo = 0x90;			/* below this would be overlong */
			break;
		case 0xF4:
			hi = 0x8F;			/* above this would exceed U+10FFFF */
			break;
		default:
			break;
	}

	second = source[1];
	if (second < lo || second > hi)
		return false;

	/* Any remaining bytes must be ordinary continuation bytes. */
	for (i = 2; i < length; i++)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	return true;
}
/* /*
* Verify mbstr to make sure that it has a valid character sequence. * Verify mbstr to make sure that it has a valid character sequence.
* mbstr is not necessarily NULL terminated; length of mbstr is * mbstr is not necessarily NULL terminated; length of mbstr is
@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
while (len > 0 && *mbstr) while (len > 0 && *mbstr)
{ {
/* special UTF8 check */
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
{
if (noError)
return false;
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
}
l = pg_mblen(mbstr); l = pg_mblen(mbstr);
for (i = 1; i < l; i++) /* special UTF-8 check */
{ if (encoding == PG_UTF8) {
/* if(!pg_utf8_islegal(mbstr,l)) {
* we expect that every multibyte char consists of bytes if (noError) return false;
* having the 8th bit set ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr)));
*/ }
if (i >= len || (mbstr[i] & 0x80) == 0) } else {
for (i = 1; i < l; i++)
{ {
char buf[8 * 2 + 1]; /*
char *p = buf; * we expect that every multibyte char consists of bytes
int j, * having the 8th bit set
*/
if (i >= len || (mbstr[i] & 0x80) == 0)
{
char buf[8 * 2 + 1];
char *p = buf;
int j,
jlimit; jlimit;
if (noError) if (noError)
return false; return false;
jlimit = Min(l, len); jlimit = Min(l, len);
jlimit = Min(jlimit, 8); /* prevent buffer overrun */ jlimit = Min(jlimit, 8); /* prevent buffer overrun */
for (j = 0; j < jlimit; j++) for (j = 0; j < jlimit; j++)
p += sprintf(p, "%02x", mbstr[j]); p += sprintf(p, "%02x", mbstr[j]);
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid byte sequence for encoding \"%s\": 0x%s", errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
GetDatabaseEncodingName(), buf))); GetDatabaseEncodingName(), buf)));
}
} }
} }
len -= l; len -= l;
mbstr += l; mbstr += l;
} }
return true; return true;
} }

View File

@ -1,4 +1,4 @@
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.58 2005/03/14 18:31:24 momjian Exp $ */ /* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.59 2005/06/15 00:15:08 momjian Exp $ */
#ifndef PG_WCHAR_H #ifndef PG_WCHAR_H
#define PG_WCHAR_H #define PG_WCHAR_H
@ -340,4 +340,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
#endif /* PG_WCHAR_H */ #endif /* PG_WCHAR_H */