From 5955945828d96511798c3edf158ff2c341257d19 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 15 Jun 2005 00:15:08 +0000 Subject: [PATCH] Support 3 and 4-byte unicode characters. John Hansen --- src/backend/utils/mb/conv.c | 11 +++- src/backend/utils/mb/wchar.c | 101 ++++++++++++++++++++++------------- src/include/mb/pg_wchar.h | 4 +- 3 files changed, 76 insertions(+), 40 deletions(-) diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 8789dcbc3e..f4a6083f20 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -361,12 +361,19 @@ UtfToLocal(unsigned char *utf, unsigned char *iso, iutf = *utf++ << 8; iutf |= *utf++; } - else + else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } + else if (l == 4) + { + iutf = *utf++ << 24; + iutf |= *utf++ << 16; + iutf |= *utf++ << 8; + iutf |= *utf++; + } p = bsearch(&iutf, map, size, sizeof(pg_utf_to_local), compare1); if (p == NULL) diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 7ff16809d8..1450b8b13c 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s) len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; - else if ((*s & 0xe0) == 0xe0) - len = 3; + else if ((*s & 0xf0) == 0xe0) + len = 3; + else if ((*s & 0xf8) == 0xf0) + len = 4; + else if ((*s & 0xfc) == 0xf8) + len = 5; + else if ((*s & 0xfe) == 0xfc) + len = 6; return (len); } @@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = { {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UTF8 */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */ {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ @@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding) #ifndef FRONTEND +bool pg_utf8_islegal(const unsigned char *source, int length) { + unsigned char a; + const unsigned char *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + + /* * Verify mbstr to make sure that it has a valid character sequence. * mbstr is not necessarily NULL terminated; length of mbstr is @@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) while (len > 0 && *mbstr) { - /* special UTF8 check */ - if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) - { - if (noError) - return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); - } - l = pg_mblen(mbstr); - - for (i = 1; i < l; i++) - { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || (mbstr[i] & 0x80) == 0) + + /* special UTF-8 check */ + if (encoding == PG_UTF8) { + if(!pg_utf8_islegal(mbstr,l)) { + if (noError) return false; + ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr))); + } + } else { + for (i = 1; i < l; i++) { - char buf[8 * 2 + 1]; - char *p = buf; - int j, + /* + * we expect that every multibyte char consists of bytes + * having the 8th bit set + */ + if (i >= len || (mbstr[i] & 0x80) == 0) + { + char buf[8 * 2 + 1]; + char *p = buf; + int j, jlimit; - if (noError) - return false; + if (noError) + return false; - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", mbstr[j]); + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", mbstr[j]); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + GetDatabaseEncodingName(), buf))); + } } } - len -= l; mbstr += l; } - return true; } diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 41ca953a40..32e9d739a8 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.58 2005/03/14 18:31:24 momjian Exp $ */ +/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.59 2005/06/15 00:15:08 momjian Exp $ */ #ifndef PG_WCHAR_H #define PG_WCHAR_H @@ -340,4 +340,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); +extern bool pg_utf8_islegal(const unsigned char *source, int length); + #endif /* PG_WCHAR_H */