diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5db8856963..44077a3fb2 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236.4.2 2005/10/03 23:43:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236.4.3 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -98,6 +98,7 @@ static bool fe_eof; /* true if detected end of copy data */ static EolType eol_type; /* EOL type of input */ static int client_encoding; /* remote side's character encoding */ static int server_encoding; /* local encoding */ +static int server_max_length; /* local encoding max length */ static bool embedded_line_warning; /* these are just for error messages, see copy_in_error_callback */ @@ -988,6 +989,7 @@ DoCopy(const CopyStmt *stmt) client_encoding = pg_get_client_encoding(); server_encoding = GetDatabaseEncoding(); + server_max_length = pg_database_encoding_max_length(); copy_dest = COPY_FILE; /* default */ copy_file = NULL; @@ -2010,7 +2012,8 @@ static bool CopyReadLine(void) { bool result; - bool change_encoding = (client_encoding != server_encoding); + bool change_encoding = (client_encoding != server_encoding || + server_max_length > 1); int c; int mblen; int j; diff --git a/src/backend/utils/adt/name.c b/src/backend/utils/adt/name.c index 1200ad9b34..211619ec15 100644 --- a/src/backend/utils/adt/name.c +++ b/src/backend/utils/adt/name.c @@ -14,7 +14,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.55 2004/12/31 22:01:22 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.55.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -49,10 +49,7 @@ namein(PG_FUNCTION_ARGS) NameData *result; int len; - /* verify encoding */ len = strlen(s); - pg_verifymbstr(s, len, false); - len = pg_mbcliplen(s, len, NAMEDATALEN - 1); result = (NameData *) palloc0(NAMEDATALEN); diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 40e771ffab..19b2de618a 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.108.4.1 2005/12/22 22:50:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.108.4.2 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -75,10 +75,7 @@ bpcharin(PG_FUNCTION_ARGS) int i; int charlen; /* number of charcters in the input string */ - /* verify encoding */ len = strlen(s); - pg_verifymbstr(s, len, false); - charlen = pg_mbstrlen(s); /* If typmod is -1 (or invalid), use the actual string length */ @@ -364,10 +361,7 @@ varcharin(PG_FUNCTION_ARGS) size_t len, maxlen; - /* verify encoding */ len = strlen(s); - pg_verifymbstr(s, len, false); - maxlen = atttypmod - VARHDRSZ; if (atttypmod >= (int32) VARHDRSZ && len > maxlen) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index f7f64add23..891c412d9e 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.118.4.1 2005/12/22 22:50:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.118.4.2 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -260,10 +260,7 @@ textin(PG_FUNCTION_ARGS) text *result; int len; - /* verify encoding */ len = strlen(inputText); - pg_verifymbstr(inputText, len, false); - result = (text *) palloc(len + VARHDRSZ); VARATT_SIZEP(result) = len + VARHDRSZ; diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index c84d860e84..7997985b55 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -6,172 +6,81 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.51 2004/12/31 22:01:42 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.51.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "mb/pg_wchar.h" + /* - * convert bogus chars that cannot be represented in the current - * encoding system. + * LATINn ---> MIC when the charset's local codes map directly to MIC + * + * l points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding */ void -pg_print_bogus_char(unsigned char **mic, unsigned char **p) -{ - char strbuf[16]; - int l = pg_mic_mblen(*mic); - - *(*p)++ = '('; - while (l--) - { - sprintf(strbuf, "%02x", *(*mic)++); - *(*p)++ = strbuf[0]; - *(*p)++ = strbuf[1]; - } - *(*p)++ = ')'; -} - -#ifdef NOT_USED - -/* - * GB18030 ---> MIC - * Added by Bill Huang , - */ -static void -gb180302mic(unsigned char *gb18030, unsigned char *p, int len) -{ - int c1; - int c2; - - while (len > 0 && (c1 = *gb18030++)) - { - if (c1 < 0x80) - { /* should be ASCII */ - len--; - *p++ = c1; - } - else if (c1 >= 0x81 && c1 <= 0xfe) - { - c2 = *gb18030++; - - if (c2 >= 0x30 && c2 <= 0x69) - { - len -= 4; - *p++ = c1; - *p++ = c2; - *p++ = *gb18030++; - *p++ = *gb18030++; - *p++ = *gb18030++; - } - else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)) - { - len -= 2; - *p++ = c1; - *p++ = c2; - *p++ = *gb18030++; - } - else - { /* throw the strange code */ - len--; - } - } - } - *p = '\0'; -} - -/* - * MIC ---> GB18030 - * Added by Bill Huang , - */ -static void -mic2gb18030(unsigned char *mic, unsigned char *p, int len) -{ - int c1; - int c2; - - while (len > 0 && (c1 = *mic)) - { - len -= pg_mic_mblen(mic++); - - if (c1 <= 0x7f) /* ASCII */ - *p++ = c1; - else if (c1 >= 0x81 && c1 <= 0xfe) - { - c2 = *mic++; - - if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)) - { - *p++ = c1; - *p++ = c2; - } - else if (c2 >= 0x30 && c2 <= 0x39) - { - *p++ = c1; - *p++ = c2; - *p++ = *mic++; - *p++ = *mic++; - } - else - { - mic--; - pg_print_bogus_char(&mic, &p); - mic--; - pg_print_bogus_char(&mic, &p); - } - } - else - { - mic--; - pg_print_bogus_char(&mic, &p); - } - } - *p = '\0'; -} -#endif - -/* - * LATINn ---> MIC - */ -void -latin2mic(unsigned char *l, unsigned char *p, int len, int lc) +latin2mic(const unsigned char *l, unsigned char *p, int len, + int lc, int encoding) { int c1; - while (len-- > 0 && (c1 = *l++)) + while (len > 0) { - if (c1 > 0x7f) - { /* Latin? */ + c1 = *l; + if (c1 == 0) + report_invalid_encoding(encoding, (const char *) l, len); + if (IS_HIGHBIT_SET(c1)) *p++ = lc; - } *p++ = c1; + l++; + len--; } *p = '\0'; } /* - * MIC ---> LATINn + * MIC ---> LATINn when the charset's local codes map directly to MIC + * + * mic points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding */ void -mic2latin(unsigned char *mic, unsigned char *p, int len, int lc) +mic2latin(const unsigned char *mic, unsigned char *p, int len, + int lc, int encoding) { int c1; - while (len > 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - - if (c1 == lc) - *p++ = *mic++; - else if (c1 > 0x7f) + c1 = *mic; + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + if (!IS_HIGHBIT_SET(c1)) { - mic--; - pg_print_bogus_char(&mic, &p); + /* easy for ASCII */ + *p++ = c1; + mic++; + len--; } else - { /* should be ASCII */ - *p++ = c1; + { + int l = pg_mic_mblen(mic); + + if (len < l) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, + len); + if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) + report_untranslatable_char(PG_MULE_INTERNAL, encoding, + (const char *) mic, len); + *p++ = mic[1]; + mic += 2; + len -= 2; } } *p = '\0'; @@ -180,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc) /* * ASCII ---> MIC + * + * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set + * characters, here we must take a hard line because we don't know + * the appropriate MIC equivalent. */ void -pg_ascii2mic(unsigned char *l, unsigned char *p, int len) +pg_ascii2mic(const unsigned char *l, unsigned char *p, int len) { int c1; - while (len-- > 0 && (c1 = *l++)) - *p++ = (c1 & 0x7f); + while (len > 0) + { + c1 = *l; + if (c1 == 0 || IS_HIGHBIT_SET(c1)) + report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len); + *p++ = c1; + l++; + len--; + } *p = '\0'; } @@ -195,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len) * MIC ---> ASCII */ void -pg_mic2ascii(unsigned char *mic, unsigned char *p, int len) +pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len) { int c1; - while (len-- > 0 && (c1 = *mic)) + while (len > 0) { - if (c1 > 0x7f) - pg_print_bogus_char(&mic, &p); - else - { /* should be ASCII */ - *p++ = c1; - mic++; - } + c1 = *mic; + if (c1 == 0 || IS_HIGHBIT_SET(c1)) + report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; } *p = '\0'; } @@ -215,87 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len) /* * latin2mic_with_table: a generic single byte charset encoding * conversion from a local charset to the mule internal code. - * with a encoding conversion table. - * the table is ordered according to the local charset, + * + * l points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding + * tab holds conversion entries for the local charset * starting from 128 (0x80). each entry in the table * holds the corresponding code point for the mule internal code. */ void -latin2mic_with_table( - unsigned char *l, /* local charset string (source) */ - unsigned char *p, /* pointer to store mule internal - * code (destination) */ - int len, /* length of l */ - int lc, /* leading character of p */ - unsigned char *tab /* code conversion table */ -) +latin2mic_with_table(const unsigned char *l, + unsigned char *p, + int len, + int lc, + int encoding, + const unsigned char *tab) { unsigned char c1, c2; - while (len-- > 0 && (c1 = *l++)) + while (len > 0) { - if (c1 < 128) + c1 = *l; + if (c1 == 0) + report_invalid_encoding(encoding, (const char *) l, len); + if (!IS_HIGHBIT_SET(c1)) *p++ = c1; else { - c2 = tab[c1 - 128]; + c2 = tab[c1 - HIGHBIT]; if (c2) { *p++ = lc; *p++ = c2; } else - { - *p++ = ' '; /* cannot convert */ - } + report_untranslatable_char(encoding, PG_MULE_INTERNAL, + (const char *) l, len); } + l++; + len--; } *p = '\0'; } /* * mic2latin_with_table: a generic single byte charset encoding - * conversion from the mule internal code to a local charset - * with a encoding conversion table. - * the table is ordered according to the second byte of the mule - * internal code starting from 128 (0x80). - * each entry in the table - * holds the corresponding code point for the local code. + * conversion from the mule internal code to a local charset. + * + * mic points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding + * tab holds conversion entries for the mule internal code's + * second byte, starting from 128 (0x80). each entry in the table + * holds the corresponding code point for the local charset. */ void -mic2latin_with_table( - unsigned char *mic, /* mule internal code - * (source) */ - unsigned char *p, /* local code (destination) */ - int len, /* length of p */ - int lc, /* leading character */ - unsigned char *tab /* code conversion table */ -) +mic2latin_with_table(const unsigned char *mic, + unsigned char *p, + int len, + int lc, + int encoding, + const unsigned char *tab) { - unsigned char c1, c2; - while (len-- > 0 && (c1 = *mic++)) + while (len > 0) { - if (c1 < 128) - *p++ = c1; - else if (c1 == lc) + c1 = *mic; + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + if (!IS_HIGHBIT_SET(c1)) { - c1 = *mic++; + /* easy for ASCII */ + *p++ = c1; + mic++; len--; - c2 = tab[c1 - 128]; - if (c2) - *p++ = c2; - else - { - *p++ = ' '; /* cannot convert */ - } } else { - *p++ = ' '; /* bogus character */ + int l = pg_mic_mblen(mic); + + if (len < l) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, + len); + if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || + (c2 = tab[mic[1] - HIGHBIT]) == 0) + { + report_untranslatable_char(PG_MULE_INTERNAL, encoding, + (const char *) mic, len); + break; /* keep compiler quiet */ + } + *p++ = c2; + mic += 2; + len -= 2; } } *p = '\0'; @@ -332,27 +268,40 @@ compare2(const void *p1, const void *p2) } /* - * UTF-8 ---> local code + * UTF8 ---> local code * - * utf: input UTF-8 string. Its length is limited by "len" parameter - * or a null terminator. - * iso: pointer to the output. + * utf: input UTF8 string (need not be null-terminated). + * iso: pointer to the output area (must be large enough!) * map: the conversion map. * size: the size of the conversion map. + * encoding: the PG identifier for the local encoding. + * len: length of input string. */ void -UtfToLocal(unsigned char *utf, unsigned char *iso, - pg_utf_to_local *map, int size, int len) +UtfToLocal(const unsigned char *utf, unsigned char *iso, + const pg_utf_to_local *map, int size, int encoding, int len) { unsigned int iutf; int l; pg_utf_to_local *p; - for (; len > 0 && *utf; len -= l) + for (; len > 0; len -= l) { + /* "break" cases all represent errors */ + if (*utf == '\0') + break; + l = pg_utf_mblen(utf); + + if (len < l) + break; + + if (!pg_utf8_islegal(utf, l)) + break; + if (l == 1) { + /* ASCII case is easy */ *iso++ = *utf++; continue; } @@ -361,22 +310,27 @@ UtfToLocal(unsigned char *utf, unsigned char *iso, iutf = *utf++ << 8; iutf |= *utf++; } - else + else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } + else if (l == 4) + { + iutf = *utf++ << 24; + iutf |= *utf++ << 16; + iutf |= *utf++ << 8; + iutf |= *utf++; + } + p = bsearch(&iutf, map, size, sizeof(pg_utf_to_local), compare1); + if (p == NULL) - { - ereport(WARNING, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("ignoring unconvertible UTF-8 character 0x%04x", - iutf))); - continue; - } + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf - l), len); + if (p->code & 0xff000000) *iso++ = p->code >> 24; if (p->code & 0x00ff0000) @@ -386,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso, if (p->code & 0x000000ff) *iso++ = p->code & 0x000000ff; } + + if (len > 0) + report_invalid_encoding(PG_UTF8, (const char *) utf, len); + *iso = '\0'; } /* - * local code ---> UTF-8 + * local code ---> UTF8 + * + * iso: input local string (need not be null-terminated). + * utf: pointer to the output area (must be large enough!) + * map: the conversion map. + * size: the size of the conversion map. + * encoding: the PG identifier for the local encoding. + * len: length of input string. */ void -LocalToUtf(unsigned char *iso, unsigned char *utf, - pg_local_to_utf *map, int size, int encoding, int len) +LocalToUtf(const unsigned char *iso, unsigned char *utf, + const pg_local_to_utf *map, int size, int encoding, int len) { unsigned int iiso; int l; @@ -405,16 +370,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid encoding number: %d", encoding))); - for (; len > 0 && *iso; len -= l) + for (; len > 0; len -= l) { - if (*iso < 0x80) + /* "break" cases all represent errors */ + if (*iso == '\0') + break; + + if (!IS_HIGHBIT_SET(*iso)) { + /* ASCII case is easy */ *utf++ = *iso++; l = 1; continue; } - l = pg_encoding_mblen(encoding, iso); + l = pg_encoding_verifymb(encoding, (const char *) iso, len); + if (l < 0) + break; if (l == 1) iiso = *iso++; @@ -436,16 +408,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, iiso |= *iso++ << 8; iiso |= *iso++; } + p = bsearch(&iiso, map, size, sizeof(pg_local_to_utf), compare2); if (p == NULL) - { - ereport(WARNING, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("ignoring unconvertible %s character 0x%04x", - (&pg_enc2name_tbl[encoding])->name, iiso))); - continue; - } + report_untranslatable_char(encoding, PG_UTF8, + (const char *) (iso - l), len); + if (p->utf & 0xff000000) *utf++ = p->utf >> 24; if (p->utf & 0x00ff0000) @@ -455,5 +424,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, if (p->utf & 0x000000ff) *utf++ = p->utf & 0x000000ff; } + + if (len > 0) + report_invalid_encoding(encoding, (const char *) iso, len); + *utf = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c index 436218fd7f..0ccf97cfab 100644 --- a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.9 2004/12/31 22:01:48 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,14 +70,14 @@ extern Datum alt_to_iso(PG_FUNCTION_ARGS); * ---------- */ -static void koi8r2mic(unsigned char *l, unsigned char *p, int len); -static void mic2koi8r(unsigned char *mic, unsigned char *p, int len); -static void iso2mic(unsigned char *l, unsigned char *p, int len); -static void mic2iso(unsigned char *mic, unsigned char *p, int len); -static void win12512mic(unsigned char *l, unsigned char *p, int len); -static void mic2win1251(unsigned char *mic, unsigned char *p, int len); -static void alt2mic(unsigned char *l, unsigned char *p, int len); -static void mic2alt(unsigned char *mic, unsigned char *p, int len); +static void koi8r2mic(const unsigned char *l, unsigned char *p, int len); +static void mic2koi8r(const unsigned char *mic, unsigned char *p, int len); +static void iso2mic(const unsigned char *l, unsigned char *p, int len); +static void mic2iso(const unsigned char *mic, unsigned char *p, int len); +static void win12512mic(const unsigned char *l, unsigned char *p, int len); +static void mic2win1251(const unsigned char *mic, unsigned char *p, int len); +static void alt2mic(const unsigned char *l, unsigned char *p, int len); +static void mic2alt(const unsigned char *mic, unsigned char *p, int len); Datum koi8r_to_mic(PG_FUNCTION_ARGS) @@ -401,7 +401,7 @@ win1251_to_iso(PG_FUNCTION_ARGS) buf = palloc(len * ENCODING_GROWTH_RATE); win12512mic(src, buf, len); - mic2win1251(buf, dest, strlen(buf)); + mic2iso(buf, dest, strlen(buf)); pfree(buf); PG_RETURN_VOID(); @@ -441,7 +441,7 @@ alt_to_iso(PG_FUNCTION_ARGS) buf = palloc(len * ENCODING_GROWTH_RATE); alt2mic(src, buf, len); - mic2alt(buf, dest, strlen(buf)); + mic2iso(buf, dest, strlen(buf)); pfree(buf); PG_RETURN_VOID(); @@ -460,23 +460,23 @@ alt_to_iso(PG_FUNCTION_ARGS) /* koi8r2mic: KOI8-R to Mule internal code */ static void -koi8r2mic(unsigned char *l, unsigned char *p, int len) +koi8r2mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_KOI8_R); + latin2mic(l, p, len, LC_KOI8_R, PG_KOI8R); } /* mic2koi8r: Mule internal code to KOI8-R */ static void -mic2koi8r(unsigned char *mic, unsigned char *p, int len) +mic2koi8r(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_KOI8_R); + mic2latin(mic, p, len, LC_KOI8_R, PG_KOI8R); } /* iso2mic: ISO-8859-5 to Mule internal code */ static void -iso2mic(unsigned char *l, unsigned char *p, int len) +iso2mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char iso2koi[] = { + static const unsigned char iso2koi[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -495,14 +495,14 @@ iso2mic(unsigned char *l, unsigned char *p, int len) 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, iso2koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi); } /* mic2iso: Mule internal code to ISO8859-5 */ static void -mic2iso(unsigned char *mic, unsigned char *p, int len) +mic2iso(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2iso[] = { + static const unsigned char koi2iso[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -521,14 +521,14 @@ mic2iso(unsigned char *mic, unsigned char *p, int len) 0xcc, 0xcb, 0xb7, 0xc8, 0xcd, 0xc9, 0xc7, 0xca }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2iso); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso); } /* win2mic: CP1251 to Mule internal code */ static void -win12512mic(unsigned char *l, unsigned char *p, int len) +win12512mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char win2koi[] = { + static const unsigned char win2koi[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -547,14 +547,14 @@ win12512mic(unsigned char *l, unsigned char *p, int len) 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, win2koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN1251, win2koi); } /* mic2win: Mule internal code to CP1251 */ static void -mic2win1251(unsigned char *mic, unsigned char *p, int len) +mic2win1251(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2win[] = { + static const unsigned char koi2win[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -573,14 +573,14 @@ mic2win1251(unsigned char *mic, unsigned char *p, int len) 0xdc, 0xdb, 0xc7, 0xd8, 0xdd, 0xd9, 0xd7, 0xda }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN1251, koi2win); } /* alt2mic: CP866 to Mule internal code */ static void -alt2mic(unsigned char *l, unsigned char *p, int len) +alt2mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char alt2koi[] = { + static const unsigned char alt2koi[] = { 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, @@ -599,14 +599,14 @@ alt2mic(unsigned char *l, unsigned char *p, int len) 0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, alt2koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ALT, alt2koi); } /* mic2alt: Mule internal code to CP866 */ static void -mic2alt(unsigned char *mic, unsigned char *p, int len) +mic2alt(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2alt[] = { + static const unsigned char koi2alt[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -625,5 +625,5 @@ mic2alt(unsigned char *mic, unsigned char *p, int len) 0x9c, 0x9b, 0x87, 0x98, 0x9d, 0x99, 0x97, 0x9a }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2alt); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ALT, koi2alt); } diff --git a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c index 30d7bf7634..4d3bdfe2d1 100644 --- a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.9 2004/12/31 22:01:51 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ extern Datum mic_to_euc_cn(PG_FUNCTION_ARGS); * ---------- */ -static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len); +static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len); Datum euc_cn_to_mic(PG_FUNCTION_ARGS) @@ -71,23 +71,30 @@ mic_to_euc_cn(PG_FUNCTION_ARGS) * EUC_CN ---> MIC */ static void -euc_cn2mic(unsigned char *euc, unsigned char *p, int len) +euc_cn2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { - if (c1 & 0x80) + c1 = *euc; + if (IS_HIGHBIT_SET(c1)) { - len -= 2; + if (len < 2 || !IS_HIGHBIT_SET(euc[1])) + report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); *p++ = LC_GB2312_80; *p++ = c1; - *p++ = *euc++; + *p++ = euc[1]; + euc += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -97,27 +104,34 @@ euc_cn2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_CN */ static void -mic2euc_cn(unsigned char *mic, unsigned char *p, int len) +mic2euc_cn(const unsigned char *mic, unsigned char *p, int len) { int c1; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - - if (c1 == LC_GB2312_80) + c1 = *mic; + if (IS_HIGHBIT_SET(c1)) { + if (c1 != LC_GB2312_80) + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN, + (const char *) mic, len); + if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2])) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + mic++; *p++ = *mic++; *p++ = *mic++; - } - else if (c1 > 0x7f) - { /* cannot convert to EUC_CN! */ - mic--; - pg_print_bogus_char(&mic, &p); + len -= 3; } else { /* should be ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); *p++ = c1; + mic++; + len--; } } *p = '\0'; diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c index 0958af9cd7..6dba03b52f 100644 --- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.9.4.3 2006/03/04 12:37:01 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.9.4.4 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,9 +22,6 @@ #define PGSJISALTCODE 0x81ac #define PGEUCALTCODE 0xa2ae -#define ISSJISHEAD(c) ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)) -#define ISSJISTAIL(c) ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc)) - /* * conversion table between SJIS UDC (IBM kanji) and EUC_JP */ @@ -57,10 +54,10 @@ extern Datum mic_to_sjis(PG_FUNCTION_ARGS); * ---------- */ -static void sjis2mic(unsigned char *sjis, unsigned char *p, int len); -static void mic2sjis(unsigned char *mic, unsigned char *p, int len); -static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len); +static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len); +static void mic2sjis(const unsigned char *mic, unsigned char *p, int len); +static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len); Datum euc_jp_to_sjis(PG_FUNCTION_ARGS) @@ -170,38 +167,34 @@ mic_to_sjis(PG_FUNCTION_ARGS) * SJIS ---> MIC */ static void -sjis2mic(unsigned char *sjis, unsigned char *p, int len) +sjis2mic(const unsigned char *sjis, unsigned char *p, int len) { int c1, c2, -/* Eiji Tokuya patched begin */ i, k, k2; -/* Eiji Tokuya patched end */ - while (len >= 0 && (c1 = *sjis++)) + while (len > 0) { + c1 = *sjis; if (c1 >= 0xa1 && c1 <= 0xdf) { /* JIS X0201 (1 byte kana) */ - len--; *p++ = LC_JISX0201K; *p++ = c1; + sjis++; + len--; } - else if (c1 > 0x7f) + else if (IS_HIGHBIT_SET(c1)) { /* * JIS X0208, X0212, user defined extended characters */ - c2 = *sjis++; - if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x", - c1, c2))); + if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1])) + report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + c2 = sjis[1]; k = (c1 << 8) + c2; -/* Eiji Tokuya patched begin */ if (k >= 0xed40 && k < 0xf040) { /* NEC selection IBM kanji */ @@ -220,19 +213,15 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) } if (k < 0xeb3f) -/* Eiji Tokuya patched end */ { /* JIS X0208 */ - len -= 2; *p++ = LC_JISX0208; *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e); *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80); } -/* Eiji Tokuya patched begin */ else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc)) { /* NEC selection IBM kanji - Other undecided justice */ -/* Eiji Tokuya patched end */ *p++ = LC_JISX0208; *p++ = PGEUCALTCODE >> 8; *p++ = PGEUCALTCODE & 0xff; @@ -243,7 +232,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 - * 0x7e7e EUC 0xf5a1 - 0xfefe */ - len -= 2; *p++ = LC_JISX0208; c1 -= 0x6f; *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); @@ -255,7 +243,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 - * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe */ - len -= 2; *p++ = LC_JISX0212; c1 -= 0x74; *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); @@ -265,9 +252,7 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) { /* * mapping IBM kanji to X0208 and X0212 - * */ - len -= 2; for (i = 0;; i++) { k2 = ibmkanji[i].sjis; @@ -291,11 +276,16 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) } } } + sjis += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_SJIS, (const char *) sjis, len); *p++ = c1; + sjis++; + len--; } } *p = '\0'; @@ -305,22 +295,37 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * MIC ---> SJIS */ static void -mic2sjis(unsigned char *mic, unsigned char *p, int len) +mic2sjis(const unsigned char *mic, unsigned char *p, int len) { int c1, c2, - k; + k, + l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_JISX0201K) - *p++ = *mic++; + *p++ = mic[1]; else if (c1 == LC_JISX0208) { - c1 = *mic++; - c2 = *mic++; + c1 = mic[1]; + c2 = mic[2]; k = (c1 << 8) | (c2 & 0xff); if (k >= 0xf5a1) { @@ -337,8 +342,8 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) int i, k2; - c1 = *mic++; - c2 = *mic++; + c1 = mic[1]; + c2 = mic[2]; k = c1 << 8 | c2; if (k >= 0xf5a1) { @@ -369,16 +374,11 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) } } } - else if (c1 > 0x7f) - { - /* cannot convert to SJIS! */ - *p++ = PGSJISALTCODE >> 8; - *p++ = PGSJISALTCODE & 0xff; - } else - { /* should be ASCII */ - *p++ = c1; - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } @@ -387,37 +387,48 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) * EUC_JP ---> MIC */ static void -euc_jp2mic(unsigned char *euc, unsigned char *p, int len) +euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { + c1 = *euc; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); + *p++ = c1; + euc++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len); + if (l < 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); if (c1 == SS2) { /* 1 byte kana? */ - len -= 2; *p++ = LC_JISX0201K; - *p++ = *euc++; + *p++ = euc[1]; } else if (c1 == SS3) { /* JIS X0212 kanji? */ - len -= 3; *p++ = LC_JISX0212; - *p++ = *euc++; - *p++ = *euc++; - } - else if (c1 & 0x80) - { /* kanji? */ - len -= 2; - *p++ = LC_JISX0208; - *p++ = c1; - *p++ = *euc++; + *p++ = euc[1]; + *p++ = euc[2]; } else - { /* should be ASCII */ - len--; + { /* kanji? */ + *p++ = LC_JISX0208; *p++ = c1; + *p++ = euc[1]; } + euc += l; + len -= l; } *p = '\0'; } @@ -426,39 +437,50 @@ euc_jp2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_JP */ static void -mic2euc_jp(unsigned char *mic, unsigned char *p, int len) +mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_JISX0201K) { *p++ = SS2; - *p++ = *mic++; + *p++ = mic[1]; } else if (c1 == LC_JISX0212) { *p++ = SS3; - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } else if (c1 == LC_JISX0208) { - *p++ = *mic++; - *p++ = *mic++; - } - else if (c1 > 0x7f) - { /* cannot convert to EUC_JP! */ - mic--; - pg_print_bogus_char(&mic, &p); + *p++ = mic[1]; + *p++ = mic[2]; } else - { /* should be ASCII */ - *p++ = c1; - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c index be7df0fc43..4ef0302cc8 100644 --- a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.9 2004/12/31 22:01:56 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ extern Datum mic_to_euc_kr(PG_FUNCTION_ARGS); * ---------- */ -static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len); +static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len); Datum euc_kr_to_mic(PG_FUNCTION_ARGS) @@ -71,23 +71,34 @@ mic_to_euc_kr(PG_FUNCTION_ARGS) * EUC_KR ---> MIC */ static void -euc_kr2mic(unsigned char *euc, unsigned char *p, int len) +euc_kr2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { - if (c1 & 0x80) + c1 = *euc; + if (IS_HIGHBIT_SET(c1)) { - len -= 2; + l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len); + if (l != 2) + report_invalid_encoding(PG_EUC_KR, + (const char *) euc, len); *p++ = LC_KS5601; *p++ = c1; - *p++ = *euc++; + *p++ = euc[1]; + euc += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_KR, + (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -97,28 +108,39 @@ euc_kr2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_KR */ static void -mic2euc_kr(unsigned char *mic, unsigned char *p, int len) +mic2euc_kr(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_KS5601) { - *p++ = *mic++; - *p++ = *mic++; - } - else if (c1 > 0x7f) - { /* cannot convert to EUC_KR! */ - mic--; - pg_print_bogus_char(&mic, &p); + *p++ = mic[1]; + *p++ = mic[2]; } else - { /* should be ASCII */ - *p++ = c1; - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c index 5cd6c0b763..8e3fd653c4 100644 --- a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.9 2004/12/31 22:02:07 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,10 +42,10 @@ extern Datum mic_to_big5(PG_FUNCTION_ARGS); * ---------- */ -static void big52mic(unsigned char *big5, unsigned char *p, int len); -static void mic2big5(unsigned char *mic, unsigned char *p, int len); -static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len); +static void big52mic(const unsigned char *big5, unsigned char *p, int len); +static void mic2big5(const unsigned char *mic, unsigned char *p, int len); +static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len); Datum euc_tw_to_big5(PG_FUNCTION_ARGS) @@ -114,7 +114,7 @@ mic_to_euc_tw(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_EUC_TW); Assert(len >= 0); - mic2big5(src, dest, len); + mic2euc_tw(src, dest, len); PG_RETURN_VOID(); } @@ -155,39 +155,52 @@ mic_to_big5(PG_FUNCTION_ARGS) * EUC_TW ---> MIC */ static void -euc_tw2mic(unsigned char *euc, unsigned char *p, int len) +euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { - if (c1 == SS2) + c1 = *euc; + if (IS_HIGHBIT_SET(c1)) { - len -= 4; - c1 = *euc++; /* plane No. */ - if (c1 == 0xa1) - *p++ = LC_CNS11643_1; - else if (c1 == 0xa2) - *p++ = LC_CNS11643_2; - else + l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len); + if (l < 0) + report_invalid_encoding(PG_EUC_TW, + (const char *) euc, len); + if (c1 == SS2) { - *p++ = 0x9d; /* LCPRV2 */ - *p++ = 0xa3 - c1 + LC_CNS11643_3; + c1 = euc[1]; /* plane No. */ + if (c1 == 0xa1) + *p++ = LC_CNS11643_1; + else if (c1 == 0xa2) + *p++ = LC_CNS11643_2; + else + { + *p++ = 0x9d; /* LCPRV2 */ + *p++ = c1 - 0xa3 + LC_CNS11643_3; + } + *p++ = euc[2]; + *p++ = euc[3]; } - *p++ = *euc++; - *p++ = *euc++; - } - else if (c1 & 0x80) - { /* CNS11643-1 */ - len -= 2; - *p++ = LC_CNS11643_1; - *p++ = c1; - *p++ = *euc++; + else + { /* CNS11643-1 */ + *p++ = LC_CNS11643_1; + *p++ = c1; + *p++ = euc[1]; + } + euc += l; + len -= l; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_TW, + (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -197,42 +210,54 @@ euc_tw2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_TW */ static void -mic2euc_tw(unsigned char *mic, unsigned char *p, int len) +mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_CNS11643_1) { - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } else if (c1 == LC_CNS11643_2) { *p++ = SS2; *p++ = 0xa2; - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } - else if (c1 == 0x9d) + else if (c1 == 0x9d && + mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7) { /* LCPRV2? */ *p++ = SS2; - *p++ = *mic++ - LC_CNS11643_3 + 0xa3; - *p++ = *mic++; - *p++ = *mic++; - } - else if (c1 > 0x7f) - { /* cannot convert to EUC_TW! */ - mic--; - pg_print_bogus_char(&mic, &p); + *p++ = mic[1] - LC_CNS11643_3 + 0xa3; + *p++ = mic[2]; + *p++ = mic[3]; } else - { /* should be ASCII */ - *p++ = c1; - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } @@ -241,52 +266,49 @@ mic2euc_tw(unsigned char *mic, unsigned char *p, int len) * Big5 ---> MIC */ static void -big52mic(unsigned char *big5, unsigned char *p, int len) +big52mic(const unsigned char *big5, unsigned char *p, int len) { unsigned short c1; unsigned short big5buf, cnsBuf; unsigned char lc; - char bogusBuf[3]; - int i; + int l; - while (len >= 0 && (c1 = *big5++)) + while (len > 0) { - if (c1 <= 0x7fU) - { /* ASCII */ - len--; + c1 = *big5; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_BIG5, + (const char *) big5, len); *p++ = c1; + big5++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len); + if (l < 0) + report_invalid_encoding(PG_BIG5, + (const char *) big5, len); + big5buf = (c1 << 8) | big5[1]; + cnsBuf = BIG5toCNS(big5buf, &lc); + if (lc != 0) + { + if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4) + { + *p++ = 0x9d; /* LCPRV2 */ + } + *p++ = lc; /* Plane No. */ + *p++ = (cnsBuf >> 8) & 0x00ff; + *p++ = cnsBuf & 0x00ff; } else - { - len -= 2; - big5buf = c1 << 8; - c1 = *big5++; - big5buf |= c1; - cnsBuf = BIG5toCNS(big5buf, &lc); - if (lc != 0) - { - if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4) - { - *p++ = 0x9d; /* LCPRV2 */ - } - *p++ = lc; /* Plane No. */ - *p++ = (cnsBuf >> 8) & 0x00ff; - *p++ = cnsBuf & 0x00ff; - } - else - { /* cannot convert */ - big5 -= 2; - *p++ = '('; - for (i = 0; i < 2; i++) - { - sprintf(bogusBuf, "%02x", *big5++); - *p++ = bogusBuf[0]; - *p++ = bogusBuf[1]; - } - *p++ = ')'; - } - } + report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL, + (const char *) big5, len); + big5 += l; + len -= l; } *p = '\0'; } @@ -295,46 +317,55 @@ big52mic(unsigned char *big5, unsigned char *p, int len) * MIC ---> Big5 */ static void -mic2big5(unsigned char *mic, unsigned char *p, int len) +mic2big5(const unsigned char *mic, unsigned char *p, int len) { - int l; unsigned short c1; unsigned short big5buf, cnsBuf; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - l = pg_mic_mblen(mic++); - len -= l; - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); /* 0x9d means LCPRV2 */ if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d) { if (c1 == 0x9d) { - c1 = *mic++; /* get plane no. */ - } - cnsBuf = (*mic++) << 8; - cnsBuf |= (*mic++) & 0x00ff; - big5buf = CNStoBIG5(cnsBuf, c1); - if (big5buf == 0) - { /* cannot convert to Big5! */ - mic -= l; - pg_print_bogus_char(&mic, &p); + c1 = mic[1]; /* get plane no. */ + cnsBuf = (mic[2] << 8) | mic[3]; } else { - *p++ = (big5buf >> 8) & 0x00ff; - *p++ = big5buf & 0x00ff; + cnsBuf = (mic[1] << 8) | mic[2]; } + big5buf = CNStoBIG5(cnsBuf, c1); + if (big5buf == 0) + report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, + (const char *) mic, len); + *p++ = (big5buf >> 8) & 0x00ff; + *p++ = big5buf & 0x00ff; } - else if (c1 <= 0x7f) /* ASCII */ - *p++ = c1; else - { /* cannot convert to Big5! */ - mic--; - pg_print_bogus_char(&mic, &p); - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c index ec2da8bf9f..6f5bb71a38 100644 --- a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c +++ b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.9 2004/12/31 22:02:08 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,10 +42,10 @@ extern Datum win1250_to_latin2(PG_FUNCTION_ARGS); * ---------- */ -static void latin22mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin2(unsigned char *mic, unsigned char *p, int len); -static void win12502mic(unsigned char *l, unsigned char *p, int len); -static void mic2win1250(unsigned char *mic, unsigned char *p, int len); +static void latin22mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin2(const unsigned char *mic, unsigned char *p, int len); +static void win12502mic(const unsigned char *l, unsigned char *p, int len); +static void mic2win1250(const unsigned char *mic, unsigned char *p, int len); Datum latin2_to_mic(PG_FUNCTION_ARGS) @@ -152,14 +152,15 @@ win1250_to_latin2(PG_FUNCTION_ARGS) } static void -latin22mic(unsigned char *l, unsigned char *p, int len) +latin22mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_2); + latin2mic(l, p, len, LC_ISO8859_2, PG_LATIN2); } + static void -mic2latin2(unsigned char *mic, unsigned char *p, int len) +mic2latin2(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_2); + mic2latin(mic, p, len, LC_ISO8859_2, PG_LATIN2); } /*----------------------------------------------------------------- @@ -167,9 +168,9 @@ mic2latin2(unsigned char *mic, unsigned char *p, int len) * Microsoft's CP1250(windows-1250) *-----------------------------------------------------------------*/ static void -win12502mic(unsigned char *l, unsigned char *p, int len) +win12502mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char win1250_2_iso88592[] = { + static const unsigned char win1250_2_iso88592[] = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, @@ -188,12 +189,14 @@ win12502mic(unsigned char *l, unsigned char *p, int len) 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }; - latin2mic_with_table(l, p, len, LC_ISO8859_2, win1250_2_iso88592); + latin2mic_with_table(l, p, len, LC_ISO8859_2, PG_WIN1250, + win1250_2_iso88592); } + static void -mic2win1250(unsigned char *mic, unsigned char *p, int len) +mic2win1250(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char iso88592_2_win1250[] = { + static const unsigned char iso88592_2_win1250[] = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, @@ -212,5 +215,6 @@ mic2win1250(unsigned char *mic, unsigned char *p, int len) 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }; - mic2latin_with_table(mic, p, len, LC_ISO8859_2, iso88592_2_win1250); + mic2latin_with_table(mic, p, len, LC_ISO8859_2, PG_WIN1250, + iso88592_2_win1250); } diff --git a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c index f0435da45d..097975f952 100644 --- a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.9 2004/12/31 22:02:10 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,12 +40,12 @@ extern Datum mic_to_latin4(PG_FUNCTION_ARGS); * ---------- */ -static void latin12mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin1(unsigned char *mic, unsigned char *p, int len); -static void latin32mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin3(unsigned char *mic, unsigned char *p, int len); -static void latin42mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin4(unsigned char *mic, unsigned char *p, int len); +static void latin12mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin1(const unsigned char *mic, unsigned char *p, int len); +static void latin32mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin3(const unsigned char *mic, unsigned char *p, int len); +static void latin42mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin4(const unsigned char *mic, unsigned char *p, int len); Datum latin1_to_mic(PG_FUNCTION_ARGS) @@ -144,32 +144,37 @@ mic_to_latin4(PG_FUNCTION_ARGS) } static void -latin12mic(unsigned char *l, unsigned char *p, int len) +latin12mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_1); + latin2mic(l, p, len, LC_ISO8859_1, PG_LATIN1); } + static void -mic2latin1(unsigned char *mic, unsigned char *p, int len) +mic2latin1(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_1); + mic2latin(mic, p, len, LC_ISO8859_1, PG_LATIN1); } + static void -latin32mic(unsigned char *l, unsigned char *p, int len) +latin32mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_3); + latin2mic(l, p, len, LC_ISO8859_3, PG_LATIN3); } + static void -mic2latin3(unsigned char *mic, unsigned char *p, int len) +mic2latin3(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_3); + mic2latin(mic, p, len, LC_ISO8859_3, PG_LATIN3); } + static void -latin42mic(unsigned char *l, unsigned char *p, int len) +latin42mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_4); + latin2mic(l, p, len, LC_ISO8859_4, PG_LATIN4); } + static void -mic2latin4(unsigned char *mic, unsigned char *p, int len) +mic2latin4(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_4); + mic2latin(mic, p, len, LC_ISO8859_4, PG_LATIN4); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c index 93ce77377b..78d0b3ca83 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.9 2004/12/31 22:02:11 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.9.4.1 2006/05/21 20:06:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,6 +43,7 @@ ascii_to_utf8(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_UTF8); Assert(len >= 0); + /* this looks wrong, but basically we're just rejecting high-bit-set */ pg_ascii2mic(src, dest, len); PG_RETURN_VOID(); @@ -59,6 +60,7 @@ utf8_to_ascii(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_SQL_ASCII); Assert(len >= 0); + /* this looks wrong, but basically we're just rejecting high-bit-set */ pg_mic2ascii(src, dest, len); PG_RETURN_VOID(); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c index 31b5b5f3e2..ea923b66fa 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.9 2004/12/31 22:02:13 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_big5(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapBIG5, - sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), len); + sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), PG_BIG5, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c index 81dc25f361..ec91c41f8a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.9 2004/12/31 22:02:14 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -58,7 +58,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmap_KOI8R, - sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), len); + sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), PG_KOI8R, len); PG_RETURN_VOID(); } @@ -92,7 +92,7 @@ utf8_to_win1251(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmap_WIN1251, - sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), len); + sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), PG_WIN1251, len); PG_RETURN_VOID(); } @@ -126,7 +126,7 @@ utf8_to_alt(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmap_ALT, - sizeof(ULmap_ALT) / sizeof(pg_utf_to_local), len); + sizeof(ULmap_ALT) / sizeof(pg_utf_to_local), PG_ALT, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c index 59dfe9d0a7..25f4ce6252 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.9 2004/12/31 22:02:16 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_CN, - sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), PG_EUC_CN, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c index d438148d31..38c8827789 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.9 2004/12/31 22:02:17 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_JP, - sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), PG_EUC_JP, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c index 0f8668546f..4ce2feeeb0 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.9 2004/12/31 22:02:19 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_KR, - sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), PG_EUC_KR, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c index 8ceb89140c..c008632229 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.9 2004/12/31 22:02:20 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_TW, - sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), PG_EUC_TW, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index ae06726158..7d532591fd 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.9 2004/12/31 22:02:23 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapGB18030, - sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len); + sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), PG_GB18030, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c index 04524c8497..463fedba2a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.9 2004/12/31 22:02:26 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapGBK, - sizeof(ULmapGBK) / sizeof(pg_utf_to_local), len); + sizeof(ULmapGBK) / sizeof(pg_utf_to_local), PG_GBK, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c index 1bb101d4da..edd8cfbd6d 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.11 2004/12/31 22:02:27 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.11.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -151,7 +151,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(len >= 0); - UtfToLocal(src, dest, maps[encoding].map2, maps[encoding].size2, len); + UtfToLocal(src, dest, maps[encoding].map2, maps[encoding].size2, encoding, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c index 8b5812eead..8accfb4bf6 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.10 2004/12/31 22:02:29 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.10.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -44,15 +44,20 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_UTF8); Assert(len >= 0); - while (len-- > 0 && (c = *src++)) + while (len > 0) { - if (c < 0x80) + c = *src; + if (c == 0) + report_invalid_encoding(PG_LATIN1, (const char *) src, len); + if (!IS_HIGHBIT_SET(c)) *dest++ = c; else { *dest++ = (c >> 6) | 0xc0; *dest++ = (c & 0x003f) | 0x80; } + src++; + len--; } *dest = '\0'; @@ -66,31 +71,45 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) unsigned char *dest = PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c, - c1, - c2; + c1; Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(PG_GETARG_INT32(1) == PG_LATIN1); Assert(len >= 0); - while (len >= 0 && (c = *src++)) + while (len > 0) { - if ((c & 0xe0) == 0xc0) - { - c1 = c & 0x1f; - c2 = *src++ & 0x3f; - *dest = c1 << 6; - *dest++ |= c2; - len -= 2; - } - else if ((c & 0xe0) == 0xe0) - elog(ERROR, "could not convert UTF-8 character 0x%04x to ISO8859-1", - c); - else + c = *src; + if (c == 0) + report_invalid_encoding(PG_UTF8, (const char *) src, len); + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(c)) { *dest++ = c; + src++; len--; } + else + { + int l = pg_utf_mblen(src); + + if (l > len || !pg_utf8_islegal(src, l)) + report_invalid_encoding(PG_UTF8, (const char *) src, len); + if (l != 2) + report_untranslatable_char(PG_UTF8, PG_LATIN1, + (const char *) src, len); + c1 = src[1] & 0x3f; + c = ((c & 0x1f) << 6) | c1; + if (c >= 0x80 && c <= 0xff) + { + *dest++ = (unsigned char) c; + src += 2; + len -= 2; + } + else + report_untranslatable_char(PG_UTF8, PG_LATIN1, + (const char *) src, len); + } } *dest = '\0'; diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c index 3083d397ce..97bd44c3fb 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.9 2004/12/31 22:02:31 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_johab(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapJOHAB, - sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), len); + sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), PG_JOHAB, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c index 3d3f805f1f..852ca98a56 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.9 2004/12/31 22:02:33 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapSJIS, - sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), len); + sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), PG_SJIS, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c b/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c index 4894d3fee8..a3d7b3b398 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c,v 1.9 2004/12/31 22:02:35 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_tcvn/utf8_and_tcvn.c,v 1.9.4.1 2006/05/21 20:06:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_tcvn(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapTCVN, - sizeof(ULmapTCVN) / sizeof(pg_utf_to_local), len); + sizeof(ULmapTCVN) / sizeof(pg_utf_to_local), PG_TCVN, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c index c995ebc59a..33f6e59b05 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.9 2004/12/31 22:02:36 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapUHC, - sizeof(ULmapUHC) / sizeof(pg_utf_to_local), len); + sizeof(ULmapUHC) / sizeof(pg_utf_to_local), PG_UHC, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c b/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c index 3b260a0e64..403edc27a8 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c,v 1.9 2004/12/31 22:02:38 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -46,7 +46,7 @@ utf_to_win1250(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapWIN1250, - sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), len); + sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), PG_WIN1250, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c b/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c index 1001e19e75..fcdd418fac 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c,v 1.9 2004/12/31 22:02:39 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -46,7 +46,7 @@ utf_to_win1256(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapWIN1256, - sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), len); + sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), PG_WIN1256, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c b/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c index c37c5c495b..21432c0b5f 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c,v 1.9 2004/12/31 22:02:41 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c,v 1.9.4.1 2006/05/21 20:06:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -46,7 +46,7 @@ utf_to_win874(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapWIN874, - sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), len); + sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), PG_WIN874, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 9718e7e73e..da5f8e66c5 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -4,7 +4,7 @@ * (currently mule internal code (mic) is used) * Tatsuo Ishii * - * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.48 2004/10/13 01:25:12 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.48.4.1 2006/05/21 20:06:16 tgl Exp $ */ #include "postgres.h" @@ -370,9 +370,50 @@ pg_client_to_server(unsigned char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); - if (ClientEncoding->encoding == DatabaseEncoding->encoding) + if (len <= 0) return s; + if (ClientEncoding->encoding == DatabaseEncoding->encoding || + ClientEncoding->encoding == PG_SQL_ASCII) + { + /* + * No conversion is needed, but we must still validate the data. + */ + (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); + return s; + } + + if (DatabaseEncoding->encoding == PG_SQL_ASCII) + { + /* + * No conversion is possible, but we must still validate the data, + * because the client-side code might have done string escaping + * using the selected client_encoding. If the client encoding is + * ASCII-safe then we just do a straight validation under that + * encoding. For an ASCII-unsafe encoding we have a problem: + * we dare not pass such data to the parser but we have no way + * to convert it. We compromise by rejecting the data if it + * contains any non-ASCII characters. + */ + if (PG_VALID_BE_ENCODING(ClientEncoding->encoding)) + (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false); + else + { + int i; + + for (i = 0; i < len; i++) + { + if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte value for encoding \"%s\": 0x%02x", + pg_enc2name_tbl[PG_SQL_ASCII].name, + (unsigned char) s[i]))); + } + } + return s; + } + return perform_default_encoding_conversion(s, len, true); } @@ -385,9 +426,14 @@ pg_server_to_client(unsigned char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); - if (ClientEncoding->encoding == DatabaseEncoding->encoding) + if (len <= 0) return s; + if (ClientEncoding->encoding == DatabaseEncoding->encoding || + ClientEncoding->encoding == PG_SQL_ASCII || + DatabaseEncoding->encoding == PG_SQL_ASCII) + return s; /* assume data is valid */ + return perform_default_encoding_conversion(s, len, false); } @@ -406,9 +452,6 @@ perform_default_encoding_conversion(unsigned char *src, int len, bool is_client_ dest_encoding; FmgrInfo *flinfo; - if (len <= 0) - return src; - if (is_client_to_server) { src_encoding = ClientEncoding->encoding; @@ -425,12 +468,6 @@ perform_default_encoding_conversion(unsigned char *src, int len, bool is_client_ if (flinfo == NULL) return src; - if (src_encoding == dest_encoding) - return src; - - if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII) - return src; - result = palloc(len * 4 + 1); FunctionCall5(flinfo, diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 3adf4481e8..96a2688695 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.1 2005/12/24 10:11:32 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.2 2006/05/21 20:06:16 tgl Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -96,7 +96,7 @@ static int pg_euc2wchar_with_len return (cnt); } -static int +static inline int pg_euc_mblen(const unsigned char *s) { int len; @@ -112,7 +112,7 @@ pg_euc_mblen(const unsigned char *s) return (len); } -static int +static inline int pg_euc_dsplen(const unsigned char *s) { int len; @@ -714,53 +714,433 @@ pg_gb18030_dsplen(const unsigned char *s) return (len); } +/* + *------------------------------------------------------------------- + * multibyte sequence validators + * + * These functions accept "s", a pointer to the first byte of a string, + * and "len", the remaining length of the string. If there is a validly + * encoded character beginning at *s, return its length in bytes; else + * return -1. + * + * The functions can assume that len > 0 and that *s != '\0', but they must + * test for and reject zeroes in any additional bytes of a multibyte character. + * + * Note that this definition allows the function for a single-byte + * encoding to be just "return 1". + *------------------------------------------------------------------- + */ +static int +pg_ascii_verifier(const unsigned char *s, int len) +{ + return 1; +} + +#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) + +static int +pg_eucjp_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + l = 2; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xdf) + return -1; + break; + + case SS3: /* JIS X 0212 */ + l = 3; + if (l > len) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + + return l; +} + +static int +pg_euckr_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + if (IS_HIGHBIT_SET(c1)) + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + + return l; +} + +/* EUC-CN byte sequences are exactly same as EUC-KR */ +#define pg_euccn_verifier pg_euckr_verifier + +static int +pg_euctw_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* CNS 11643 Plane 1-7 */ + l = 4; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xa7) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + case SS3: /* unused */ + return -1; + + default: + if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */ + { + l = 2; + if (l > len) + return -1; + /* no further range check on c1? */ + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + return l; +} + +static int +pg_johab_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_johab_mblen(s); + + if (len < l) + return -1; + + if (!IS_HIGHBIT_SET(*s)) + return mbl; + + while (--l > 0) + { + c = *++s; + if (!IS_EUC_RANGE_VALID(c)) + return -1; + } + return mbl; +} + +static int +pg_mule_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_mule_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + c = *++s; + if (!IS_HIGHBIT_SET(c)) + return -1; + } + return mbl; +} + +static int +pg_latin1_verifier(const unsigned char *s, int len) +{ + return 1; +} + +static int +pg_sjis_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c1, c2; + + l = mbl = pg_sjis_mblen(s); + + if (len < l) + return -1; + + if (l == 1) /* pg_sjis_mblen already verified it */ + return mbl; + + c1 = *s++; + c2 = *s; + if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) + return -1; + return mbl; +} + +static int +pg_big5_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_big5_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gbk_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gbk_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_uhc_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_uhc_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gb18030_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gb18030_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_utf8_verifier(const unsigned char *s, int len) +{ + int l = pg_utf_mblen(s); + + if (len < l) + return -1; + + if (!pg_utf8_islegal(s, l)) + return -1; + + return l; +} + +/* + * Check for validity of a single UTF-8 encoded character + * + * This directly implements the rules in RFC3629, modified to restrict + * us to 16-bit Unicode code points (hence, at most 3 bytes in UTF8). + * The bizarre-looking + * restrictions on the second byte are meant to ensure that there isn't + * more than one encoding of a given Unicode character point; that is, + * you may not use a longer-than-necessary byte sequence with high order + * zero bits to represent a character that would fit in fewer bytes. + * To do otherwise is to create security hazards (eg, create an apparent + * non-ASCII character that decodes to plain ASCII). + * + * length is assumed to have been obtained by pg_utf_mblen(), and the + * caller must have checked that that many bytes are present in the buffer. + */ +bool +pg_utf8_islegal(const unsigned char *source, int length) +{ + unsigned char a; + + switch (length) + { + default: + /* reject lengths 4, 5 and 6 for now */ + return false; + case 3: + a = source[2]; + if (a < 0x80 || a > 0xBF) + return false; + /* FALL THRU */ + case 2: + a = source[1]; + switch (*source) + { + case 0xE0: + if (a < 0xA0 || a > 0xBF) + return false; + break; + case 0xED: + if (a < 0x80 || a > 0x9F) + return false; + break; + default: + if (a < 0x80 || a > 0xBF) + return false; + break; + } + /* FALL THRU */ + case 1: + a = *source; + if (a >= 0x80 && a < 0xC2) + return false; + if (a > 0xEF) + return false; + break; + } + return true; +} + +/* + *------------------------------------------------------------------- + * encoding info table + *------------------------------------------------------------------- + */ pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1}, /* 0; PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3}, /* 1; PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3}, /* 2; PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ - {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 11; PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 12; PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 13; PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 14; PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 15; PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 16; PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 17; PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 18; PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 19; PG_TCVN */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 20; PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 21; PG_KOI8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 23; PG_ALT */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 24; ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 25; ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ - {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3}, /* 2; PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3}, /* 4; PG_EUC_TW */ + {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 5; PG_JOHAB */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 3}, /* 6; PG_UNICODE */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_TCVN */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_KOI8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_ALT */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; PG_WIN1250 */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 29; PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 30; PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 31; PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 32; PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 33; PG_GB18030 */ }; /* returns the byte length of a word for mule internal code */ int pg_mic_mblen(const unsigned char *mbstr) { - return (pg_mule_mblen(mbstr)); + return pg_mule_mblen(mbstr); } /* - * Returns the byte length of a multibyte word. + * Returns the byte length of a multibyte character. */ int pg_encoding_mblen(int encoding, const unsigned char *mbstr) @@ -769,12 +1149,12 @@ pg_encoding_mblen(int encoding, const unsigned char *mbstr) return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].mblen) (mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr))); + ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); } /* - * Returns the display length of a multibyte word. + * Returns the display length of a multibyte character. */ int pg_encoding_dsplen(int encoding, const unsigned char *mbstr) @@ -783,12 +1163,28 @@ pg_encoding_dsplen(int encoding, const unsigned char *mbstr) return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].dsplen) (mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].dsplen) (mbstr))); + ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); } /* - * fetch maximum length of a char encoding + * Verify the first multibyte character of the given string. + * Return its byte length if good, -1 if bad. (See comments above for + * full details of the mbverify API.) + */ +int +pg_encoding_verifymb(int encoding, const char *mbstr, int len) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) : + ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len))); +} + +/* + * fetch maximum length of a given encoding */ int pg_encoding_max_length(int encoding) @@ -801,78 +1197,7 @@ pg_encoding_max_length(int encoding) #ifndef FRONTEND /* - * Verify mbstr to make sure that it has a valid character sequence. - * mbstr is not necessarily NULL terminated; length of mbstr is - * specified by len. - * - * If OK, return TRUE. If a problem is found, return FALSE when noError is - * true; when noError is false, ereport() a descriptive message. - */ -bool -pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) -{ - int l; - int i; - int encoding; - - /* we do not need any check in single-byte encodings */ - if (pg_database_encoding_max_length() <= 1) - return true; - - encoding = GetDatabaseEncoding(); - - while (len > 0 && *mbstr) - { - /* special UTF-8 check */ - if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) - { - if (noError) - return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); - } - - l = pg_mblen(mbstr); - - for (i = 1; i < l; i++) - { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || (mbstr[i] & 0x80) == 0) - { - char buf[8 * 2 + 1]; - char *p = buf; - int j, - jlimit; - - if (noError) - return false; - - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", mbstr[j]); - - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); - } - } - - len -= l; - mbstr += l; - } - - return true; -} - -/* - * fetch maximum length of a char encoding for the current database + * fetch maximum length of the encoding for the current database */ int pg_database_encoding_max_length(void) @@ -880,4 +1205,139 @@ pg_database_encoding_max_length(void) return pg_wchar_table[GetDatabaseEncoding()].maxmblen; } +/* + * Verify mbstr to make sure that it is validly encoded in the current + * database encoding. Otherwise same as pg_verify_mbstr(). + */ +bool +pg_verifymbstr(const char *mbstr, int len, bool noError) +{ + return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError); +} + +/* + * Verify mbstr to make sure that it is validly encoded in the specified + * encoding. + * + * mbstr is not necessarily zero terminated; length of mbstr is + * specified by len. + * + * If OK, return TRUE. If a problem is found, return FALSE when noError is + * true; when noError is false, ereport() a descriptive message. + */ +bool +pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) +{ + mbverifier mbverify; + + Assert(PG_VALID_ENCODING(encoding)); + + /* + * In single-byte encodings, we need only reject nulls (\0). + */ + if (pg_encoding_max_length(encoding) <= 1) + { + const char *nullpos = memchr(mbstr, 0, len); + + if (nullpos == NULL) + return true; + if (noError) + return false; + report_invalid_encoding(encoding, nullpos, 1); + } + + /* fetch function pointer just once */ + mbverify = pg_wchar_table[encoding].mbverify; + + while (len > 0) + { + int l; + + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(*mbstr)) + { + if (*mbstr != '\0') + { + mbstr++; + len--; + continue; + } + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); + } + + l = (*mbverify) ((const unsigned char *) mbstr, len); + + if (l < 0) + { + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); + } + + mbstr += l; + len -= l; + } + return true; +} + +/* + * report_invalid_encoding: complain about invalid multibyte character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. + */ +void +report_invalid_encoding(int encoding, const char *mbstr, int len) +{ + int l = pg_encoding_mblen(encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + pg_enc2name_tbl[encoding].name, + buf))); +} + +/* + * report_untranslatable_char: complain about untranslatable character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. + */ +void +report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len) +{ + int l = pg_encoding_mblen(src_encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", + buf, + pg_enc2name_tbl[src_encoding].name, + pg_enc2name_tbl[dest_encoding].name))); +} + #endif diff --git a/src/include/c.h b/src/include/c.h index 08ce6dd243..9bdbcd8184 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -12,7 +12,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/c.h,v 1.178.4.1 2005/07/18 15:53:46 tgl Exp $ + * $PostgreSQL: pgsql/src/include/c.h,v 1.178.4.2 2006/05/21 20:06:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -670,6 +670,8 @@ typedef NameData *Name; /* msb for char */ #define CSIGNBIT (0x80) +#define HIGHBIT (0x80) +#define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) #define STATUS_OK (0) #define STATUS_ERROR (-1) diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 568679bf44..17e68edf6c 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.56 2004/12/04 18:19:33 momjian Exp $ */ +/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.56.4.1 2006/05/21 20:06:18 tgl Exp $ */ #ifndef PG_WCHAR_H #define PG_WCHAR_H @@ -23,11 +23,17 @@ typedef unsigned int pg_wchar; #define SS2 0x8e /* single shift 2 (JIS0201) */ #define SS3 0x8f /* single shift 3 (JIS0212) */ +/* + * SJIS validation macros + */ +#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc)) +#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc)) + /* * Leading byte types or leading prefix byte for MULE internal code. * See http://www.xemacs.org for more details. (there is a doc titled * "XEmacs Internals Manual", "MULE Character Sets and Encodings" - * section. + * section.) */ /* * Is a leading byte for "official" single byte encodings? @@ -64,7 +70,7 @@ typedef unsigned int pg_wchar; #define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ #define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ #define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ -/* Note that 0x8b seems to be unused in as of Emacs 20.7. +/* Note that 0x8b seems to be unused as of Emacs 20.7. * However, there might be a chance that 0x8b could be used * in later version of Emacs. */ @@ -137,13 +143,13 @@ typedef unsigned int pg_wchar; /* #define FREE 0xff free (unused) */ /* - * Encoding numeral identificators + * PostgreSQL encoding identifiers * * WARNING: the order of this table must be same as order * in the pg_enc2name[] (mb/encnames.c) array! * - * If you add some encoding don'y forget check - * PG_ENCODING_[BE|FE]_LAST macros. + * If you add some encoding don't forget to check + * PG_ENCODING_BE_LAST macro. * * The PG_SQL_ASCII is default encoding and must be = 0. */ @@ -199,14 +205,13 @@ typedef enum pg_enc #define PG_VALID_BE_ENCODING(_enc) \ ((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST) -#define PG_ENCODING_IS_CLIEN_ONLY(_enc) \ - (((_enc) > PG_ENCODING_BE_LAST && (_enc) <= PG_ENCODING_FE_LAST) +#define PG_ENCODING_IS_CLIENT_ONLY(_enc) \ + ((_enc) > PG_ENCODING_BE_LAST && (_enc) <= PG_ENCODING_FE_LAST) #define PG_VALID_ENCODING(_enc) \ ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) -/* On FE are possible all encodings - */ +/* On FE are possible all encodings */ #define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) /* @@ -246,18 +251,21 @@ extern const char *pg_encoding_to_char(int encoding); typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, pg_wchar *to, int len); + typedef int (*mblen_converter) (const unsigned char *mbstr); typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr); +typedef int (*mbverifier) (const unsigned char *mbstr, int len); + typedef struct { mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte * string to a wchar */ - mblen_converter mblen; /* returns the length of a multibyte char */ - mbdisplaylen_converter dsplen; /* returns the lenghth of a - * display length */ - int maxmblen; /* max bytes for a char in this charset */ + mblen_converter mblen; /* get byte length of a char */ + mbdisplaylen_converter dsplen; /* get display width of a char */ + mbverifier mbverify; /* verify multibyte sequence */ + int maxmblen; /* max bytes for a char in this encoding */ } pg_wchar_tbl; extern pg_wchar_tbl pg_wchar_table[]; @@ -290,6 +298,7 @@ extern int pg_mblen(const unsigned char *mbstr); extern int pg_dsplen(const unsigned char *mbstr); extern int pg_encoding_mblen(int encoding, const unsigned char *mbstr); extern int pg_encoding_dsplen(int encoding, const unsigned char *mbstr); +extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); extern int pg_mule_mblen(const unsigned char *mbstr); extern int pg_mic_mblen(const unsigned char *mbstr); extern int pg_mbstrlen(const unsigned char *mbstr); @@ -323,20 +332,33 @@ extern unsigned char *pg_server_to_client(unsigned char *s, int len); extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); -extern void LocalToUtf(unsigned char *iso, unsigned char *utf, - pg_local_to_utf *map, int size, int encoding, int len); +extern void LocalToUtf(const unsigned char *iso, unsigned char *utf, + const pg_local_to_utf *map, int size, int encoding, int len); -extern void UtfToLocal(unsigned char *utf, unsigned char *iso, - pg_utf_to_local *map, int size, int len); +extern void UtfToLocal(const unsigned char *utf, unsigned char *iso, + const pg_utf_to_local *map, int size, int encoding, int len); -extern bool pg_verifymbstr(const unsigned char *mbstr, int len, bool noError); +extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); +extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, + bool noError); -extern void pg_ascii2mic(unsigned char *src, unsigned char *dest, int len); -extern void pg_mic2ascii(unsigned char *src, unsigned char *dest, int len); -extern void pg_print_bogus_char(unsigned char **mic, unsigned char **p); -extern void latin2mic(unsigned char *l, unsigned char *p, int len, int lc); -extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); -extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); -extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); +extern void report_invalid_encoding(int encoding, const char *mbstr, int len); +extern void report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len); + +extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len); +extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len); +extern void latin2mic(const unsigned char *l, unsigned char *p, int len, + int lc, int encoding); +extern void mic2latin(const unsigned char *mic, unsigned char *p, int len, + int lc, int encoding); +extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, + int len, int lc, int encoding, + const unsigned char *tab); +extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, + int len, int lc, int encoding, + const unsigned char *tab); + +extern bool pg_utf8_islegal(const unsigned char *source, int length); #endif /* PG_WCHAR_H */