From 654dcfb9e4b6ba44df63fcb0c73403a82f05338c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 9 Nov 2007 22:37:35 +0000 Subject: [PATCH] Clean up ts_locale.h/.c. Fix broken and not-consistent-across-platforms behavior of wchar2char/char2wchar; this should resolve bug #3730. Avoid excess computations of pg_mblen in t_isalpha and friends. Const-ify APIs where possible. --- src/backend/tsearch/ts_locale.c | 187 +++++++++++++++++++----------- src/backend/tsearch/ts_utils.c | 4 +- src/backend/tsearch/wparser_def.c | 6 +- src/include/tsearch/ts_locale.h | 63 ++++------ src/include/tsearch/ts_public.h | 4 +- 5 files changed, 151 insertions(+), 113 deletions(-) diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index 361152e6be..784cc17edd 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * ts_locale.c - * locale compatiblility layer for tsearch + * locale compatibility layer for tsearch * * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -16,41 +16,56 @@ #include "tsearch/ts_locale.h" #include "tsearch/ts_public.h" + #ifdef TS_USE_WIDE -#ifdef WIN32 - +/* + * wchar2char --- convert wide characters to multibyte format + * + * This has the same API as the standard wcstombs() function; in particular, + * tolen is the maximum number of bytes to store at *to, and *from should be + * zero-terminated. The output will be zero-terminated iff there is room. + */ size_t -wchar2char(char *to, const wchar_t *from, size_t len) +wchar2char(char *to, const wchar_t *from, size_t tolen) { - if (len == 0) + if (tolen == 0) return 0; +#ifdef WIN32 if (GetDatabaseEncoding() == PG_UTF8) { int r; - r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, + r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen, NULL, NULL); - if (r == 0) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("UTF-16 to UTF-8 translation failed: %lu", - GetLastError()))); - Assert(r <= len); + if (r <= 0) + return (size_t) -1; - return r; + Assert(r <= tolen); + + /* Microsoft counts the zero terminator in the result */ + return r-1; } - - return wcstombs(to, from, len); -} #endif /* WIN32 */ + return wcstombs(to, from, tolen); +} + +/* + * char2wchar --- convert multibyte characters to wide characters + * + * This has almost the API of mbstowcs(), except that *from need not be + * null-terminated; instead, the number of input bytes is specified as + * fromlen. Also, we ereport() rather than returning -1 for invalid + * input encoding. tolen is the maximum number of wchar_t's to store at *to. + * The output will be zero-terminated iff there is room. + */ size_t -char2wchar(wchar_t *to, const char *from, size_t len) +char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen) { - if (len == 0) + if (tolen == 0) return 0; #ifdef WIN32 @@ -58,71 +73,117 @@ char2wchar(wchar_t *to, const char *from, size_t len) { int r; - r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); + r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen); - if (!r) + if (r <= 0) { - pg_verifymbstr(from, len, false); + pg_verifymbstr(from, fromlen, false); ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid multibyte character for locale"), errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); } - Assert(r <= len); + Assert(r <= tolen); - return r; + /* Microsoft counts the zero terminator in the result */ + return r-1; } - else #endif /* WIN32 */ + if (lc_ctype_is_c()) { /* * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be * allocated with sufficient space */ - return pg_mb2wchar_with_len(from, (pg_wchar *) to, len); + return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen); } else { /* - * mbstowcs require ending '\0' + * mbstowcs requires ending '\0' */ - char *str = pnstrdup(from, len); - size_t tolen; + char *str = pnstrdup(from, fromlen); + size_t result; + + result = mbstowcs(to, str, tolen); - tolen = mbstowcs(to, str, len); pfree(str); - return tolen; + if (result == (size_t) -1) + { + pg_verifymbstr(from, fromlen, false); + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid multibyte character for locale"), + errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); + } + + if (result < tolen) + to[result] = 0; + + return result; } } + int -_t_isalpha(const char *ptr) +t_isdigit(const char *ptr) { + int clen = pg_mblen(ptr); wchar_t character[2]; - if (lc_ctype_is_c()) + if (clen == 1 || lc_ctype_is_c()) + return isdigit(TOUCHAR(ptr)); + + char2wchar(character, 2, ptr, clen); + + return iswdigit((wint_t) character[0]); +} + +int +t_isspace(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[2]; + + if (clen == 1 || lc_ctype_is_c()) + return isspace(TOUCHAR(ptr)); + + char2wchar(character, 2, ptr, clen); + + return iswspace((wint_t) character[0]); +} + +int +t_isalpha(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[2]; + + if (clen == 1 || lc_ctype_is_c()) return isalpha(TOUCHAR(ptr)); - char2wchar(character, ptr, 1); + char2wchar(character, 2, ptr, clen); - return iswalpha((wint_t) *character); + return iswalpha((wint_t) character[0]); } int -_t_isprint(const char *ptr) +t_isprint(const char *ptr) { + int clen = pg_mblen(ptr); wchar_t character[2]; - if (lc_ctype_is_c()) + if (clen == 1 || lc_ctype_is_c()) return isprint(TOUCHAR(ptr)); - char2wchar(character, ptr, 1); + char2wchar(character, 2, ptr, clen); - return iswprint((wint_t) *character); + return iswprint((wint_t) character[0]); } + #endif /* TS_USE_WIDE */ @@ -168,19 +229,27 @@ t_readline(FILE *fp) return recoded; } +/* + * lowerstr --- fold null-terminated string to lower case + * + * Returned string is palloc'd + */ char * -lowerstr(char *str) +lowerstr(const char *str) { return lowerstr_with_len(str, strlen(str)); } /* + * lowerstr_with_len --- fold string to lower case + * + * Input string need not be null-terminated. + * * Returned string is palloc'd */ char * -lowerstr_with_len(char *str, int len) +lowerstr_with_len(const char *str, int len) { - char *ptr = str; char *out; if (len == 0) @@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len) /* * alloc number of wchar_t for worst case, len contains number of - * bytes <= number of characters and alloc 1 wchar_t for 0, because - * wchar2char(wcstombs in really) wants zero-terminated string + * bytes >= number of characters and alloc 1 wchar_t for 0, because + * wchar2char wants zero-terminated string */ wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); - /* - * str SHOULD be cstring, so wlen contains number of converted - * character - */ - wlen = char2wchar(wstr, str, len); - if (wlen < 0) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("translation failed from server encoding to wchar_t"))); - + wlen = char2wchar(wstr, len+1, str, len); Assert(wlen <= len); - wstr[wlen] = 0; while (*wptr) { @@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len) /* * Alloc result string for worst case + '\0' */ - len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1); + len = pg_database_encoding_max_length() * wlen + 1; out = (char *) palloc(len); - /* - * wlen now is number of bytes which is always >= number of characters - */ wlen = wchar2char(out, wstr, len); + pfree(wstr); if (wlen < 0) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("translation failed from wchar_t to server encoding %d", errno))); - Assert(wlen <= len); - out[wlen] = '\0'; + errmsg("translation from wchar_t to server encoding failed: %m"))); + Assert(wlen < len); } else -#endif +#endif /* TS_USE_WIDE */ { + const char *ptr = str; char *outptr; outptr = out = (char *) palloc(sizeof(char) * (len + 1)); - while (*ptr && ptr - str < len) + while ((ptr - str) < len && *ptr) { - *outptr++ = tolower(*(unsigned char *) ptr); + *outptr++ = tolower(TOUCHAR(ptr)); ptr++; } *outptr = '\0'; diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 781146886a..6c98947420 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -75,7 +75,7 @@ comparestr(const void *a, const void *b) * or palloc a new version. */ void -readstoplist(const char *fname, StopList *s, char *(*wordop) (char *)) +readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) { char **stop = NULL; diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 086ac95155..b79056ca68 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -294,12 +294,12 @@ TParserInit(char *str, int len) /* * Use wide char code only when max encoding length > 1. */ - if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); - prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); + prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1, + prs->str, prs->lenstr); } else #endif diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index dcae2af93a..cea3830a0f 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -1,15 +1,14 @@ /*------------------------------------------------------------------------- * * ts_locale.h - * helper utilities for tsearch + * locale compatibility layer for tsearch * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.3 2007/11/09 22:37:35 tgl Exp $ * *------------------------------------------------------------------------- */ - #ifndef __TSLOCALE_H__ #define __TSLOCALE_H__ @@ -34,55 +33,37 @@ #define TS_USE_WIDE #endif -#define TOUCHAR(x) (*((unsigned char*)(x))) +#define TOUCHAR(x) (*((const unsigned char *) (x))) #ifdef TS_USE_WIDE -extern size_t char2wchar(wchar_t *to, const char *from, size_t len); +extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen); +extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen); -#ifdef WIN32 +extern int t_isdigit(const char *ptr); +extern int t_isspace(const char *ptr); +extern int t_isalpha(const char *ptr); +extern int t_isprint(const char *ptr); -extern size_t wchar2char(char *to, const wchar_t *from, size_t len); -#else /* WIN32 */ +/* The second argument of t_iseq() must be a plain ASCII character */ +#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) -/* correct wcstombs */ -#define wchar2char wcstombs +#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) -#endif /* WIN32 */ +#else /* not TS_USE_WIDE */ -#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) -#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) ) -extern int _t_isalpha(const char *ptr); +#define t_isdigit(x) isdigit(TOUCHAR(x)) +#define t_isspace(x) isspace(TOUCHAR(x)) +#define t_isalpha(x) isalpha(TOUCHAR(x)) +#define t_isprint(x) isprint(TOUCHAR(x)) +#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) -#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) ) -extern int _t_isprint(const char *ptr); +#define COPYCHAR(d,s) (*((unsigned char *) (d)) = TOUCHAR(s)) -#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) ) -/* - * t_iseq() should be called only for ASCII symbols - */ -#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) +#endif /* TS_USE_WIDE */ -#define COPYCHAR(d,s) do { \ - int lll = pg_mblen( s ); \ - \ - while( lll-- ) \ - TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ -} while(0) - -#else /* not def TS_USE_WIDE */ - -#define t_isdigit(x) isdigit( TOUCHAR(x) ) -#define t_isspace(x) isspace( TOUCHAR(x) ) -#define t_isalpha(x) isalpha( TOUCHAR(x) ) -#define t_isprint(x) isprint( TOUCHAR(x) ) -#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) ) - -#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s) -#endif - -extern char *lowerstr(char *str); -extern char *lowerstr_with_len(char *str, int len); +extern char *lowerstr(const char *str); +extern char *lowerstr_with_len(const char *str, int len); extern char *t_readline(FILE *fp); #endif /* __TSLOCALE_H__ */ diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index ab19de7924..92736c4e1b 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -6,7 +6,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.4 2007/09/07 15:09:56 teodor Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.5 2007/11/09 22:37:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -74,7 +74,7 @@ typedef struct } StopList; extern void readstoplist(const char *fname, StopList *s, - char *(*wordop) (char *)); + char *(*wordop) (const char *)); extern bool searchstoplist(StopList *s, char *key); /*