From 419fe7cd1b2e658dfec236422308a21cab6c6c30 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 20 Nov 2006 14:03:30 +0000 Subject: [PATCH] Fix bug http://archives.postgresql.org/pgsql-bugs/2006-10/msg00258.php. Fix string's length calculation for recoding, fix lowerstr() to avoid a wrong assumption about the length of the recoded string (was: recoded string is no greater than source, which may not be true for multibyte encodings) Thanks to Thomas H. and Magnus Hagander --- contrib/tsearch2/dict_ex.c | 6 ++- contrib/tsearch2/dict_snowball.c | 8 ++-- contrib/tsearch2/dict_syn.c | 11 +++-- contrib/tsearch2/ispell/spell.c | 75 +++++++++++++++++++------------ contrib/tsearch2/stopword.c | 16 ++++--- contrib/tsearch2/ts_locale.c | 77 ++++++++++++++++++++++++-------- 6 files changed, 131 insertions(+), 62 deletions(-) diff --git a/contrib/tsearch2/dict_ex.c b/contrib/tsearch2/dict_ex.c index ccb7f3fcbe..2fd5cbb700 100644 --- a/contrib/tsearch2/dict_ex.c +++ b/contrib/tsearch2/dict_ex.c @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.8 2006/03/11 04:38:30 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.9 2006/11/20 14:03:30 teodor Exp $ */ /* * example of dictionary @@ -52,9 +52,11 @@ dex_lexize(PG_FUNCTION_ARGS) { DictExample *d = (DictExample *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); - char *txt = pnstrdup(in, PG_GETARG_INT32(2)); + char *utxt = pnstrdup(in, PG_GETARG_INT32(2)); TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + char *txt = lowerstr(utxt); + pfree(utxt); memset(res, 0, sizeof(TSLexeme) * 2); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) diff --git a/contrib/tsearch2/dict_snowball.c b/contrib/tsearch2/dict_snowball.c index f983ae8e13..6667744824 100644 --- a/contrib/tsearch2/dict_snowball.c +++ b/contrib/tsearch2/dict_snowball.c @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.12 2006/07/11 16:35:31 momjian Exp $ */ +/* $PostgreSQL: 
pgsql/contrib/tsearch2/dict_snowball.c,v 1.13 2006/11/20 14:03:30 teodor Exp $ */ /* * example of Snowball dictionary @@ -142,9 +142,11 @@ snb_lexize(PG_FUNCTION_ARGS) { DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); - char *txt = pnstrdup(in, PG_GETARG_INT32(2)); + char *utxt = pnstrdup(in, PG_GETARG_INT32(2)); TSLexeme *res = palloc(sizeof(TSLexeme) * 2); - + char *txt = lowerstr(utxt); + + pfree(utxt); memset(res, 0, sizeof(TSLexeme) * 2); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) { diff --git a/contrib/tsearch2/dict_syn.c b/contrib/tsearch2/dict_syn.c index d19686d63e..cddbd47350 100644 --- a/contrib/tsearch2/dict_syn.c +++ b/contrib/tsearch2/dict_syn.c @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.9 2006/03/11 04:38:30 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.10 2006/11/20 14:03:30 teodor Exp $ */ /* * ISpell interface @@ -132,8 +132,8 @@ syn_init(PG_FUNCTION_ARGS) continue; *end = '\0'; - d->syn[cur].in = strdup(lowerstr(starti)); - d->syn[cur].out = strdup(lowerstr(starto)); + d->syn[cur].in = lowerstr(starti); + d->syn[cur].out = lowerstr(starto); if (!(d->syn[cur].in && d->syn[cur].out)) { fclose(fin); @@ -163,12 +163,15 @@ syn_lexize(PG_FUNCTION_ARGS) Syn key, *found; TSLexeme *res = NULL; + char *wrd; if (!PG_GETARG_INT32(2)) PG_RETURN_POINTER(NULL); key.out = NULL; - key.in = lowerstr(pnstrdup(in, PG_GETARG_INT32(2))); + wrd = pnstrdup(in, PG_GETARG_INT32(2)); + key.in = lowerstr(wrd); + pfree(wrd); found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn); pfree(key.in); diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index 9e4d689cd4..6eedc7f342 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -147,7 +147,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag) int NIImportDictionary(IspellDict * Conf, const char *filename) { - char 
str[BUFSIZ]; + char str[BUFSIZ], *pstr; FILE *dict; if (!(dict = fopen(filename, "r"))) @@ -190,9 +190,10 @@ NIImportDictionary(IspellDict * Conf, const char *filename) } s += pg_mblen(s); } - lowerstr(str); + pstr = lowerstr(str); - NIAddSpell(Conf, str, flag); + NIAddSpell(Conf, pstr, flag); + pfree(pstr); } fclose(dict); return (0); @@ -418,8 +419,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, int line) int NIImportAffixes(IspellDict * Conf, const char *filename) { - char str[BUFSIZ]; - char tmpstr[BUFSIZ]; + char str[BUFSIZ], *pstr = NULL; char mask[BUFSIZ]; char find[BUFSIZ]; char repl[BUFSIZ]; @@ -439,11 +439,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename) while (fgets(str, sizeof(str), affix)) { line++; + if ( *str == '#' || *str == '\n' ) + continue; + pg_verifymbstr(str, strlen(str), false); - memcpy(tmpstr, str, 32); /* compoundwords... */ - tmpstr[32] = '\0'; - lowerstr(tmpstr); - if (STRNCMP(tmpstr, "compoundwords") == 0) + if ( pstr ) + pfree( pstr ); + pstr = lowerstr(str); + if (STRNCMP(pstr, "compoundwords") == 0) { s = findchar(str, 'l'); if (s) @@ -458,21 +461,21 @@ NIImportAffixes(IspellDict * Conf, const char *filename) continue; } } - if (STRNCMP(tmpstr, "suffixes") == 0) + if (STRNCMP(pstr, "suffixes") == 0) { suffixes = 1; prefixes = 0; oldformat++; continue; } - if (STRNCMP(tmpstr, "prefixes") == 0) + if (STRNCMP(pstr, "prefixes") == 0) { suffixes = 0; prefixes = 1; oldformat++; continue; } - if (STRNCMP(tmpstr, "flag") == 0) + if (STRNCMP(pstr, "flag") == 0) { s = str + 4; flagflags = 0; @@ -523,14 +526,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename) if ((!suffixes) && (!prefixes)) continue; - lowerstr(str); - if (!parse_affentry(str, mask, find, repl, line)) + if (!parse_affentry(pstr, mask, find, repl, line)) continue; NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? 
FF_SUFFIX : FF_PREFIX); } fclose(affix); + if ( pstr ) + pfree( pstr ); + return (0); } @@ -538,11 +543,11 @@ int NIImportOOAffixes(IspellDict * Conf, const char *filename) { char str[BUFSIZ]; - char type[BUFSIZ]; + char type[BUFSIZ], *ptype = NULL; char sflag[BUFSIZ]; - char mask[BUFSIZ]; - char find[BUFSIZ]; - char repl[BUFSIZ]; + char mask[BUFSIZ], *pmask; + char find[BUFSIZ], *pfind; + char repl[BUFSIZ], *prepl; bool isSuffix = false; int flag = 0; char flagflags = 0; @@ -577,8 +582,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask); - lowerstr(type); - if (scanread < 4 || (STRNCMP(type, "sfx") && STRNCMP(type, "pfx"))) + if (ptype) + pfree(ptype); + ptype = lowerstr(type); + if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) continue; if (scanread == 4) @@ -586,29 +593,35 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) if (strlen(sflag) != 1) continue; flag = *sflag; - isSuffix = (STRNCMP(type, "sfx") == 0) ? true : false; - lowerstr(find); + isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; + pfind = lowerstr(find); if (t_iseq(find, 'y')) flagflags |= FF_CROSSPRODUCT; else flagflags = 0; + pfree(pfind); } else { if (strlen(sflag) != 1 || flag != *sflag || flag == 0) continue; - lowerstr(repl); - lowerstr(find); - lowerstr(mask); + prepl = lowerstr(repl); + pfind = lowerstr(find); + pmask = lowerstr(mask); if (t_iseq(find, '0')) *find = '\0'; if (t_iseq(repl, '0')) *repl = '\0'; NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? 
FF_SUFFIX : FF_PREFIX); + pfree(prepl); + pfree(pfind); + pfree(pmask); } } + if (ptype) + pfree(ptype); fclose(affix); return 0; @@ -1053,7 +1066,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) if (wrdlen > MAXNORMLEN) return NULL; - lowerstr(word); cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); *cur = NULL; @@ -1354,13 +1366,17 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, } TSLexeme * -NINormalizeWord(IspellDict * Conf, char *word) +NINormalizeWord(IspellDict * Conf, char *uword) { - char **res = NormalizeSubWord(Conf, word, 0); + char **res; + char *word; TSLexeme *lcur = NULL, *lres = NULL; uint16 NVariant = 1; + word = lowerstr(uword); + res = NormalizeSubWord(Conf, word, 0); + if (res) { char **ptr = res; @@ -1431,6 +1447,9 @@ NINormalizeWord(IspellDict * Conf, char *word) var = ptr; } } + + pfree(word); + return lres; } diff --git a/contrib/tsearch2/stopword.c b/contrib/tsearch2/stopword.c index 73db8abba6..b9b7699594 100644 --- a/contrib/tsearch2/stopword.c +++ b/contrib/tsearch2/stopword.c @@ -36,7 +36,7 @@ readstoplist(text *in, StopList * s) { char *filename = to_absfilename(text2char(in)); FILE *hin; - char buf[STOPBUFLEN]; + char buf[STOPBUFLEN], *pbuf; int reallen = 0; if ((hin = fopen(filename, "r")) == NULL) @@ -49,7 +49,6 @@ readstoplist(text *in, StopList * s) { buf[strlen(buf) - 1] = '\0'; pg_verifymbstr(buf, strlen(buf), false); - lowerstr(buf); if (*buf == '\0') continue; @@ -70,7 +69,14 @@ readstoplist(text *in, StopList * s) stop = tmp; } - stop[s->len] = strdup(buf); + if (s->wordop) + { + pbuf = s->wordop(buf); + stop[s->len] = strdup(pbuf); + pfree(pbuf); + } else + stop[s->len] = strdup(buf); + if (!stop[s->len]) { freestoplist(s); @@ -79,8 +85,6 @@ readstoplist(text *in, StopList * s) (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } - if (s->wordop) - stop[s->len] = (s->wordop) (stop[s->len]); (s->len)++; } @@ -106,7 +110,5 @@ sortstoplist(StopList * s) bool 
searchstoplist(StopList * s, char *key) { - if (s->wordop) - key = (*(s->wordop)) (key); return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? true : false; } diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c index 203c977e4e..cac5317a10 100644 --- a/contrib/tsearch2/ts_locale.c +++ b/contrib/tsearch2/ts_locale.c @@ -14,21 +14,12 @@ wchar2char(char *to, const wchar_t *from, size_t len) { if (GetDatabaseEncoding() == PG_UTF8) { - int r, - nbytes; + int r; if (len == 0) return 0; - /* in any case, *to should be allocated with enough space */ - nbytes = WideCharToMultiByte(CP_UTF8, 0, from, len, NULL, 0, NULL, NULL); - if (nbytes == 0) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("UTF-16 to UTF-8 translation failed: %lu", - GetLastError()))); - - r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes, + r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, NULL, NULL); if (r == 0) @@ -36,6 +27,8 @@ wchar2char(char *to, const wchar_t *from, size_t len) (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("UTF-16 to UTF-8 translation failed: %lu", GetLastError()))); + Assert(r <= len); + return r; } @@ -56,7 +49,7 @@ char2wchar(wchar_t *to, const char *from, size_t len) if (!r) { - pg_verifymbstr(from, len, false); + pg_verifymbstr(from, strlen(from), false); ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid multibyte character for locale"), @@ -97,6 +90,11 @@ char * lowerstr(char *str) { char *ptr = str; + char *out; + int len = strlen(str); + + if ( len == 0 ) + return pstrdup(""); #ifdef TS_USE_WIDE @@ -110,24 +108,67 @@ lowerstr(char *str) { wchar_t *wstr, *wptr; - int len = strlen(str); + int wlen; + + /* + * alloc number of wchar_t for the worst case: len contains the + * number of bytes, which is >= the number of characters; + * alloc 1 more wchar_t for 0, because wchar2char (really wcstombs) + * wants a zero-terminated string + */ + wptr = wstr = (wchar_t *) 
palloc(sizeof(wchar_t) * (len+1)); + + /* + * str SHOULD be cstring, so wlen contains number + * of converted character + */ + wlen = char2wchar(wstr, str, len); + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("transalation failed from server encoding to wchar_t"))); + + Assert(wlen<=len); + wstr[wlen] = 0; - wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); - char2wchar(wstr, str, len + 1); while (*wptr) { *wptr = towlower((wint_t) *wptr); wptr++; } - wchar2char(str, wstr, len); + + /* + * Alloc result string for worst case + '\0' + */ + len = sizeof(char)*pg_database_encoding_max_length()*(wlen+1); + out = (char*)palloc(len); + + /* + * wlen now is number of bytes which is always >= number of characters + */ + wlen = wchar2char(out, wstr, len); pfree(wstr); + + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("transalation failed from wchar_t to server encoding %d", errno))); + Assert(wlen<=len); + out[wlen]='\0'; } else #endif + { + char *outptr; + + outptr = out = (char*)palloc( sizeof(char) * (len+1) ); while (*ptr) { - *ptr = tolower(*(unsigned char *) ptr); + *outptr++ = tolower(*(unsigned char *) ptr); ptr++; } - return str; + *outptr = '\0'; + } + + return out; }