mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-11-27 07:21:09 +08:00
Fix the string length calculation for recoding, and fix strlower() to avoid a wrong assumption about the length of the recoded string (the old assumption was that the recoded string is no longer than the source, which may not be true for multibyte encodings). Thanks to Thomas H. <me@alternize.com> and Magnus Hagander <mha@sollentuna.net>
This commit is contained in:
parent
1a5c450f30
commit
419fe7cd1b
@ -1,4 +1,4 @@
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.8 2006/03/11 04:38:30 momjian Exp $ */
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.9 2006/11/20 14:03:30 teodor Exp $ */
|
||||
|
||||
/*
|
||||
* example of dictionary
|
||||
@ -52,9 +52,11 @@ dex_lexize(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
|
||||
char *in = (char *) PG_GETARG_POINTER(1);
|
||||
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
|
||||
char *utxt = pnstrdup(in, PG_GETARG_INT32(2));
|
||||
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
|
||||
char *txt = lowerstr(utxt);
|
||||
|
||||
pfree(utxt);
|
||||
memset(res, 0, sizeof(TSLexeme) * 2);
|
||||
|
||||
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.12 2006/07/11 16:35:31 momjian Exp $ */
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.13 2006/11/20 14:03:30 teodor Exp $ */
|
||||
|
||||
/*
|
||||
* example of Snowball dictionary
|
||||
@ -142,9 +142,11 @@ snb_lexize(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
|
||||
char *in = (char *) PG_GETARG_POINTER(1);
|
||||
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
|
||||
char *utxt = pnstrdup(in, PG_GETARG_INT32(2));
|
||||
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
|
||||
|
||||
char *txt = lowerstr(utxt);
|
||||
|
||||
pfree(utxt);
|
||||
memset(res, 0, sizeof(TSLexeme) * 2);
|
||||
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
|
||||
{
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.9 2006/03/11 04:38:30 momjian Exp $ */
|
||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.10 2006/11/20 14:03:30 teodor Exp $ */
|
||||
|
||||
/*
|
||||
* ISpell interface
|
||||
@ -132,8 +132,8 @@ syn_init(PG_FUNCTION_ARGS)
|
||||
continue;
|
||||
*end = '\0';
|
||||
|
||||
d->syn[cur].in = strdup(lowerstr(starti));
|
||||
d->syn[cur].out = strdup(lowerstr(starto));
|
||||
d->syn[cur].in = lowerstr(starti);
|
||||
d->syn[cur].out = lowerstr(starto);
|
||||
if (!(d->syn[cur].in && d->syn[cur].out))
|
||||
{
|
||||
fclose(fin);
|
||||
@ -163,12 +163,15 @@ syn_lexize(PG_FUNCTION_ARGS)
|
||||
Syn key,
|
||||
*found;
|
||||
TSLexeme *res = NULL;
|
||||
char *wrd;
|
||||
|
||||
if (!PG_GETARG_INT32(2))
|
||||
PG_RETURN_POINTER(NULL);
|
||||
|
||||
key.out = NULL;
|
||||
key.in = lowerstr(pnstrdup(in, PG_GETARG_INT32(2)));
|
||||
wrd = pnstrdup(in, PG_GETARG_INT32(2));
|
||||
key.in = lowerstr(wrd);
|
||||
pfree(wrd);
|
||||
|
||||
found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
|
||||
pfree(key.in);
|
||||
|
@ -147,7 +147,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
|
||||
int
|
||||
NIImportDictionary(IspellDict * Conf, const char *filename)
|
||||
{
|
||||
char str[BUFSIZ];
|
||||
char str[BUFSIZ], *pstr;
|
||||
FILE *dict;
|
||||
|
||||
if (!(dict = fopen(filename, "r")))
|
||||
@ -190,9 +190,10 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
|
||||
}
|
||||
s += pg_mblen(s);
|
||||
}
|
||||
lowerstr(str);
|
||||
pstr = lowerstr(str);
|
||||
|
||||
NIAddSpell(Conf, str, flag);
|
||||
NIAddSpell(Conf, pstr, flag);
|
||||
pfree(pstr);
|
||||
}
|
||||
fclose(dict);
|
||||
return (0);
|
||||
@ -418,8 +419,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, int line)
|
||||
int
|
||||
NIImportAffixes(IspellDict * Conf, const char *filename)
|
||||
{
|
||||
char str[BUFSIZ];
|
||||
char tmpstr[BUFSIZ];
|
||||
char str[BUFSIZ], *pstr = NULL;
|
||||
char mask[BUFSIZ];
|
||||
char find[BUFSIZ];
|
||||
char repl[BUFSIZ];
|
||||
@ -439,11 +439,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
|
||||
while (fgets(str, sizeof(str), affix))
|
||||
{
|
||||
line++;
|
||||
if ( *str == '#' || *str == '\n' )
|
||||
continue;
|
||||
|
||||
pg_verifymbstr(str, strlen(str), false);
|
||||
memcpy(tmpstr, str, 32); /* compoundwords... */
|
||||
tmpstr[32] = '\0';
|
||||
lowerstr(tmpstr);
|
||||
if (STRNCMP(tmpstr, "compoundwords") == 0)
|
||||
if ( pstr )
|
||||
pfree( pstr );
|
||||
pstr = lowerstr(str);
|
||||
if (STRNCMP(pstr, "compoundwords") == 0)
|
||||
{
|
||||
s = findchar(str, 'l');
|
||||
if (s)
|
||||
@ -458,21 +461,21 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (STRNCMP(tmpstr, "suffixes") == 0)
|
||||
if (STRNCMP(pstr, "suffixes") == 0)
|
||||
{
|
||||
suffixes = 1;
|
||||
prefixes = 0;
|
||||
oldformat++;
|
||||
continue;
|
||||
}
|
||||
if (STRNCMP(tmpstr, "prefixes") == 0)
|
||||
if (STRNCMP(pstr, "prefixes") == 0)
|
||||
{
|
||||
suffixes = 0;
|
||||
prefixes = 1;
|
||||
oldformat++;
|
||||
continue;
|
||||
}
|
||||
if (STRNCMP(tmpstr, "flag") == 0)
|
||||
if (STRNCMP(pstr, "flag") == 0)
|
||||
{
|
||||
s = str + 4;
|
||||
flagflags = 0;
|
||||
@ -523,14 +526,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
|
||||
if ((!suffixes) && (!prefixes))
|
||||
continue;
|
||||
|
||||
lowerstr(str);
|
||||
if (!parse_affentry(str, mask, find, repl, line))
|
||||
if (!parse_affentry(pstr, mask, find, repl, line))
|
||||
continue;
|
||||
|
||||
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
|
||||
}
|
||||
fclose(affix);
|
||||
|
||||
if ( pstr )
|
||||
pfree( pstr );
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -538,11 +543,11 @@ int
|
||||
NIImportOOAffixes(IspellDict * Conf, const char *filename)
|
||||
{
|
||||
char str[BUFSIZ];
|
||||
char type[BUFSIZ];
|
||||
char type[BUFSIZ], *ptype = NULL;
|
||||
char sflag[BUFSIZ];
|
||||
char mask[BUFSIZ];
|
||||
char find[BUFSIZ];
|
||||
char repl[BUFSIZ];
|
||||
char mask[BUFSIZ], *pmask;
|
||||
char find[BUFSIZ], *pfind;
|
||||
char repl[BUFSIZ], *prepl;
|
||||
bool isSuffix = false;
|
||||
int flag = 0;
|
||||
char flagflags = 0;
|
||||
@ -577,8 +582,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
|
||||
|
||||
scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask);
|
||||
|
||||
lowerstr(type);
|
||||
if (scanread < 4 || (STRNCMP(type, "sfx") && STRNCMP(type, "pfx")))
|
||||
if (ptype)
|
||||
pfree(ptype);
|
||||
ptype = lowerstr(type);
|
||||
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
|
||||
continue;
|
||||
|
||||
if (scanread == 4)
|
||||
@ -586,29 +593,35 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
|
||||
if (strlen(sflag) != 1)
|
||||
continue;
|
||||
flag = *sflag;
|
||||
isSuffix = (STRNCMP(type, "sfx") == 0) ? true : false;
|
||||
lowerstr(find);
|
||||
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
|
||||
pfind = lowerstr(find);
|
||||
if (t_iseq(find, 'y'))
|
||||
flagflags |= FF_CROSSPRODUCT;
|
||||
else
|
||||
flagflags = 0;
|
||||
pfree(pfind);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
|
||||
continue;
|
||||
lowerstr(repl);
|
||||
lowerstr(find);
|
||||
lowerstr(mask);
|
||||
prepl = lowerstr(repl);
|
||||
pfind = lowerstr(find);
|
||||
pmask = lowerstr(mask);
|
||||
if (t_iseq(find, '0'))
|
||||
*find = '\0';
|
||||
if (t_iseq(repl, '0'))
|
||||
*repl = '\0';
|
||||
|
||||
NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? FF_SUFFIX : FF_PREFIX);
|
||||
pfree(prepl);
|
||||
pfree(pfind);
|
||||
pfree(pmask);
|
||||
}
|
||||
}
|
||||
|
||||
if (ptype)
|
||||
pfree(ptype);
|
||||
fclose(affix);
|
||||
|
||||
return 0;
|
||||
@ -1053,7 +1066,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
|
||||
|
||||
if (wrdlen > MAXNORMLEN)
|
||||
return NULL;
|
||||
lowerstr(word);
|
||||
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
|
||||
*cur = NULL;
|
||||
|
||||
@ -1354,13 +1366,17 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
|
||||
}
|
||||
|
||||
TSLexeme *
|
||||
NINormalizeWord(IspellDict * Conf, char *word)
|
||||
NINormalizeWord(IspellDict * Conf, char *uword)
|
||||
{
|
||||
char **res = NormalizeSubWord(Conf, word, 0);
|
||||
char **res;
|
||||
char *word;
|
||||
TSLexeme *lcur = NULL,
|
||||
*lres = NULL;
|
||||
uint16 NVariant = 1;
|
||||
|
||||
word = lowerstr(uword);
|
||||
res = NormalizeSubWord(Conf, word, 0);
|
||||
|
||||
if (res)
|
||||
{
|
||||
char **ptr = res;
|
||||
@ -1431,6 +1447,9 @@ NINormalizeWord(IspellDict * Conf, char *word)
|
||||
var = ptr;
|
||||
}
|
||||
}
|
||||
|
||||
pfree(word);
|
||||
|
||||
return lres;
|
||||
}
|
||||
|
||||
|
@ -36,7 +36,7 @@ readstoplist(text *in, StopList * s)
|
||||
{
|
||||
char *filename = to_absfilename(text2char(in));
|
||||
FILE *hin;
|
||||
char buf[STOPBUFLEN];
|
||||
char buf[STOPBUFLEN], *pbuf;
|
||||
int reallen = 0;
|
||||
|
||||
if ((hin = fopen(filename, "r")) == NULL)
|
||||
@ -49,7 +49,6 @@ readstoplist(text *in, StopList * s)
|
||||
{
|
||||
buf[strlen(buf) - 1] = '\0';
|
||||
pg_verifymbstr(buf, strlen(buf), false);
|
||||
lowerstr(buf);
|
||||
if (*buf == '\0')
|
||||
continue;
|
||||
|
||||
@ -70,7 +69,14 @@ readstoplist(text *in, StopList * s)
|
||||
stop = tmp;
|
||||
}
|
||||
|
||||
stop[s->len] = strdup(buf);
|
||||
if (s->wordop)
|
||||
{
|
||||
pbuf = s->wordop(buf);
|
||||
stop[s->len] = strdup(pbuf);
|
||||
pfree(pbuf);
|
||||
} else
|
||||
stop[s->len] = strdup(buf);
|
||||
|
||||
if (!stop[s->len])
|
||||
{
|
||||
freestoplist(s);
|
||||
@ -79,8 +85,6 @@ readstoplist(text *in, StopList * s)
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
}
|
||||
if (s->wordop)
|
||||
stop[s->len] = (s->wordop) (stop[s->len]);
|
||||
|
||||
(s->len)++;
|
||||
}
|
||||
@ -106,7 +110,5 @@ sortstoplist(StopList * s)
|
||||
bool
|
||||
searchstoplist(StopList * s, char *key)
|
||||
{
|
||||
if (s->wordop)
|
||||
key = (*(s->wordop)) (key);
|
||||
return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? true : false;
|
||||
}
|
||||
|
@ -14,21 +14,12 @@ wchar2char(char *to, const wchar_t *from, size_t len)
|
||||
{
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
int r,
|
||||
nbytes;
|
||||
int r;
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
/* in any case, *to should be allocated with enough space */
|
||||
nbytes = WideCharToMultiByte(CP_UTF8, 0, from, len, NULL, 0, NULL, NULL);
|
||||
if (nbytes == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("UTF-16 to UTF-8 translation failed: %lu",
|
||||
GetLastError())));
|
||||
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes,
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
|
||||
NULL, NULL);
|
||||
|
||||
if (r == 0)
|
||||
@ -36,6 +27,8 @@ wchar2char(char *to, const wchar_t *from, size_t len)
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("UTF-16 to UTF-8 translation failed: %lu",
|
||||
GetLastError())));
|
||||
Assert(r <= len);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -56,7 +49,7 @@ char2wchar(wchar_t *to, const char *from, size_t len)
|
||||
|
||||
if (!r)
|
||||
{
|
||||
pg_verifymbstr(from, len, false);
|
||||
pg_verifymbstr(from, strlen(from), false);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte character for locale"),
|
||||
@ -97,6 +90,11 @@ char *
|
||||
lowerstr(char *str)
|
||||
{
|
||||
char *ptr = str;
|
||||
char *out;
|
||||
int len = strlen(str);
|
||||
|
||||
if ( len == 0 )
|
||||
return pstrdup("");
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
@ -110,24 +108,67 @@ lowerstr(char *str)
|
||||
{
|
||||
wchar_t *wstr,
|
||||
*wptr;
|
||||
int len = strlen(str);
|
||||
int wlen;
|
||||
|
||||
/*
|
||||
*alloc number of wchar_t for worst case, len contains
|
||||
* number of bytes >= number of characters, and
|
||||
* alloc 1 wchar_t for 0, because wchar2char (really wcstombs)
|
||||
* wants zero-terminated string
|
||||
*/
|
||||
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1));
|
||||
|
||||
/*
|
||||
* str SHOULD be cstring, so wlen contains number
|
||||
* of converted characters
|
||||
*/
|
||||
wlen = char2wchar(wstr, str, len);
|
||||
if ( wlen < 0 )
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("transalation failed from server encoding to wchar_t")));
|
||||
|
||||
Assert(wlen<=len);
|
||||
wstr[wlen] = 0;
|
||||
|
||||
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
|
||||
char2wchar(wstr, str, len + 1);
|
||||
while (*wptr)
|
||||
{
|
||||
*wptr = towlower((wint_t) *wptr);
|
||||
wptr++;
|
||||
}
|
||||
wchar2char(str, wstr, len);
|
||||
|
||||
/*
|
||||
* Alloc result string for worst case + '\0'
|
||||
*/
|
||||
len = sizeof(char)*pg_database_encoding_max_length()*(wlen+1);
|
||||
out = (char*)palloc(len);
|
||||
|
||||
/*
|
||||
* wlen is now the number of bytes, which is always >= the number of characters
|
||||
*/
|
||||
wlen = wchar2char(out, wstr, len);
|
||||
pfree(wstr);
|
||||
|
||||
if ( wlen < 0 )
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("transalation failed from wchar_t to server encoding %d", errno)));
|
||||
Assert(wlen<=len);
|
||||
out[wlen]='\0';
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
char *outptr;
|
||||
|
||||
outptr = out = (char*)palloc( sizeof(char) * (len+1) );
|
||||
while (*ptr)
|
||||
{
|
||||
*ptr = tolower(*(unsigned char *) ptr);
|
||||
*outptr++ = tolower(*(unsigned char *) ptr);
|
||||
ptr++;
|
||||
}
|
||||
return str;
|
||||
*outptr = '\0';
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user