mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-06 15:24:56 +08:00
Clean up ts_locale.h/.c. Fix broken and not-consistent-across-platforms
behavior of wchar2char/char2wchar; this should resolve bug #3730. Avoid excess computations of pg_mblen in t_isalpha and friends. Const-ify APIs where possible.
This commit is contained in:
parent
83290b678d
commit
654dcfb9e4
@ -1,13 +1,13 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* ts_locale.c
|
||||
* locale compatiblility layer for tsearch
|
||||
* locale compatibility layer for tsearch
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -16,41 +16,56 @@
|
||||
#include "tsearch/ts_locale.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
/*
|
||||
* wchar2char --- convert wide characters to multibyte format
|
||||
*
|
||||
* This has the same API as the standard wcstombs() function; in particular,
|
||||
* tolen is the maximum number of bytes to store at *to, and *from should be
|
||||
* zero-terminated. The output will be zero-terminated iff there is room.
|
||||
*/
|
||||
size_t
|
||||
wchar2char(char *to, const wchar_t *from, size_t len)
|
||||
wchar2char(char *to, const wchar_t *from, size_t tolen)
|
||||
{
|
||||
if (len == 0)
|
||||
if (tolen == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
|
||||
NULL, NULL);
|
||||
|
||||
if (r == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("UTF-16 to UTF-8 translation failed: %lu",
|
||||
GetLastError())));
|
||||
Assert(r <= len);
|
||||
if (r <= 0)
|
||||
return (size_t) -1;
|
||||
|
||||
return r;
|
||||
Assert(r <= tolen);
|
||||
|
||||
/* Microsoft counts the zero terminator in the result */
|
||||
return r-1;
|
||||
}
|
||||
|
||||
return wcstombs(to, from, len);
|
||||
}
|
||||
#endif /* WIN32 */
|
||||
|
||||
return wcstombs(to, from, tolen);
|
||||
}
|
||||
|
||||
/*
|
||||
* char2wchar --- convert multibyte characters to wide characters
|
||||
*
|
||||
* This has almost the API of mbstowcs(), except that *from need not be
|
||||
* null-terminated; instead, the number of input bytes is specified as
|
||||
* fromlen. Also, we ereport() rather than returning -1 for invalid
|
||||
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
|
||||
* The output will be zero-terminated iff there is room.
|
||||
*/
|
||||
size_t
|
||||
char2wchar(wchar_t *to, const char *from, size_t len)
|
||||
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
|
||||
{
|
||||
if (len == 0)
|
||||
if (tolen == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
@ -58,71 +73,117 @@ char2wchar(wchar_t *to, const char *from, size_t len)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen);
|
||||
|
||||
if (!r)
|
||||
if (r <= 0)
|
||||
{
|
||||
pg_verifymbstr(from, len, false);
|
||||
pg_verifymbstr(from, fromlen, false);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte character for locale"),
|
||||
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
|
||||
}
|
||||
|
||||
Assert(r <= len);
|
||||
Assert(r <= tolen);
|
||||
|
||||
return r;
|
||||
/* Microsoft counts the zero terminator in the result */
|
||||
return r-1;
|
||||
}
|
||||
else
|
||||
#endif /* WIN32 */
|
||||
|
||||
if (lc_ctype_is_c())
|
||||
{
|
||||
/*
|
||||
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
|
||||
* allocated with sufficient space
|
||||
*/
|
||||
return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
|
||||
return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* mbstowcs require ending '\0'
|
||||
* mbstowcs requires ending '\0'
|
||||
*/
|
||||
char *str = pnstrdup(from, len);
|
||||
size_t tolen;
|
||||
char *str = pnstrdup(from, fromlen);
|
||||
size_t result;
|
||||
|
||||
result = mbstowcs(to, str, tolen);
|
||||
|
||||
tolen = mbstowcs(to, str, len);
|
||||
pfree(str);
|
||||
|
||||
return tolen;
|
||||
if (result == (size_t) -1)
|
||||
{
|
||||
pg_verifymbstr(from, fromlen, false);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte character for locale"),
|
||||
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
|
||||
}
|
||||
|
||||
if (result < tolen)
|
||||
to[result] = 0;
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_t_isalpha(const char *ptr)
|
||||
t_isdigit(const char *ptr)
|
||||
{
|
||||
int clen = pg_mblen(ptr);
|
||||
wchar_t character[2];
|
||||
|
||||
if (lc_ctype_is_c())
|
||||
if (clen == 1 || lc_ctype_is_c())
|
||||
return isdigit(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, 2, ptr, clen);
|
||||
|
||||
return iswdigit((wint_t) character[0]);
|
||||
}
|
||||
|
||||
int
|
||||
t_isspace(const char *ptr)
|
||||
{
|
||||
int clen = pg_mblen(ptr);
|
||||
wchar_t character[2];
|
||||
|
||||
if (clen == 1 || lc_ctype_is_c())
|
||||
return isspace(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, 2, ptr, clen);
|
||||
|
||||
return iswspace((wint_t) character[0]);
|
||||
}
|
||||
|
||||
int
|
||||
t_isalpha(const char *ptr)
|
||||
{
|
||||
int clen = pg_mblen(ptr);
|
||||
wchar_t character[2];
|
||||
|
||||
if (clen == 1 || lc_ctype_is_c())
|
||||
return isalpha(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, ptr, 1);
|
||||
char2wchar(character, 2, ptr, clen);
|
||||
|
||||
return iswalpha((wint_t) *character);
|
||||
return iswalpha((wint_t) character[0]);
|
||||
}
|
||||
|
||||
int
|
||||
_t_isprint(const char *ptr)
|
||||
t_isprint(const char *ptr)
|
||||
{
|
||||
int clen = pg_mblen(ptr);
|
||||
wchar_t character[2];
|
||||
|
||||
if (lc_ctype_is_c())
|
||||
if (clen == 1 || lc_ctype_is_c())
|
||||
return isprint(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, ptr, 1);
|
||||
char2wchar(character, 2, ptr, clen);
|
||||
|
||||
return iswprint((wint_t) *character);
|
||||
return iswprint((wint_t) character[0]);
|
||||
}
|
||||
|
||||
#endif /* TS_USE_WIDE */
|
||||
|
||||
|
||||
@ -168,19 +229,27 @@ t_readline(FILE *fp)
|
||||
return recoded;
|
||||
}
|
||||
|
||||
/*
|
||||
* lowerstr --- fold null-terminated string to lower case
|
||||
*
|
||||
* Returned string is palloc'd
|
||||
*/
|
||||
char *
|
||||
lowerstr(char *str)
|
||||
lowerstr(const char *str)
|
||||
{
|
||||
return lowerstr_with_len(str, strlen(str));
|
||||
}
|
||||
|
||||
/*
|
||||
* lowerstr_with_len --- fold string to lower case
|
||||
*
|
||||
* Input string need not be null-terminated.
|
||||
*
|
||||
* Returned string is palloc'd
|
||||
*/
|
||||
char *
|
||||
lowerstr_with_len(char *str, int len)
|
||||
lowerstr_with_len(const char *str, int len)
|
||||
{
|
||||
char *ptr = str;
|
||||
char *out;
|
||||
|
||||
if (len == 0)
|
||||
@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
|
||||
|
||||
/*
|
||||
* alloc number of wchar_t for worst case, len contains number of
|
||||
* bytes <= number of characters and alloc 1 wchar_t for 0, because
|
||||
* wchar2char(wcstombs in really) wants zero-terminated string
|
||||
* bytes >= number of characters and alloc 1 wchar_t for 0, because
|
||||
* wchar2char wants zero-terminated string
|
||||
*/
|
||||
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
|
||||
|
||||
/*
|
||||
* str SHOULD be cstring, so wlen contains number of converted
|
||||
* character
|
||||
*/
|
||||
wlen = char2wchar(wstr, str, len);
|
||||
if (wlen < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("translation failed from server encoding to wchar_t")));
|
||||
|
||||
wlen = char2wchar(wstr, len+1, str, len);
|
||||
Assert(wlen <= len);
|
||||
wstr[wlen] = 0;
|
||||
|
||||
while (*wptr)
|
||||
{
|
||||
@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
|
||||
/*
|
||||
* Alloc result string for worst case + '\0'
|
||||
*/
|
||||
len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
|
||||
len = pg_database_encoding_max_length() * wlen + 1;
|
||||
out = (char *) palloc(len);
|
||||
|
||||
/*
|
||||
* wlen now is number of bytes which is always >= number of characters
|
||||
*/
|
||||
wlen = wchar2char(out, wstr, len);
|
||||
|
||||
pfree(wstr);
|
||||
|
||||
if (wlen < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("translation failed from wchar_t to server encoding %d", errno)));
|
||||
Assert(wlen <= len);
|
||||
out[wlen] = '\0';
|
||||
errmsg("translation from wchar_t to server encoding failed: %m")));
|
||||
Assert(wlen < len);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#endif /* TS_USE_WIDE */
|
||||
{
|
||||
const char *ptr = str;
|
||||
char *outptr;
|
||||
|
||||
outptr = out = (char *) palloc(sizeof(char) * (len + 1));
|
||||
while (*ptr && ptr - str < len)
|
||||
while ((ptr - str) < len && *ptr)
|
||||
{
|
||||
*outptr++ = tolower(*(unsigned char *) ptr);
|
||||
*outptr++ = tolower(TOUCHAR(ptr));
|
||||
ptr++;
|
||||
}
|
||||
*outptr = '\0';
|
||||
|
@ -7,7 +7,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)
|
||||
* or palloc a new version.
|
||||
*/
|
||||
void
|
||||
readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
|
||||
readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
|
||||
{
|
||||
char **stop = NULL;
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -294,12 +294,12 @@ TParserInit(char *str, int len)
|
||||
/*
|
||||
* Use wide char code only when max encoding length > 1.
|
||||
*/
|
||||
|
||||
if (prs->charmaxlen > 1)
|
||||
{
|
||||
prs->usewide = true;
|
||||
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
|
||||
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
|
||||
prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
|
||||
prs->str, prs->lenstr);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
@ -1,15 +1,14 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* ts_locale.h
|
||||
* helper utilities for tsearch
|
||||
* locale compatibility layer for tsearch
|
||||
*
|
||||
* Copyright (c) 1998-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.3 2007/11/09 22:37:35 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef __TSLOCALE_H__
|
||||
#define __TSLOCALE_H__
|
||||
|
||||
@ -34,55 +33,37 @@
|
||||
#define TS_USE_WIDE
|
||||
#endif
|
||||
|
||||
#define TOUCHAR(x) (*((unsigned char*)(x)))
|
||||
#define TOUCHAR(x) (*((const unsigned char *) (x)))
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
extern size_t char2wchar(wchar_t *to, const char *from, size_t len);
|
||||
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen);
|
||||
extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen);
|
||||
|
||||
#ifdef WIN32
|
||||
extern int t_isdigit(const char *ptr);
|
||||
extern int t_isspace(const char *ptr);
|
||||
extern int t_isalpha(const char *ptr);
|
||||
extern int t_isprint(const char *ptr);
|
||||
|
||||
extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
|
||||
#else /* WIN32 */
|
||||
/* The second argument of t_iseq() must be a plain ASCII character */
|
||||
#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
|
||||
|
||||
/* correct wcstombs */
|
||||
#define wchar2char wcstombs
|
||||
#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s))
|
||||
|
||||
#endif /* WIN32 */
|
||||
#else /* not TS_USE_WIDE */
|
||||
|
||||
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
|
||||
#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
|
||||
extern int _t_isalpha(const char *ptr);
|
||||
#define t_isdigit(x) isdigit(TOUCHAR(x))
|
||||
#define t_isspace(x) isspace(TOUCHAR(x))
|
||||
#define t_isalpha(x) isalpha(TOUCHAR(x))
|
||||
#define t_isprint(x) isprint(TOUCHAR(x))
|
||||
#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
|
||||
|
||||
#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
|
||||
extern int _t_isprint(const char *ptr);
|
||||
#define COPYCHAR(d,s) (*((unsigned char *) (d)) = TOUCHAR(s))
|
||||
|
||||
#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
|
||||
/*
|
||||
* t_iseq() should be called only for ASCII symbols
|
||||
*/
|
||||
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
|
||||
#endif /* TS_USE_WIDE */
|
||||
|
||||
#define COPYCHAR(d,s) do { \
|
||||
int lll = pg_mblen( s ); \
|
||||
\
|
||||
while( lll-- ) \
|
||||
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
|
||||
} while(0)
|
||||
|
||||
#else /* not def TS_USE_WIDE */
|
||||
|
||||
#define t_isdigit(x) isdigit( TOUCHAR(x) )
|
||||
#define t_isspace(x) isspace( TOUCHAR(x) )
|
||||
#define t_isalpha(x) isalpha( TOUCHAR(x) )
|
||||
#define t_isprint(x) isprint( TOUCHAR(x) )
|
||||
#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) )
|
||||
|
||||
#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s)
|
||||
#endif
|
||||
|
||||
extern char *lowerstr(char *str);
|
||||
extern char *lowerstr_with_len(char *str, int len);
|
||||
extern char *lowerstr(const char *str);
|
||||
extern char *lowerstr_with_len(const char *str, int len);
|
||||
extern char *t_readline(FILE *fp);
|
||||
|
||||
#endif /* __TSLOCALE_H__ */
|
||||
|
@ -6,7 +6,7 @@
|
||||
*
|
||||
* Copyright (c) 1998-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.4 2007/09/07 15:09:56 teodor Exp $
|
||||
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.5 2007/11/09 22:37:35 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -74,7 +74,7 @@ typedef struct
|
||||
} StopList;
|
||||
|
||||
extern void readstoplist(const char *fname, StopList *s,
|
||||
char *(*wordop) (char *));
|
||||
char *(*wordop) (const char *));
|
||||
extern bool searchstoplist(StopList *s, char *key);
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user