postgresql/contrib/unaccent/unaccent.c
Tom Lane c6b3fb4c53 Fix inadequately-sized output buffer in contrib/unaccent.
The output buffer size in unaccent_lexize() was calculated as input string
length times pg_database_encoding_max_length(), which effectively assumes
that replacement strings aren't more than one character.  While that was
all that we previously documented it to support, the code actually has
always allowed replacement strings of arbitrary length; so if you tried
to make use of longer strings, you were at risk of buffer overrun.  To fix,
use an expansible StringInfo buffer instead of trying to determine the
maximum space needed a-priori.

This would be a security issue if unaccent rules files could be installed
by unprivileged users; but fortunately they can't, so in the back branches
the problem can be labeled as improper configuration by a superuser.
Nonetheless, a memory stomp isn't a nice way of reacting to improper
configuration, so let's back-patch the fix.
2014-07-01 11:23:01 -04:00

372 lines
7.8 KiB
C

/*-------------------------------------------------------------------------
*
* unaccent.c
* Text search unaccent dictionary
*
* Copyright (c) 2009-2010, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.5 2010/02/26 02:00:32 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
PG_MODULE_MAGIC;
/*
* Unaccent dictionary uses uncompressed suffix tree to find a
* character to replace. Each node of tree is an array of
* SuffixChar struct with length = 256 (n-th element of array
* corresponds to byte)
*/
typedef struct SuffixChar
{
struct SuffixChar *nextChar;
char *replaceTo;
int replacelen;
} SuffixChar;
/*
* placeChar - put str into tree's structure, byte by byte.
*/
static SuffixChar *
placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
{
SuffixChar *curnode;
if (!node)
{
node = palloc(sizeof(SuffixChar) * 256);
memset(node, 0, sizeof(SuffixChar) * 256);
}
curnode = node + *str;
if (lenstr == 1)
{
if (curnode->replaceTo)
elog(WARNING, "duplicate TO argument, use first one");
else
{
curnode->replacelen = replacelen;
curnode->replaceTo = palloc(replacelen);
memcpy(curnode->replaceTo, replaceTo, replacelen);
}
}
else
{
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
}
return node;
}
/*
* initSuffixTree - create suffix tree from file. Function converts
* UTF8-encoded file into current encoding.
*/
static SuffixChar *
initSuffixTree(char *filename)
{
SuffixChar *volatile rootSuffixTree = NULL;
MemoryContext ccxt = CurrentMemoryContext;
tsearch_readline_state trst;
volatile bool skip;
filename = get_tsearch_config_filename(filename, "rules");
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not open unaccent file \"%s\": %m",
filename)));
do
{
/*
* pg_do_encoding_conversion() (called by tsearch_readline()) will
* emit exception if it finds untranslatable characters in current
* locale. We just skip such lines, continuing with the next.
*/
skip = true;
PG_TRY();
{
char *line;
while ((line = tsearch_readline(&trst)) != NULL)
{
/*
* The format of each line must be "src trg" where src and trg
* are sequences of one or more non-whitespace characters,
* separated by whitespace. Whitespace at start or end of
* line is ignored.
*/
int state;
char *ptr;
char *src = NULL;
char *trg = NULL;
int ptrlen;
int srclen = 0;
int trglen = 0;
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
ptrlen = pg_mblen(ptr);
/* ignore whitespace, but end src or trg */
if (t_isspace(ptr))
{
if (state == 1)
state = 2;
else if (state == 3)
state = 4;
continue;
}
switch (state)
{
case 0:
/* start of src */
src = ptr;
srclen = ptrlen;
state = 1;
break;
case 1:
/* continue src */
srclen += ptrlen;
break;
case 2:
/* start of trg */
trg = ptr;
trglen = ptrlen;
state = 3;
break;
case 3:
/* continue trg */
trglen += ptrlen;
break;
default:
/* bogus line format */
state = -1;
break;
}
}
if (state >= 3)
rootSuffixTree = placeChar(rootSuffixTree,
(unsigned char *) src, srclen,
trg, trglen);
pfree(line);
}
skip = false;
}
PG_CATCH();
{
ErrorData *errdata;
MemoryContext ecxt;
ecxt = MemoryContextSwitchTo(ccxt);
errdata = CopyErrorData();
if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
{
FlushErrorState();
}
else
{
MemoryContextSwitchTo(ecxt);
PG_RE_THROW();
}
}
PG_END_TRY();
}
while (skip);
tsearch_readline_end(&trst);
return rootSuffixTree;
}
/*
* findReplaceTo - find multibyte character in tree
*/
static SuffixChar *
findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
{
while (node)
{
node = node + *src;
if (srclen == 1)
return node;
src++;
srclen--;
node = node->nextChar;
}
return NULL;
}
PG_FUNCTION_INFO_V1(unaccent_init);
Datum unaccent_init(PG_FUNCTION_ARGS);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
SuffixChar *rootSuffixTree = NULL;
bool fileloaded = false;
ListCell *l;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
if (pg_strcasecmp("Rules", defel->defname) == 0)
{
if (fileloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple Rules parameters")));
rootSuffixTree = initSuffixTree(defGetString(defel));
fileloaded = true;
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized Unaccent parameter: \"%s\"",
defel->defname)));
}
}
if (!fileloaded)
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing Rules parameter")));
}
PG_RETURN_POINTER(rootSuffixTree);
}
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum unaccent_lexize(PG_FUNCTION_ARGS);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
char *srcchar = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
char *srcstart = srcchar;
TSLexeme *res;
StringInfoData buf;
/* we allocate storage for the buffer only if needed */
buf.data = NULL;
while (srcchar - srcstart < len)
{
SuffixChar *node;
int charlen;
charlen = pg_mblen(srcchar);
node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
if (node && node->replaceTo)
{
if (buf.data == NULL)
{
/* initialize buffer */
initStringInfo(&buf);
/* insert any data we already skipped over */
if (srcchar != srcstart)
appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
}
appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
}
else if (buf.data != NULL)
appendBinaryStringInfo(&buf, srcchar, charlen);
srcchar += charlen;
}
/* return a result only if we made at least one substitution */
if (buf.data != NULL)
{
res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
res->lexeme = buf.data;
res->flags = TSL_FILTER;
}
else
res = NULL;
PG_RETURN_POINTER(res);
}
/*
* Function-like wrapper for dictionary
*/
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum unaccent_dict(PG_FUNCTION_ARGS);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
text *str;
int strArg;
Oid dictOid;
TSDictionaryCacheEntry *dict;
TSLexeme *res;
if (PG_NARGS() == 1)
{
dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
strArg = 0;
}
else
{
dictOid = PG_GETARG_OID(0);
strArg = 1;
}
str = PG_GETARG_TEXT_P(strArg);
dict = lookup_ts_dictionary_cache(dictOid);
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
PointerGetDatum(dict->dictData),
PointerGetDatum(VARDATA(str)),
Int32GetDatum(VARSIZE(str) - VARHDRSZ),
PointerGetDatum(NULL)));
PG_FREE_IF_COPY(str, strArg);
if (res == NULL)
{
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
}
else if (res->lexeme == NULL)
{
pfree(res);
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
}
else
{
text *txt = cstring_to_text(res->lexeme);
pfree(res->lexeme);
pfree(res);
PG_RETURN_TEXT_P(txt);
}
}