postgresql/contrib/tsearch2/dict_thesaurus.c
2006-10-04 00:30:14 +00:00

919 lines
18 KiB
C

/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.6 2006/10/04 00:29:46 momjian Exp $ */
/*
* thesaurus
* Teodor Sigaev <teodor@sigaev.ru>
*/
#include "postgres.h"
#include "executor/spi.h"
#include <ctype.h>
#include "dict.h"
#include "common.h"
#include "ts_locale.h"
/*
 * Temporarily we use TSLexeme.flags for internal bookkeeping: DT_USEASIS
 * marks a substitution word that must be stored as-is, i.e. without being
 * passed through the subdictionary.  It is set in addWrd() and tested (and
 * cleared from the copied lexeme) in compileTheSubstitute().
 */
#define DT_USEASIS 0x1000
/*
 * One occurrence of a sample lexeme inside a thesaurus rule.
 */
typedef struct LexemeInfo
{
uint16 idsubst; /* entry's number in DictThesaurus->subst */
uint16 posinsubst; /* position of the word inside the rule's sample phrase */
uint16 tnvariant; /* total num lexemes in one variant */
/* next occurrence of the same lexeme; chained in compileTheLexeme() */
struct LexemeInfo *nextentry;
/* next candidate in the match list assembled by findVariant() */
struct LexemeInfo *nextvariant;
} LexemeInfo;
/*
 * A sample lexeme together with the list of places where it occurs.
 * The field order matters: findTheLexeme() builds a search key with a
 * positional initializer.
 */
typedef struct
{
/* lexeme text; addCompiledLexeme() stores NULL here for a stop-word */
char *lexeme;
/* head of the occurrence list, linked through LexemeInfo.nextentry */
LexemeInfo *entries;
} TheLexeme;
/*
 * Substitution (right-hand side) of one thesaurus rule.
 */
typedef struct
{
/*
 * addWrd() stores (number of sample words - 1) here; checkMatch() compares
 * it with the current position to detect that the whole sample matched.
 */
uint16 lastlexeme; /* number lexemes to substitute */
uint16 reslen; /* number of entries in res, set by compileTheSubstitute() */
TSLexeme *res; /* prepared substituted result */
} TheSubstitute;
/*
 * Complete state of one thesaurus dictionary, as returned by
 * thesaurus_init() and received back by thesaurus_lexize().
 */
typedef struct
{
/* subdictionary to normalize lexemes */
DictInfo subdict;
/*
 * Array to search lexeme by exact match.  compileTheLexeme() sorts it so
 * that findTheLexeme() can use bsearch().
 */
TheLexeme *wrds;
int nwrds; /* number of used elements in wrds */
int ntwrds; /* number of allocated elements in wrds */
/*
 * Storage of substituted result, n-th element is for n-th expression
 */
TheSubstitute *subst;
/*
 * While the file is being read addWrd() treats nsubst as the allocated
 * size of subst; thesaurusRead() finally sets it to the number of rules.
 */
int nsubst;
} DictThesaurus;
/*
 * Function-manager registration and prototypes for the two entry points
 * defined in this file: dictionary construction and lexizing.
 */
PG_FUNCTION_INFO_V1(thesaurus_init);
Datum thesaurus_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(thesaurus_lexize);
Datum thesaurus_lexize(PG_FUNCTION_ARGS);
/*
 * Release the dictionary descriptor.
 *
 * Only the DictThesaurus structure itself is handed back to the allocator;
 * the wrds and subst arrays it may reference are not touched here.
 */
static void
freeDictThesaurus(DictThesaurus * d)
{
	free(d);
}
/*
 * Append one raw sample word to d->wrds.
 *
 * b .. e		half-open byte range of the word inside the line buffer
 * idsubst		number of the rule the word belongs to
 * posinsubst	position of the word inside the rule's sample phrase
 *
 * The array is grown geometrically (16 slots at first, doubled afterwards).
 * The word text and its single LexemeInfo are malloc'ed copies.
 */
static void
newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
{
	size_t		len = e - b;
	TheLexeme  *slot;
	LexemeInfo *entry;

	/* make room for one more element if the array is full */
	if (d->nwrds >= d->ntwrds)
	{
		if (d->ntwrds != 0)
		{
			d->ntwrds *= 2;
			d->wrds = (TheLexeme *) realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
		}
		else
		{
			d->ntwrds = 16;
			d->wrds = (TheLexeme *) malloc(sizeof(TheLexeme) * d->ntwrds);
		}

		if (d->wrds == NULL)
			elog(ERROR, "Out of memory");
	}

	slot = &d->wrds[d->nwrds];
	d->nwrds++;

	/* private, NUL-terminated copy of the word */
	slot->lexeme = malloc(len + 1);
	if (slot->lexeme == NULL)
		elog(ERROR, "Out of memory");
	memcpy(slot->lexeme, b, len);
	slot->lexeme[len] = '\0';

	/* where the word occurs */
	entry = (LexemeInfo *) malloc(sizeof(LexemeInfo));
	slot->entries = entry;
	if (entry == NULL)
		elog(ERROR, "Out of memory");

	entry->nextentry = NULL;
	entry->idsubst = idsubst;
	entry->posinsubst = posinsubst;
}
/*
 * Append one word of a rule's substitution (right-hand side) to
 * d->subst[idsubst].res.
 *
 * b .. e		half-open byte range of the word inside the line buffer
 * idsubst		number of the rule being read
 * nwrd			index of this word inside the substitution; 0 starts a new
 *				rule and resets the per-rule counters
 * posinsubst	number of sample words of the rule (lastlexeme is set to
 *				posinsubst - 1)
 * useasis		if true the word is flagged DT_USEASIS
 *
 * The res array is always kept terminated by an element whose lexeme is NULL.
 *
 * While the file is read d->nsubst is the allocated size of d->subst.  The
 * array has to grow only when idsubst has reached that size; the previous
 * test (idsubst <= d->nsubst) was inverted and doubled the array for every
 * single rule, making memory use exponential in the number of rules.
 */
static void
addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
{
	/* used / allocated element counts of the res array of the current rule */
	static int	nres = 0;
	static int	ntres = 0;
	TheSubstitute *ptr;

	if (nwrd == 0)
	{
		/* first word of a new rule: start a fresh result array */
		nres = ntres = 0;

		if (idsubst >= d->nsubst)
		{
			if (d->nsubst == 0)
			{
				d->nsubst = 16;
				d->subst = (TheSubstitute *) malloc(sizeof(TheSubstitute) * d->nsubst);
			}
			else
			{
				d->nsubst *= 2;
				d->subst = (TheSubstitute *) realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
			}
			if (!d->subst)
				elog(ERROR, "Out of memory");
		}
	}

	ptr = d->subst + idsubst;

	ptr->lastlexeme = posinsubst - 1;

	/* "+ 1" keeps room for the terminating element */
	if (nres + 1 >= ntres)
	{
		if (ntres == 0)
		{
			ntres = 2;
			ptr->res = (TSLexeme *) malloc(sizeof(TSLexeme) * ntres);
		}
		else
		{
			ntres *= 2;
			ptr->res = (TSLexeme *) realloc(ptr->res, sizeof(TSLexeme) * ntres);
		}

		if (!ptr->res)
			elog(ERROR, "Out of memory");
	}

	if ((ptr->res[nres].lexeme = malloc(e - b + 1)) == 0)
		elog(ERROR, "Out of memory");
	memcpy(ptr->res[nres].lexeme, b, e - b);
	ptr->res[nres].lexeme[e - b] = '\0';

	ptr->res[nres].nvariant = nwrd;
	if (useasis)
		ptr->res[nres].flags = DT_USEASIS;
	else
		ptr->res[nres].flags = 0;

	/* terminate the array */
	ptr->res[++nres].lexeme = NULL;
}
/* States of the per-line parser in thesaurusRead() */
enum
{
	TR_WAITLEX = 1,				/* between sample words, before ':' */
	TR_INLEX = 2,				/* inside a sample word */
	TR_WAITSUBS = 3,			/* after ':', between substitution words */
	TR_INSUBS = 4				/* inside a substitution word */
};
/*
 * Read the thesaurus file and fill d->wrds (sample words) and d->subst
 * (substitutions).
 *
 * Every rule occupies one line of the form
 *
 *		sample word(s) : substitution word(s)
 *
 * A substitution word prefixed with '*' is flagged to be used as-is; a
 * leading '\' is dropped from the word.  Lines that are empty or whose
 * first non-blank character is '#' are skipped.
 *
 * On return d->nsubst holds the number of rules.  Any syntax error is
 * reported with elog(ERROR) after the file has been closed.
 *
 * Changes against the previous version:
 *	- the comment/empty-line test inspects the first non-blank character
 *	  (ptr) instead of the first byte of the buffer, so indented comments
 *	  and whitespace-only lines no longer fail as malformed rules;
 *	- the file is closed on every parse-error path of this function, not
 *	  just on some of them.
 */
static void
thesaurusRead(char *filename, DictThesaurus * d)
{
	FILE	   *fh;
	char		str[BUFSIZ];
	int			lineno = 0;
	uint16		idsubst = 0;
	bool		useasis = false;

	fh = fopen(to_absfilename(filename), "r");
	if (!fh)
		elog(ERROR, "Thesaurus: can't open '%s' file", filename);

	while (fgets(str, sizeof(str), fh))
	{
		char	   *ptr = str;
		int			state = TR_WAITLEX;
		char	   *beginwrd = NULL;
		uint16		posinsubst = 0;		/* sample words seen on this line */
		uint16		nwrd = 0;	/* substitution words seen on this line */

		lineno++;

		/* skip leading whitespace */
		while (t_isspace(ptr))
			ptr += pg_mblen(ptr);

		/* is it comment or an empty line? */
		if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
			continue;

		pg_verifymbstr(ptr, strlen(ptr), false);

		while (*ptr)
		{
			if (state == TR_WAITLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					/* a rule needs at least one sample word */
					if (posinsubst == 0)
					{
						fclose(fh);
						elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
					}
					state = TR_WAITSUBS;
				}
				else if (!t_isspace(ptr))
				{
					beginwrd = ptr;
					state = TR_INLEX;
				}
			}
			else if (state == TR_INLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITSUBS;
				}
				else if (t_isspace(ptr))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITLEX;
				}
			}
			else if (state == TR_WAITSUBS)
			{
				if (t_iseq(ptr, '*'))
				{
					/* "*word": keep the word as-is */
					useasis = true;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (t_iseq(ptr, '\\'))
				{
					/* "\word": the backslash itself is not part of the word */
					useasis = false;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (!t_isspace(ptr))
				{
					useasis = false;
					beginwrd = ptr;
					state = TR_INSUBS;
				}
			}
			else if (state == TR_INSUBS)
			{
				if (t_isspace(ptr))
				{
					/* a lone '*' or '\' leaves an empty word */
					if (ptr == beginwrd)
					{
						fclose(fh);
						elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
					}
					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
					state = TR_WAITSUBS;
				}
			}
			else
			{
				fclose(fh);
				elog(ERROR, "Thesaurus: Unknown state: %d", state);
			}

			ptr += pg_mblen(ptr);
		}

		/* the line ended inside a substitution word: flush it */
		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
			{
				fclose(fh);
				elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
			}
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		/* both sides of the rule must be non-empty */
		if (!(nwrd && posinsubst))
		{
			fclose(fh);
			elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
		}
	}

	d->nsubst = idsubst;

	fclose(fh);
}
/*
 * Append one normalized lexeme to the array being built by
 * compileTheLexeme().
 *
 * newwrds		array under construction (may be moved by realloc)
 * nnw, tnm		used and allocated element counts, updated in place
 * lexeme		lexeme to copy; NULL (or an element with a NULL lexeme)
 *				stores a NULL lexeme with tnvariant forced to 1
 * src			occurrence the rule number and position are copied from
 * tnvariant	number of lexemes in the variant the lexeme belongs to
 *
 * Returns the (possibly relocated) array.
 */
static TheLexeme *
addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant)
{
	TheLexeme  *slot;
	LexemeInfo *entry;

	/* double the array when it is full */
	if (*nnw >= *tnm)
	{
		*tnm *= 2;
		newwrds = (TheLexeme *) realloc(newwrds, sizeof(TheLexeme) * *tnm);
		if (newwrds == NULL)
			elog(ERROR, "Out of memory");
	}

	slot = newwrds + *nnw;

	entry = (LexemeInfo *) malloc(sizeof(LexemeInfo));
	slot->entries = entry;
	if (entry == NULL)
		elog(ERROR, "Out of memory");

	if (lexeme == NULL || lexeme->lexeme == NULL)
	{
		slot->lexeme = NULL;
		entry->tnvariant = 1;
	}
	else
	{
		slot->lexeme = strdup(lexeme->lexeme);
		if (slot->lexeme == NULL)
			elog(ERROR, "Out of memory");

		entry->tnvariant = tnvariant;
	}

	entry->idsubst = src->idsubst;
	entry->posinsubst = src->posinsubst;
	entry->nextentry = NULL;

	*nnw += 1;
	return newwrds;
}
/*
 * Three-way comparison of two occurrences: by rule number, then by position
 * inside the rule, then by variant size.  If either argument is NULL the
 * pair is reported as equal (0).
 */
static int
cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b)
{
	if (a == NULL || b == NULL)
		return 0;

	if (a->idsubst != b->idsubst)
		return (a->idsubst > b->idsubst) ? 1 : -1;

	if (a->posinsubst != b->posinsubst)
		return (a->posinsubst > b->posinsubst) ? 1 : -1;

	if (a->tnvariant != b->tnvariant)
		return (a->tnvariant > b->tnvariant) ? 1 : -1;

	return 0;
}
/*
 * Compare two entries by lexeme text only.  A NULL lexeme sorts after every
 * real string; two NULL lexemes are equal.
 */
static int
cmpLexeme(TheLexeme * a, TheLexeme * b)
{
	if (a->lexeme != NULL && b->lexeme != NULL)
		return strcmp(a->lexeme, b->lexeme);

	if (a->lexeme == NULL && b->lexeme == NULL)
		return 0;

	/* exactly one side is NULL: that side is the greater one */
	return (a->lexeme == NULL) ? 1 : -1;
}
static int
cmpLexemeQ(const void *a, const void *b)
{
return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
}
/*
 * qsort() comparator: order by lexeme text first; entries with the same
 * lexeme are ordered by their first occurrence in reverse (descending)
 * cmpLexemeInfo() order.
 */
static int
cmpTheLexeme(const void *a, const void *b)
{
	TheLexeme  *la = (TheLexeme *) a;
	TheLexeme  *lb = (TheLexeme *) b;
	int			bytext = cmpLexeme(la, lb);

	if (bytext == 0)
		return -cmpLexemeInfo(la->entries, lb->entries);

	return bytext;
}
/*
 * Replace the raw sample words in d->wrds by their normalized form.
 *
 * Every sample word is run through the subdictionary:
 *	- NULL result: the word is unknown, which is an error;
 *	- empty result: the word is a stop-word and is stored with a NULL
 *	  lexeme (a NOTICE is emitted);
 *	- otherwise every returned lexeme is stored, tagged with the number of
 *	  lexemes of the variant it belongs to.
 *
 * The new array is then sorted and duplicates are folded: entries with the
 * same lexeme are merged into one element whose occurrence list is linked
 * through nextentry.  Because equal lexemes are sorted in descending
 * occurrence order and each one is pushed on the list head, the resulting
 * list is in ascending order.
 *
 * Change against the previous version: the result of the final, shrinking
 * realloc() is no longer stored unchecked; if it fails the existing (larger)
 * array is simply kept.
 */
static void
compileTheLexeme(DictThesaurus * d)
{
	int			i,
				nnw = 0,
				tnm = 16;
	TheLexeme  *newwrds = (TheLexeme *) malloc(sizeof(TheLexeme) * tnm),
			   *ptrwrds;

	if (!newwrds)
		elog(ERROR, "Out of memory");

	for (i = 0; i < d->nwrds; i++)
	{
		TSLexeme   *ptr;

		ptr = (TSLexeme *) DatumGetPointer(
										   FunctionCall4(
												   &(d->subdict.lexize_info),
									   PointerGetDatum(d->subdict.dictionary),
										  PointerGetDatum(d->wrds[i].lexeme),
									Int32GetDatum(strlen(d->wrds[i].lexeme)),
													 PointerGetDatum(NULL)
														 )
			);

		if (!(ptr && ptr->lexeme))
		{
			if (!ptr)
				elog(ERROR, "Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
			else
				elog(NOTICE, "Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);

			/* stop-word: stored with a NULL lexeme */
			newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
		}
		else
		{
			/* handle the result one variant (run of equal nvariant) at a time */
			while (ptr->lexeme)
			{
				TSLexeme   *remptr = ptr + 1;
				int			tnvar = 1;
				int			curvar = ptr->nvariant;

				/* compute n words in one variant */
				while (remptr->lexeme)
				{
					if (remptr->nvariant != (remptr - 1)->nvariant)
						break;
					tnvar++;
					remptr++;
				}

				/* store every lexeme of the variant */
				remptr = ptr;
				while (remptr->lexeme && remptr->nvariant == curvar)
				{
					newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
					remptr++;
				}

				ptr = remptr;
			}
		}

		/* the raw word is not needed any more */
		free(d->wrds[i].lexeme);
		free(d->wrds[i].entries);
	}

	free(d->wrds);
	d->wrds = newwrds;
	d->nwrds = nnw;
	d->ntwrds = tnm;

	if (d->nwrds > 1)
	{
		TheLexeme  *shrunk;

		qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);

		/* uniq: newwrds is the last kept element, ptrwrds the one examined */
		newwrds = d->wrds;
		ptrwrds = d->wrds + 1;
		while (ptrwrds - d->wrds < d->nwrds)
		{
			if (cmpLexeme(ptrwrds, newwrds) == 0)
			{
				if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
				{
					/* same lexeme, new occurrence: push on the list head */
					ptrwrds->entries->nextentry = newwrds->entries;
					newwrds->entries = ptrwrds->entries;
				}
				else
					free(ptrwrds->entries);		/* exact duplicate */

				if (ptrwrds->lexeme)
					free(ptrwrds->lexeme);
			}
			else
			{
				newwrds++;
				*newwrds = *ptrwrds;
			}

			ptrwrds++;
		}

		d->nwrds = newwrds - d->wrds + 1;

		/* give back the unused tail; keeping the old block is harmless */
		shrunk = (TheLexeme *) realloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
		if (shrunk)
			d->wrds = shrunk;
	}
}
/*
 * Replace the raw substitution words of every rule (as collected by
 * addWrd()) by the lexemes that will actually be returned on a match.
 *
 * Words flagged DT_USEASIS are copied unchanged (with the flag cleared);
 * all other words are normalized by the subdictionary.  A word the
 * subdictionary does not know is an error, a stop-word is dropped with a
 * NOTICE, and a rule whose substitution ends up empty is an error.
 * On return subst[i].res holds subst[i].reslen malloc'ed lexemes.
 */
static void
compileTheSubstitute(DictThesaurus * d)
{
int i;
for (i = 0; i < d->nsubst; i++)
{
/* rem: raw word list to consume; outptr: next free slot of the new array */
TSLexeme *rem = d->subst[i].res,
*outptr,
*inptr;
int n = 2;
outptr = d->subst[i].res = (TSLexeme *) malloc(sizeof(TSLexeme) * n);
if (d->subst[i].res == NULL)
elog(ERROR, "Out of Memory");
outptr->lexeme = NULL;
inptr = rem;
while (inptr && inptr->lexeme)
{
TSLexeme *lexized,
tmplex[2];
if (inptr->flags & DT_USEASIS)
{ /* do not lexize */
/* fake a one-element, NULL-terminated lexize result */
tmplex[0] = *inptr;
tmplex[0].flags = 0;
tmplex[1].lexeme = NULL;
lexized = tmplex;
}
else
{
lexized = (TSLexeme *) DatumGetPointer(
FunctionCall4(
&(d->subdict.lexize_info),
PointerGetDatum(d->subdict.dictionary),
PointerGetDatum(inptr->lexeme),
Int32GetDatum(strlen(inptr->lexeme)),
PointerGetDatum(NULL)
)
);
}
if (lexized && lexized->lexeme)
{
/*
 * Index of the first lexeme produced for this word, or -1 when this is
 * the first output of the rule; it receives TSL_ADDPOS below.
 * NOTE(review): presumably TSL_ADDPOS makes the consumer advance the
 * word position -- confirm against the flag's definition.
 */
int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
while (lexized->lexeme)
{
/* grow the array; outptr must be re-based because realloc may move it */
if (outptr - d->subst[i].res + 1 >= n)
{
int diff = outptr - d->subst[i].res;
n *= 2;
d->subst[i].res = (TSLexeme *) realloc(d->subst[i].res, sizeof(TSLexeme) * n);
if (d->subst[i].res == NULL)
elog(ERROR, "Out of Memory");
outptr = d->subst[i].res + diff;
}
/* copy the whole element, then give it its own copy of the text */
*outptr = *lexized;
if ((outptr->lexeme = strdup(lexized->lexeme)) == NULL)
elog(ERROR, "Out of Memory");
outptr++;
lexized++;
}
if (toset > 0)
d->subst[i].res[toset].flags |= TSL_ADDPOS;
}
else if (lexized)
{
elog(NOTICE, "Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1);
}
else
{
elog(ERROR, "Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i + 1);
}
/* the raw word has been consumed */
if (inptr->lexeme)
free(inptr->lexeme);
inptr++;
}
/* nothing was emitted for this rule */
if (outptr == d->subst[i].res)
elog(ERROR, "Thesaurus: all words in subsitution are stop word (rule %d)", i + 1);
d->subst[i].reslen = outptr - d->subst[i].res;
free(rem);
}
}
/*
 * Build a thesaurus dictionary from its configuration string.
 *
 * Argument 0 is a text value parsed by parse_cfgdict() into key/value pairs.
 * Recognized keys (case-insensitive):
 *		DictFile	- thesaurus file, read by thesaurusRead()
 *		Dictionary	- name of the subdictionary used for normalization
 * Both keys are mandatory and may be given only once; any other key is an
 * error.  Returns a pointer to the malloc'ed DictThesaurus.
 *
 * Change against the previous version: the "file isn't defined" and
 * "SubDictionary isn't defined" errors now release the descriptor first,
 * like every other configuration error in this function.
 */
Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
	DictThesaurus *d;
	Map		   *cfg,
			   *pcfg;
	text	   *in,
			   *subdictname = NULL;
	bool		fileloaded = false;

	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("Thesaurus confguration error")));

	d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
	if (!d)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	memset(d, 0, sizeof(DictThesaurus));

	in = PG_GETARG_TEXT_P(0);
	parse_cfgdict(in, &cfg);
	PG_FREE_IF_COPY(in, 0);

	/* walk the key/value list; it ends with an element whose key is NULL */
	pcfg = cfg;
	while (pcfg->key)
	{
		if (pg_strcasecmp("DictFile", pcfg->key) == 0)
		{
			if (fileloaded)
			{
				freeDictThesaurus(d);
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("Thesaurus file is already loaded")));
			}
			fileloaded = true;
			thesaurusRead(pcfg->value, d);
		}
		else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
		{
			if (subdictname)
			{
				freeDictThesaurus(d);
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("Thesaurus: SubDictionary is already defined")));
			}
			subdictname = char2text(pcfg->value);
		}
		else
		{
			freeDictThesaurus(d);
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("unrecognized option: %s => %s",
							pcfg->key, pcfg->value)));
		}
		pfree(pcfg->key);
		pfree(pcfg->value);
		pcfg++;
	}
	pfree(cfg);

	if (!fileloaded)
	{
		freeDictThesaurus(d);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("Thesaurus file isn't defined")));
	}

	if (subdictname)
	{
		DictInfo   *subdictptr;

		/*
		 * we already in SPI, but name2id_dict()/finddict() invoke
		 * SPI_connect()
		 */
		SPI_push();

		subdictptr = finddict(name2id_dict(subdictname));

		SPI_pop();

		/* keep a private copy of the subdictionary descriptor */
		d->subdict = *subdictptr;
	}
	else
	{
		freeDictThesaurus(d);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("Thesaurus: SubDictionary isn't defined")));
	}

	/* normalize both sides of every rule with the subdictionary */
	compileTheLexeme(d);
	compileTheSubstitute(d);

	PG_RETURN_POINTER(d);
}
/*
 * Look a lexeme up in the sorted d->wrds array.
 *
 * Returns the head of the lexeme's occurrence list, or NULL if the lexeme
 * is not a sample word (or the dictionary is empty).  A NULL lexeme may be
 * passed; cmpLexeme() treats it as an ordinary key value.
 */
static LexemeInfo *
findTheLexeme(DictThesaurus * d, char *lexeme)
{
	TheLexeme	probe;
	TheLexeme  *hit;

	probe.lexeme = lexeme;
	probe.entries = NULL;

	if (d->nwrds == 0)
		return NULL;

	hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

	return (hit != NULL) ? hit->entries : NULL;
}
/*
 * Does the nextvariant-linked list "stored" contain rule idsubst?
 * An empty list (NULL) matches every rule.
 */
static bool
matchIdSubst(LexemeInfo * stored, uint16 idsubst)
{
	LexemeInfo *cur;

	if (stored == NULL)
		return true;

	for (cur = stored; cur != NULL; cur = cur->nextvariant)
	{
		if (cur->idsubst == idsubst)
			return true;
	}

	return false;
}
/*
 * Find the rules that can accept one variant of the current word at
 * position curpos and add them to the candidate list.
 *
 * in		candidates collected so far (linked through nextvariant)
 * stored	candidates kept from the previous word; NULL accepts any rule
 * curpos	position the current word must have inside the rule
 * newin	array of newn occurrence lists (linked through nextentry), one per
 *			lexeme of the variant; the array elements are advanced in place
 * newn		number of lexemes in the variant
 *
 * A rule qualifies when every list has an occurrence with that rule number,
 * posinsubst == curpos and tnvariant == newn.  The lists are walked forward
 * together by increasing idsubst.  Returns the (possibly extended)
 * candidate list once any of the lists is exhausted.
 */
static LexemeInfo *
findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn)
{
for (;;)
{
int i;
/* ptr: occurrence whose idsubst is the rule currently being tested */
LexemeInfo *ptr = newin[0];
for (i = 0; i < newn; i++)
{
/* skip occurrences that belong to earlier rules */
while (newin[i] && newin[i]->idsubst < ptr->idsubst)
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
/* this list is already past the tested rule: retest from list 0 */
if (newin[i]->idsubst > ptr->idsubst)
{
ptr = newin[i];
i = -1;
continue;
}
/* within the tested rule look for the right position and variant size */
while (newin[i]->idsubst == ptr->idsubst)
{
if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
{
ptr = newin[i];
break;
}
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
}
/* no suitable occurrence in that rule: retest with the next rule */
if (newin[i]->idsubst != ptr->idsubst)
{
ptr = newin[i];
i = -1;
continue;
}
}
/*
 * All lists agree on the rule.  Accept it if the previous word allows it
 * and it is not in the candidate list yet.
 */
if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
{ /* found */
ptr->nextvariant = in;
in = ptr;
}
/* step forward */
for (i = 0; i < newn; i++)
newin[i] = newin[i]->nextentry;
}
/* not reached: the loop above only leaves through "return in" */
return NULL;
}
/*
 * Make a palloc'ed copy of a rule's substitution: reslen elements, each with
 * its own copy of the lexeme text, followed by an element whose lexeme is
 * NULL.
 */
static TSLexeme *
copyTSLexeme(TheSubstitute * ts)
{
	uint16		count = ts->reslen;
	TSLexeme   *copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (count + 1));
	TSLexeme   *dst = copy;
	TSLexeme   *src = ts->res;
	uint16		left;

	for (left = count; left > 0; left--)
	{
		*dst = *src;
		dst->lexeme = pstrdup(src->lexeme);
		dst++;
		src++;
	}

	/* terminator */
	dst->lexeme = NULL;

	return copy;
}
/*
 * Check whether one of the candidate rules is completely matched at curpos.
 *
 * Walks the nextvariant-linked candidate list and returns a copy of the
 * substitution of the first rule whose last sample position equals curpos,
 * or NULL when there is none.  *moreres is set to true if any visited
 * candidate has a successor in the list, false otherwise.
 */
static TSLexeme *
checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres)
{
	LexemeInfo *cand;

	*moreres = false;

	for (cand = info; cand != NULL; cand = cand->nextvariant)
	{
		Assert(cand->idsubst < d->nsubst);

		if (cand->nextvariant != NULL)
			*moreres = true;

		if (d->subst[cand->idsubst].lastlexeme == curpos)
			return copyTSLexeme(&d->subst[cand->idsubst]);
	}

	return NULL;
}
Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
TSLexeme *res = NULL;
LexemeInfo *stored,
*info = NULL;
uint16 curpos = 0;
bool moreres = false;
if (dstate == NULL || PG_NARGS() < 4)
elog(ERROR, "Forbidden call of thesaurus or nested call");
if (dstate->isend)
PG_RETURN_POINTER(NULL);
stored = (LexemeInfo *) dstate->private;
if (stored)
curpos = stored->posinsubst + 1;
res = (TSLexeme *) DatumGetPointer(
FunctionCall4(
&(d->subdict.lexize_info),
PointerGetDatum(d->subdict.dictionary),
PG_GETARG_DATUM(1),
PG_GETARG_INT32(2),
PointerGetDatum(NULL)
)
);
if (res && res->lexeme)
{
TSLexeme *ptr = res,
*basevar;
while (ptr->lexeme)
{
uint16 nv = ptr->nvariant;
uint16 i,
nlex = 0;
LexemeInfo **infos;
basevar = ptr;
while (ptr->lexeme && nv == ptr->nvariant)
{
nlex++;
ptr++;
}
infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
for (i = 0; i < nlex; i++)
if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
break;
if (i < nlex)
{
/* no chance to find */
pfree(infos);
continue;
}
info = findVariant(info, stored, curpos, infos, nlex);
}
}
else if (res)
{ /* stop-word */
LexemeInfo *infos = findTheLexeme(d, NULL);
info = findVariant(NULL, stored, curpos, &infos, 1);
}
else
{
info = NULL; /* word isn't recognized */
}
dstate->private = (void *) info;
if (!info)
{
dstate->getnext = false;
PG_RETURN_POINTER(NULL);
}
if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
{
dstate->getnext = moreres;
PG_RETURN_POINTER(res);
}
dstate->getnext = true;
PG_RETURN_POINTER(NULL);
}