2006-06-07 00:25:55 +08:00
|
|
|
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* thesaurus
|
|
|
|
* Teodor Sigaev <teodor@sigaev.ru>
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "executor/spi.h"
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
|
|
|
|
#include "dict.h"
|
|
|
|
#include "common.h"
|
|
|
|
#include "ts_locale.h"
|
|
|
|
|
2006-06-07 00:25:55 +08:00
|
|
|
/*
 * Temporarily we use TSLexeme.flags for internal use: addWrd() stores
 * DT_USEASIS on a substitution word that must not be passed through the
 * subdictionary, and compileTheSubstitute() tests it and then clears the
 * flags of the copied lexeme.
 */
#define DT_USEASIS		0x1000
|
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
typedef struct LexemeInfo {
|
|
|
|
uint16 idsubst; /* entry's number in DictThesaurus->subst */
|
|
|
|
uint16 posinsubst; /* pos info in entry */
|
|
|
|
uint16 tnvariant; /* total num lexemes in one variant */
|
|
|
|
struct LexemeInfo *nextentry;
|
|
|
|
struct LexemeInfo *nextvariant;
|
|
|
|
} LexemeInfo;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
char *lexeme;
|
|
|
|
LexemeInfo *entries;
|
|
|
|
} TheLexeme;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
uint16 lastlexeme; /* number lexemes to substitute */
|
|
|
|
uint16 reslen;
|
|
|
|
TSLexeme *res; /* prepared substituted result */
|
|
|
|
} TheSubstitute;
|
|
|
|
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
/* subdictionary to normalize lexemes */
|
|
|
|
DictInfo subdict;
|
|
|
|
|
|
|
|
/* Array to search lexeme by exact match */
|
|
|
|
TheLexeme *wrds;
|
|
|
|
int nwrds;
|
|
|
|
int ntwrds;
|
|
|
|
|
|
|
|
/* Storage of substituted result, n-th element is for
|
|
|
|
n-th expression */
|
|
|
|
TheSubstitute *subst;
|
|
|
|
int nsubst;
|
|
|
|
} DictThesaurus;
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(thesaurus_init);
|
|
|
|
Datum thesaurus_init(PG_FUNCTION_ARGS);
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(thesaurus_lexize);
|
|
|
|
Datum thesaurus_lexize(PG_FUNCTION_ARGS);
|
|
|
|
|
|
|
|
static void
|
|
|
|
freeDictThesaurus(DictThesaurus * d)
|
|
|
|
{
|
|
|
|
free(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) {
|
|
|
|
TheLexeme *ptr;
|
|
|
|
|
|
|
|
if ( d->nwrds >= d->ntwrds ) {
|
|
|
|
if ( d->ntwrds == 0 ) {
|
|
|
|
d->ntwrds = 16;
|
|
|
|
d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds);
|
|
|
|
} else {
|
|
|
|
d->ntwrds *= 2;
|
|
|
|
d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
|
|
|
|
}
|
|
|
|
if (!d->wrds)
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
}
|
|
|
|
|
|
|
|
ptr = d->wrds + d->nwrds;
|
|
|
|
d->nwrds++;
|
|
|
|
|
|
|
|
if ( (ptr->lexeme = malloc(e-b+1)) == NULL )
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
|
|
|
|
memcpy(ptr->lexeme, b, e-b);
|
|
|
|
ptr->lexeme[e-b] = '\0';
|
|
|
|
|
|
|
|
if ( (ptr->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ))==NULL )
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
|
|
|
|
ptr->entries->nextentry=NULL;
|
|
|
|
ptr->entries->idsubst = idsubst;
|
|
|
|
ptr->entries->posinsubst = posinsubst;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-06-07 00:25:55 +08:00
|
|
|
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) {
|
2006-05-31 22:05:31 +08:00
|
|
|
static int nres=0;
|
|
|
|
static int ntres = 0;
|
|
|
|
TheSubstitute *ptr;
|
|
|
|
|
|
|
|
if ( nwrd == 0 ) {
|
|
|
|
nres = ntres = 0;
|
|
|
|
|
|
|
|
if ( idsubst <= d->nsubst ) {
|
|
|
|
if ( d->nsubst == 0 ) {
|
|
|
|
d->nsubst = 16;
|
|
|
|
d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst);
|
|
|
|
} else {
|
|
|
|
d->nsubst *= 2;
|
|
|
|
d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
|
|
|
|
}
|
|
|
|
if (!d->subst)
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ptr = d->subst + idsubst;
|
|
|
|
|
|
|
|
ptr->lastlexeme = posinsubst-1;
|
|
|
|
|
|
|
|
if ( nres+1 >= ntres ) {
|
|
|
|
if ( ntres == 0 ) {
|
|
|
|
ntres = 2;
|
|
|
|
ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres );
|
|
|
|
} else {
|
|
|
|
ntres *= 2;
|
|
|
|
ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres );
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( !ptr->res )
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 )
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
memcpy(ptr->res[ nres ].lexeme, b, e-b);
|
|
|
|
ptr->res[ nres ].lexeme[e-b] = '\0';
|
|
|
|
|
|
|
|
ptr->res[ nres ].nvariant = nwrd;
|
2006-06-07 00:25:55 +08:00
|
|
|
if ( useasis )
|
|
|
|
ptr->res[ nres ].flags = DT_USEASIS;
|
|
|
|
else
|
|
|
|
ptr->res[ nres ].flags = 0;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
ptr->res[ ++nres ].lexeme = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* States of the line parser in thesaurusRead() */
#define TR_WAITLEX	1			/* before ':', between sample words */
#define TR_INLEX	2			/* before ':', inside a sample word */
#define TR_WAITSUBS 3			/* after ':', between substitution words */
#define TR_INSUBS	4			/* after ':', inside a substitution word */
|
|
|
|
|
|
|
|
static void
|
|
|
|
thesaurusRead( char *filename, DictThesaurus *d ) {
|
|
|
|
FILE *fh;
|
|
|
|
char str[BUFSIZ];
|
|
|
|
int lineno=0;
|
|
|
|
uint16 idsubst = 0;
|
2006-06-07 00:25:55 +08:00
|
|
|
bool useasis=false;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
fh = fopen(to_absfilename(filename), "r");
|
|
|
|
if (!fh)
|
|
|
|
elog(ERROR,"Thesaurus: can't open '%s' file", filename);
|
|
|
|
|
|
|
|
while( fgets(str, sizeof(str), fh)) {
|
|
|
|
char *ptr = str;
|
|
|
|
int state = TR_WAITLEX;
|
|
|
|
char *beginwrd = NULL;
|
|
|
|
uint16 posinsubst=0;
|
|
|
|
uint16 nwrd=0;
|
|
|
|
|
|
|
|
lineno++;
|
|
|
|
|
|
|
|
/* is it comment ? */
|
|
|
|
while( t_isspace(ptr) )
|
|
|
|
ptr += pg_mblen(ptr);
|
|
|
|
if ( t_iseq(str, '#') || *str=='\0' || t_iseq(str, '\n') || t_iseq(str, '\r') )
|
|
|
|
continue;
|
|
|
|
|
|
|
|
pg_verifymbstr(ptr, strlen(ptr), false);
|
|
|
|
while(*ptr) {
|
|
|
|
if ( state == TR_WAITLEX ) {
|
|
|
|
if ( t_iseq(ptr, ':' ) ) {
|
|
|
|
if ( posinsubst == 0 ) {
|
|
|
|
fclose(fh);
|
|
|
|
elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
|
|
|
|
}
|
|
|
|
state = TR_WAITSUBS;
|
|
|
|
} else if ( !t_isspace(ptr) ) {
|
|
|
|
beginwrd = ptr;
|
|
|
|
state = TR_INLEX;
|
|
|
|
}
|
|
|
|
} else if ( state == TR_INLEX ) {
|
|
|
|
if ( t_iseq(ptr, ':') ) {
|
|
|
|
newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
|
|
|
|
state = TR_WAITSUBS;
|
|
|
|
} else if ( t_isspace(ptr) ) {
|
|
|
|
newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
|
|
|
|
state = TR_WAITLEX;
|
|
|
|
}
|
|
|
|
} else if ( state == TR_WAITSUBS ) {
|
2006-06-07 00:25:55 +08:00
|
|
|
if ( t_iseq(ptr, '*') ) {
|
|
|
|
useasis = true;
|
|
|
|
state = TR_INSUBS;
|
|
|
|
beginwrd = ptr + pg_mblen(ptr);
|
|
|
|
} else if ( t_iseq(ptr, '\\') ) {
|
|
|
|
useasis = false;
|
|
|
|
state = TR_INSUBS;
|
|
|
|
beginwrd = ptr + pg_mblen(ptr);
|
|
|
|
} else if ( !t_isspace(ptr) ) {
|
|
|
|
useasis = false;
|
2006-05-31 22:05:31 +08:00
|
|
|
beginwrd = ptr;
|
|
|
|
state = TR_INSUBS;
|
|
|
|
}
|
|
|
|
} else if ( state == TR_INSUBS ) {
|
|
|
|
if ( t_isspace(ptr) ) {
|
2006-06-07 00:25:55 +08:00
|
|
|
if ( ptr == beginwrd )
|
|
|
|
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
|
|
|
|
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
|
2006-05-31 22:05:31 +08:00
|
|
|
state = TR_WAITSUBS;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
elog(ERROR,"Thesaurus: Unknown state: %d", state);
|
|
|
|
|
|
|
|
ptr += pg_mblen(ptr);
|
|
|
|
}
|
|
|
|
|
2006-06-07 00:25:55 +08:00
|
|
|
if ( state == TR_INSUBS ) {
|
|
|
|
if ( ptr == beginwrd )
|
|
|
|
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
|
|
|
|
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
|
|
|
|
}
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
idsubst++;
|
|
|
|
|
|
|
|
if ( !(nwrd && posinsubst) ) {
|
|
|
|
fclose(fh);
|
|
|
|
elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
d->nsubst = idsubst;
|
|
|
|
|
|
|
|
fclose(fh);
|
|
|
|
}
|
|
|
|
|
|
|
|
static TheLexeme*
|
|
|
|
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) {
|
|
|
|
|
|
|
|
if ( *nnw >= *tnm ) {
|
|
|
|
*tnm *= 2;
|
|
|
|
newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm);
|
|
|
|
if (!newwrds)
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
}
|
|
|
|
|
|
|
|
newwrds[ *nnw ].entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
|
|
|
|
if (!newwrds[ *nnw ].entries)
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
|
|
|
|
if ( lexeme && lexeme->lexeme ) {
|
|
|
|
newwrds[ *nnw ].lexeme = strdup( lexeme->lexeme );
|
|
|
|
if ( !newwrds[ *nnw ].lexeme )
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
|
|
|
|
newwrds[ *nnw ].entries->tnvariant = tnvariant;
|
|
|
|
} else {
|
|
|
|
newwrds[ *nnw ].lexeme = NULL;
|
|
|
|
newwrds[ *nnw ].entries->tnvariant = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
newwrds[ *nnw ].entries->idsubst = src->idsubst;
|
|
|
|
newwrds[ *nnw ].entries->posinsubst = src->posinsubst;
|
|
|
|
|
|
|
|
newwrds[ *nnw ].entries->nextentry = NULL;
|
|
|
|
|
|
|
|
(*nnw)++;
|
|
|
|
return newwrds;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) {
|
|
|
|
if ( a==NULL || b==NULL )
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if ( a->idsubst == b->idsubst ) {
|
|
|
|
if ( a->posinsubst == b->posinsubst ) {
|
|
|
|
if ( a->tnvariant == b->tnvariant )
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return ( a->tnvariant > b->tnvariant ) ? 1 : -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( a->posinsubst > b->posinsubst ) ? 1 : -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( a->idsubst > b->idsubst ) ? 1 : -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
cmpLexeme(TheLexeme *a, TheLexeme* b) {
|
|
|
|
if ( a->lexeme == NULL ) {
|
|
|
|
if ( b->lexeme == NULL )
|
|
|
|
return 0;
|
|
|
|
else
|
|
|
|
return 1;
|
|
|
|
} else if ( b->lexeme == NULL )
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
return strcmp( a->lexeme, b->lexeme );
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
cmpLexemeQ(const void *a, const void *b) {
|
|
|
|
return cmpLexeme( (TheLexeme*)a, (TheLexeme*)b );
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cmpTheLexeme(const void *a, const void *b) {
|
|
|
|
TheLexeme *la = (TheLexeme*)a;
|
|
|
|
TheLexeme *lb = (TheLexeme*)b;
|
|
|
|
int res;
|
|
|
|
|
|
|
|
if ( (res=cmpLexeme(la, lb)) != 0 )
|
|
|
|
return res;
|
|
|
|
|
|
|
|
return -cmpLexemeInfo(la->entries, lb->entries);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
compileTheLexeme(DictThesaurus *d) {
|
|
|
|
int i,nnw=0, tnm=16;
|
|
|
|
TheLexeme *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;
|
|
|
|
|
|
|
|
if (!newwrds)
|
|
|
|
elog(ERROR,"Out of memory");
|
|
|
|
|
|
|
|
for(i=0;i<d->nwrds;i++) {
|
2006-06-07 00:25:55 +08:00
|
|
|
TSLexeme *ptr;
|
|
|
|
|
|
|
|
ptr = (TSLexeme*) DatumGetPointer(
|
2006-05-31 22:05:31 +08:00
|
|
|
FunctionCall4(
|
|
|
|
&(d->subdict.lexize_info),
|
|
|
|
PointerGetDatum(d->subdict.dictionary),
|
|
|
|
PointerGetDatum(d->wrds[i].lexeme),
|
|
|
|
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
|
|
|
PointerGetDatum(NULL)
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
|
|
|
if ( !(ptr && ptr->lexeme) ) {
|
2006-06-02 23:35:42 +08:00
|
|
|
if ( !ptr )
|
2006-06-07 00:25:55 +08:00
|
|
|
elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
|
|
|
|
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1 );
|
2006-06-02 23:35:42 +08:00
|
|
|
else
|
2006-06-07 00:25:55 +08:00
|
|
|
elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
|
|
|
|
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1);
|
2006-06-02 23:35:42 +08:00
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
|
|
|
} else {
|
|
|
|
while( ptr->lexeme ) {
|
|
|
|
TSLexeme *remptr = ptr+1;
|
|
|
|
int tnvar = 1;
|
|
|
|
int curvar = ptr->nvariant;
|
|
|
|
|
|
|
|
/* compute n words in one variant */
|
|
|
|
while( remptr->lexeme ) {
|
|
|
|
if ( remptr->nvariant != (remptr-1)->nvariant )
|
|
|
|
break;
|
|
|
|
tnvar++;
|
|
|
|
remptr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
remptr = ptr;
|
|
|
|
while( remptr->lexeme && remptr->nvariant == curvar ) {
|
|
|
|
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
|
|
|
remptr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ptr = remptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
free( d->wrds[i].lexeme );
|
|
|
|
free( d->wrds[i].entries );
|
|
|
|
}
|
|
|
|
|
|
|
|
free( d->wrds );
|
|
|
|
d->wrds = newwrds;
|
|
|
|
d->nwrds = nnw;
|
|
|
|
d->ntwrds = tnm;
|
|
|
|
|
|
|
|
if ( d->nwrds > 1 ) {
|
|
|
|
qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme );
|
|
|
|
|
|
|
|
/* uniq */
|
|
|
|
newwrds = d->wrds;
|
|
|
|
ptrwrds = d->wrds + 1;
|
|
|
|
while( ptrwrds - d->wrds < d->nwrds ) {
|
|
|
|
if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
|
|
|
|
if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
|
|
|
|
ptrwrds->entries->nextentry = newwrds->entries;
|
|
|
|
newwrds->entries = ptrwrds->entries;
|
|
|
|
} else
|
|
|
|
free( ptrwrds->entries );
|
|
|
|
|
|
|
|
if ( ptrwrds->lexeme )
|
|
|
|
free( ptrwrds->lexeme );
|
|
|
|
} else {
|
|
|
|
newwrds++;
|
|
|
|
*newwrds = *ptrwrds;
|
|
|
|
}
|
|
|
|
|
|
|
|
ptrwrds++;
|
|
|
|
}
|
|
|
|
|
|
|
|
d->nwrds = newwrds - d->wrds + 1;
|
|
|
|
d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
compileTheSubstitute(DictThesaurus *d) {
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for(i=0;i<d->nsubst;i++) {
|
|
|
|
TSLexeme *rem = d->subst[i].res, *outptr, *inptr;
|
|
|
|
int n=2;
|
|
|
|
|
|
|
|
outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
|
|
|
|
if ( d->subst[i].res == NULL )
|
|
|
|
elog(ERROR,"Out of Memory");
|
|
|
|
outptr->lexeme = NULL;
|
|
|
|
inptr = rem;
|
|
|
|
|
|
|
|
while( inptr && inptr->lexeme ) {
|
2006-06-07 00:25:55 +08:00
|
|
|
TSLexeme *lexized, tmplex[2];
|
|
|
|
|
|
|
|
if ( inptr->flags & DT_USEASIS ) { /* do not lexize */
|
|
|
|
tmplex[0] = *inptr;
|
|
|
|
tmplex[0].flags = 0;
|
|
|
|
tmplex[1].lexeme = NULL;
|
|
|
|
lexized = tmplex;
|
|
|
|
} else {
|
|
|
|
lexized = (TSLexeme*) DatumGetPointer(
|
|
|
|
FunctionCall4(
|
|
|
|
&(d->subdict.lexize_info),
|
|
|
|
PointerGetDatum(d->subdict.dictionary),
|
|
|
|
PointerGetDatum(inptr->lexeme),
|
|
|
|
Int32GetDatum(strlen(inptr->lexeme)),
|
|
|
|
PointerGetDatum(NULL)
|
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-06-02 23:35:42 +08:00
|
|
|
if ( lexized && lexized->lexeme ) {
|
2006-05-31 22:05:31 +08:00
|
|
|
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
|
|
|
|
|
|
|
|
while( lexized->lexeme ) {
|
|
|
|
if ( outptr - d->subst[i].res + 1 >= n ) {
|
|
|
|
int diff = outptr - d->subst[i].res;
|
|
|
|
n *= 2;
|
|
|
|
d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
|
|
|
|
if ( d->subst[i].res == NULL )
|
|
|
|
elog(ERROR,"Out of Memory");
|
|
|
|
outptr = d->subst[i].res + diff;
|
|
|
|
}
|
|
|
|
|
|
|
|
*outptr = *lexized;
|
|
|
|
if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
|
|
|
|
elog(ERROR,"Out of Memory");
|
|
|
|
|
|
|
|
outptr++;
|
|
|
|
lexized++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( toset > 0)
|
|
|
|
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
2006-06-07 00:25:55 +08:00
|
|
|
} else if ( lexized ) {
|
|
|
|
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1);
|
2006-06-02 23:35:42 +08:00
|
|
|
} else {
|
2006-06-07 00:25:55 +08:00
|
|
|
elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1);
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if ( inptr->lexeme )
|
|
|
|
free( inptr->lexeme );
|
|
|
|
inptr++;
|
|
|
|
}
|
|
|
|
|
2006-06-02 23:35:42 +08:00
|
|
|
if ( outptr == d->subst[i].res )
|
2006-06-07 00:25:55 +08:00
|
|
|
elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1);
|
2006-06-02 23:35:42 +08:00
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
d->subst[i].reslen = outptr - d->subst[i].res;
|
|
|
|
|
|
|
|
free(rem);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
thesaurus_init(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
DictThesaurus *d;
|
|
|
|
Map *cfg,
|
|
|
|
*pcfg;
|
|
|
|
text *in, *subdictname=NULL;
|
|
|
|
bool fileloaded = false;
|
|
|
|
|
|
|
|
if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
|
|
errmsg("Thesaurus confguration error")));
|
|
|
|
|
|
|
|
d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
|
|
|
|
if (!d)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("out of memory")));
|
|
|
|
memset(d, 0, sizeof(DictThesaurus));
|
|
|
|
|
|
|
|
in = PG_GETARG_TEXT_P(0);
|
|
|
|
parse_cfgdict(in, &cfg);
|
|
|
|
PG_FREE_IF_COPY(in, 0);
|
|
|
|
pcfg = cfg;
|
|
|
|
while (pcfg->key)
|
|
|
|
{
|
|
|
|
if (pg_strcasecmp("DictFile", pcfg->key) == 0)
|
|
|
|
{
|
|
|
|
if (fileloaded)
|
|
|
|
{
|
|
|
|
freeDictThesaurus(d);
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
|
|
errmsg("Thesaurus file is already loaded")));
|
|
|
|
}
|
|
|
|
fileloaded = true;
|
|
|
|
thesaurusRead( pcfg->value, d );
|
|
|
|
}
|
|
|
|
else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
|
|
|
|
{
|
|
|
|
if (subdictname)
|
|
|
|
{
|
|
|
|
freeDictThesaurus(d);
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
|
|
errmsg("Thesaurus: SubDictionary is already defined")));
|
|
|
|
}
|
|
|
|
subdictname = char2text( pcfg->value );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
freeDictThesaurus(d);
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
|
|
errmsg("unrecognized option: %s => %s",
|
|
|
|
pcfg->key, pcfg->value)));
|
|
|
|
}
|
|
|
|
pfree(pcfg->key);
|
|
|
|
pfree(pcfg->value);
|
|
|
|
pcfg++;
|
|
|
|
}
|
|
|
|
pfree(cfg);
|
|
|
|
|
|
|
|
if (!fileloaded)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
|
|
errmsg("Thesaurus file isn't defined")));
|
|
|
|
|
|
|
|
if ( subdictname ) {
|
|
|
|
DictInfo *subdictptr;
|
|
|
|
/*
|
|
|
|
* we already in SPI, but name2id_dict()/finddict()
|
|
|
|
* invoke SPI_connect()
|
|
|
|
*/
|
|
|
|
SPI_push();
|
|
|
|
|
|
|
|
subdictptr = finddict( name2id_dict( subdictname ) );
|
|
|
|
|
|
|
|
SPI_pop();
|
|
|
|
|
|
|
|
d->subdict = *subdictptr;
|
|
|
|
} else
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
|
|
errmsg("Thesaurus: SubDictionary isn't defined")));
|
|
|
|
|
|
|
|
compileTheLexeme( d );
|
|
|
|
compileTheSubstitute(d);
|
|
|
|
|
|
|
|
PG_RETURN_POINTER(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
static LexemeInfo*
|
|
|
|
findTheLexeme(DictThesaurus *d, char * lexeme) {
|
|
|
|
TheLexeme key = { lexeme, NULL }, *res;
|
|
|
|
|
|
|
|
if ( d->nwrds == 0 )
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
|
|
|
|
|
|
|
|
if ( res == NULL )
|
|
|
|
return NULL;
|
|
|
|
return res->entries;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
matchIdSubst(LexemeInfo *stored, uint16 idsubst) {
|
|
|
|
bool res = true;
|
|
|
|
|
|
|
|
if (stored) {
|
|
|
|
res = false;
|
|
|
|
|
|
|
|
for(; stored; stored=stored->nextvariant)
|
|
|
|
if ( stored->idsubst == idsubst ) {
|
|
|
|
res = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
static LexemeInfo*
|
|
|
|
findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
|
|
|
|
for(;;) {
|
|
|
|
int i;
|
|
|
|
LexemeInfo *ptr = newin[0];
|
|
|
|
|
|
|
|
for(i=0; i<newn; i++) {
|
|
|
|
while(newin[i] && newin[i]->idsubst < ptr->idsubst)
|
|
|
|
newin[i] = newin[i]->nextentry;
|
|
|
|
|
|
|
|
if ( newin[i] == NULL )
|
|
|
|
return in;
|
|
|
|
|
|
|
|
if ( newin[i]->idsubst > ptr->idsubst ) {
|
|
|
|
ptr = newin[i];
|
|
|
|
i=-1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
while(newin[i]->idsubst == ptr->idsubst) {
|
|
|
|
if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
|
|
|
|
ptr = newin[i];
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
newin[i] = newin[i]->nextentry;
|
|
|
|
if ( newin[i] == NULL )
|
|
|
|
return in;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( newin[i]->idsubst != ptr->idsubst ) {
|
|
|
|
ptr = newin[i];
|
|
|
|
i=-1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
|
|
|
|
|
|
|
|
ptr->nextvariant = in;
|
|
|
|
in = ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* step forward */
|
|
|
|
for(i=0; i<newn; i++)
|
|
|
|
newin[i] = newin[i]->nextentry;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static TSLexeme*
|
|
|
|
copyTSLexeme( TheSubstitute *ts ) {
|
|
|
|
TSLexeme *res;
|
|
|
|
uint16 i;
|
|
|
|
|
|
|
|
res = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) );
|
|
|
|
for(i=0;i<ts->reslen;i++) {
|
|
|
|
res[i] = ts->res[i];
|
|
|
|
res[i].lexeme = pstrdup( ts->res[i].lexeme );
|
|
|
|
}
|
|
|
|
|
|
|
|
res[ts->reslen].lexeme = NULL;
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
static TSLexeme*
|
|
|
|
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) {
|
|
|
|
*moreres = false;
|
|
|
|
while(info) {
|
|
|
|
Assert( info->idsubst < d->nsubst );
|
|
|
|
if ( info->nextvariant )
|
|
|
|
*moreres = true;
|
|
|
|
if ( d->subst[ info->idsubst ].lastlexeme == curpos )
|
|
|
|
return copyTSLexeme( d->subst + info->idsubst );
|
|
|
|
info = info->nextvariant;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
thesaurus_lexize(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
|
|
|
|
DictSubState *dstate = (DictSubState*)PG_GETARG_POINTER(3);
|
|
|
|
TSLexeme *res=NULL;
|
|
|
|
LexemeInfo *stored, *info = NULL;
|
|
|
|
uint16 curpos = 0;
|
|
|
|
bool moreres = false;
|
|
|
|
|
|
|
|
if ( dstate == NULL || PG_NARGS() < 4 )
|
|
|
|
elog(ERROR,"Forbidden call of thesaurus or nested call");
|
|
|
|
|
|
|
|
if ( dstate->isend )
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
|
|
stored = (LexemeInfo*) dstate->private;
|
|
|
|
|
|
|
|
if (stored)
|
|
|
|
curpos = stored->posinsubst+1;
|
|
|
|
|
|
|
|
res =(TSLexeme*) DatumGetPointer (
|
|
|
|
FunctionCall4(
|
|
|
|
&(d->subdict.lexize_info),
|
|
|
|
PointerGetDatum(d->subdict.dictionary),
|
|
|
|
PG_GETARG_DATUM(1),
|
|
|
|
PG_GETARG_INT32(2),
|
|
|
|
PointerGetDatum(NULL)
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
|
|
|
if ( res && res->lexeme ) {
|
|
|
|
TSLexeme *ptr = res , *basevar;
|
|
|
|
|
|
|
|
while( ptr->lexeme ) {
|
|
|
|
uint16 nv = ptr->nvariant;
|
|
|
|
uint16 i,nlex = 0;
|
|
|
|
LexemeInfo **infos;
|
|
|
|
|
|
|
|
basevar = ptr;
|
|
|
|
while( ptr->lexeme && nv == ptr->nvariant ) {
|
|
|
|
nlex++;
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
|
|
|
|
for(i=0;i<nlex;i++)
|
2006-06-07 00:25:55 +08:00
|
|
|
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
|
2006-05-31 22:05:31 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
if ( i<nlex ) {
|
|
|
|
/* no chance to find */
|
|
|
|
pfree( infos );
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
info = findVariant( info, stored, curpos, infos, nlex);
|
|
|
|
}
|
2006-06-03 01:55:40 +08:00
|
|
|
} else if ( res ) { /* stop-word */
|
2006-05-31 22:05:31 +08:00
|
|
|
LexemeInfo *infos = findTheLexeme(d, NULL);
|
|
|
|
info = findVariant( NULL, stored, curpos, &infos, 1);
|
2006-06-03 01:55:40 +08:00
|
|
|
} else {
|
|
|
|
info = NULL; /* word isn't recognized */
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
dstate->private = (void*)info;
|
|
|
|
|
|
|
|
if ( !info ) {
|
|
|
|
dstate->getnext = false;
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
|
|
|
|
dstate->getnext = moreres;
|
|
|
|
PG_RETURN_POINTER(res);
|
|
|
|
}
|
|
|
|
|
|
|
|
dstate->getnext = true;
|
|
|
|
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
|
|
}
|