postgresql/contrib/tsearch/morph.c

215 lines
4.6 KiB
C
Raw Normal View History

/*
* morphology module
* New dictionary is include in dict.h. For languages which
* use latin charset it may be need to modify mapdict table.
* Teodor Sigaev <teodor@stack.net>
*/
#include "postgres.h"
2002-09-06 04:51:39 +08:00
#include <locale.h>
#include "utils/builtins.h"
#include "morph.h"
#include "deflex.h"
/*
* Struct for calling dictionaries
* All of this methods are optional, but
* if all methods are NULL, then dictionary does nothing :)
* Return value of lemmatize must be palloced or the same.
* Return value of init must be malloced in other case
* it will be free in end of transaction!
*/
typedef struct
{
2002-09-06 04:51:39 +08:00
char localename[NAMEDATALEN];
/* init dictionary */
void *(*init) (void);
/* close dictionary */
void (*close) (void *);
/* find in dictionary */
char *(*lemmatize) (void *, char *, int *);
int (*is_stoplemm) (void *, char *, int);
int (*is_stemstoplemm) (void *, char *, int);
} DICT;
/* insert all dictionaries */
#define DICT_BODY
#include "dict.h"
#undef DICT_BODY
/* fill dictionary's structure */
#define DICT_TABLE
DICT dicts[] = {
{
"C", NULL, NULL, NULL, NULL, NULL /* fake dictionary */
}
#include "dict.h"
};
#undef DICT_TABLE
2002-09-06 04:51:39 +08:00
/* array for storing dictionary's objects (if needed) */
2003-08-04 08:43:34 +08:00
void *dictobjs[
lengthof(dicts)];
#define STOPLEXEM -2
#define BYLOCALE -1
#define NODICT 0
#define DEFAULTDICT 1
#define MAXNDICT 2
typedef int2 MAPDICT[MAXNDICT];
#define GETDICT(x,i) *( ((int2*)(x)) + (i) )
/* map dictionaries for lexem type */
static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* not used */
{DEFAULTDICT, NODICT}, /* LATWORD */
{BYLOCALE, NODICT}, /* NONLATINWORD */
{BYLOCALE, DEFAULTDICT}, /* UWORD */
{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
{NODICT, NODICT}, /* SCIENTIFIC */
{NODICT, NODICT}, /* VERSIONNUMBER */
{BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
{BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
{DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
{STOPLEXEM, NODICT}, /* SPACE */
{STOPLEXEM, NODICT}, /* TAG */
{STOPLEXEM, NODICT}, /* HTTP */
{BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
{DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
{BYLOCALE, NODICT}, /* CYRHYPHENWORD */
{NODICT, NODICT}, /* URI */
{NODICT, NODICT}, /* FILEPATH */
{NODICT, NODICT}, /* DECIMAL */
{NODICT, NODICT}, /* SIGNEDINT */
{NODICT, NODICT}, /* UNSIGNEDINT */
{STOPLEXEM, NODICT} /* HTMLENTITY */
};
static bool inited = false;
void
initmorph(void)
{
int i,
j,
k;
MAPDICT *md;
bool needinit[lengthof(dicts)];
2002-09-06 04:51:39 +08:00
const char *curlocale;
int bylocaledict = NODICT;
if (inited)
return;
for (i = 1; i < lengthof(dicts); i++)
needinit[i] = false;
2002-09-06 04:51:39 +08:00
curlocale = setlocale(LC_CTYPE, NULL);
if (curlocale)
{
for (i = 1; i < lengthof(dicts); i++)
2002-09-06 04:51:39 +08:00
if (strcmp(dicts[i].localename, curlocale) == 0)
{
bylocaledict = i;
break;
}
2002-09-06 04:51:39 +08:00
}
for (i = 1; i < lengthof(mapdict); i++)
{
k = 0;
md = &mapdict[i];
for (j = 0; j < MAXNDICT; j++)
{
GETDICT(md, k) = GETDICT(md, j);
if (GETDICT(md, k) == NODICT)
break;
else if (GETDICT(md, k) == BYLOCALE)
{
if (bylocaledict == NODICT)
continue;
GETDICT(md, k) = bylocaledict;
}
if (GETDICT(md, k) >= (int2) lengthof(dicts))
continue;
needinit[GETDICT(md, k)] = true;
k++;
}
for (; k < MAXNDICT; k++)
if (GETDICT(md, k) != STOPLEXEM)
GETDICT(md, k) = NODICT;
}
for (i = 1; i < lengthof(dicts); i++)
if (needinit[i] && dicts[i].init)
dictobjs[i] = (*(dicts[i].init)) ();
inited = true;
return;
}
char *
lemmatize(char *word, int *len, int type)
{
int2 nd;
int i;
DICT *dict;
for (i = 0; i < MAXNDICT; i++)
{
nd = GETDICT(&mapdict[type], i);
if (nd == NODICT)
{
/* there is no dictionary */
return word;
}
else if (nd == STOPLEXEM)
{
/* word is stopword */
return NULL;
}
else if (nd == BYLOCALE)
{
2003-08-04 08:43:34 +08:00
continue; /* no dict for current locale */
}
else
{
dict = &dicts[nd];
if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
return NULL;
if (dict->lemmatize)
{
int oldlen = *len;
char *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
/* word is recognized by dictionary */
if (newword != word || *len != oldlen)
{
if (dict->is_stemstoplemm &&
(*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
{
if (newword != word && newword)
pfree(newword);
return NULL;
}
return newword;
}
}
}
}
return word;
}
bool
is_stoptype(int type)
{
return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;
}