2001-10-13 07:19:09 +08:00
|
|
|
/*
 * morphology module
 * New dictionaries are included via dict.h. For languages which
 * use a Latin charset it may be necessary to modify the mapdict table.
 * Teodor Sigaev <teodor@stack.net>
 */
|
2001-10-13 07:19:09 +08:00
|
|
|
#include "postgres.h"
|
|
|
|
|
2002-09-06 04:51:39 +08:00
|
|
|
#include <locale.h>
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
#include "utils/builtins.h"
|
|
|
|
|
|
|
|
#include "morph.h"
|
|
|
|
#include "deflex.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Struct for calling dictionaries
|
|
|
|
* All of this methods are optional, but
|
|
|
|
* if all methods are NULL, then dictionary does nothing :)
|
|
|
|
* Return value of lemmatize must be palloced or the same.
|
2001-10-25 13:50:21 +08:00
|
|
|
* Return value of init must be malloced in other case
|
|
|
|
* it will be free in end of transaction!
|
2001-10-13 07:19:09 +08:00
|
|
|
*/
|
2001-10-25 13:50:21 +08:00
|
|
|
typedef struct
|
|
|
|
{
|
2002-09-06 04:51:39 +08:00
|
|
|
char localename[NAMEDATALEN];
|
2001-10-13 07:19:09 +08:00
|
|
|
/* init dictionary */
|
2001-10-25 13:50:21 +08:00
|
|
|
void *(*init) (void);
|
2001-10-13 07:19:09 +08:00
|
|
|
/* close dictionary */
|
2001-10-25 13:50:21 +08:00
|
|
|
void (*close) (void *);
|
2001-10-13 07:19:09 +08:00
|
|
|
/* find in dictionary */
|
2001-10-25 13:50:21 +08:00
|
|
|
char *(*lemmatize) (void *, char *, int *);
|
|
|
|
int (*is_stoplemm) (void *, char *, int);
|
|
|
|
int (*is_stemstoplemm) (void *, char *, int);
|
2001-11-06 01:46:40 +08:00
|
|
|
} DICT;
|
2001-10-13 07:19:09 +08:00
|
|
|
|
|
|
|
/* insert all dictionaries */
|
|
|
|
#define DICT_BODY
|
|
|
|
#include "dict.h"
|
2001-10-25 13:50:21 +08:00
|
|
|
#undef DICT_BODY
|
2001-10-13 07:19:09 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/* fill dictionary's structure */
|
2001-10-13 07:19:09 +08:00
|
|
|
#define DICT_TABLE
|
2001-10-25 13:50:21 +08:00
|
|
|
DICT dicts[] = {
|
2001-10-13 07:19:09 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
"C", NULL, NULL, NULL, NULL, NULL /* fake dictionary */
|
2001-10-13 07:19:09 +08:00
|
|
|
}
|
|
|
|
#include "dict.h"
|
|
|
|
};
|
2001-10-25 13:50:21 +08:00
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
#undef DICT_TABLE
|
|
|
|
|
2002-09-06 04:51:39 +08:00
|
|
|
/* array for storing dictionary's objects (if needed) */
|
2003-08-04 08:43:34 +08:00
|
|
|
void *dictobjs[
|
|
|
|
lengthof(dicts)];
|
2001-10-13 07:19:09 +08:00
|
|
|
|
|
|
|
#define STOPLEXEM -2
|
|
|
|
#define BYLOCALE -1
|
2001-10-25 13:50:21 +08:00
|
|
|
#define NODICT 0
|
|
|
|
#define DEFAULTDICT 1
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
#define MAXNDICT 2
|
2001-10-25 13:50:21 +08:00
|
|
|
typedef int2 MAPDICT[MAXNDICT];
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
#define GETDICT(x,i) *( ((int2*)(x)) + (i) )
|
|
|
|
|
|
|
|
/* map dictionaries for lexem type */
|
|
|
|
static MAPDICT mapdict[] = {
|
2001-10-25 13:50:21 +08:00
|
|
|
{NODICT, NODICT}, /* not used */
|
|
|
|
{DEFAULTDICT, NODICT}, /* LATWORD */
|
|
|
|
{BYLOCALE, NODICT}, /* NONLATINWORD */
|
|
|
|
{BYLOCALE, DEFAULTDICT}, /* UWORD */
|
|
|
|
{NODICT, NODICT}, /* EMAIL */
|
|
|
|
{NODICT, NODICT}, /* FURL */
|
|
|
|
{NODICT, NODICT}, /* HOST */
|
2002-08-15 11:02:08 +08:00
|
|
|
{NODICT, NODICT}, /* SCIENTIFIC */
|
|
|
|
{NODICT, NODICT}, /* VERSIONNUMBER */
|
|
|
|
{BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
|
|
|
|
{BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
|
|
|
|
{DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
|
2001-10-25 13:50:21 +08:00
|
|
|
{STOPLEXEM, NODICT}, /* SPACE */
|
2002-08-15 11:02:08 +08:00
|
|
|
{STOPLEXEM, NODICT}, /* TAG */
|
2001-10-25 13:50:21 +08:00
|
|
|
{STOPLEXEM, NODICT}, /* HTTP */
|
2002-08-15 11:02:08 +08:00
|
|
|
{BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
|
|
|
|
{DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
|
|
|
|
{BYLOCALE, NODICT}, /* CYRHYPHENWORD */
|
2001-10-25 13:50:21 +08:00
|
|
|
{NODICT, NODICT}, /* URI */
|
2002-08-15 11:02:08 +08:00
|
|
|
{NODICT, NODICT}, /* FILEPATH */
|
|
|
|
{NODICT, NODICT}, /* DECIMAL */
|
|
|
|
{NODICT, NODICT}, /* SIGNEDINT */
|
|
|
|
{NODICT, NODICT}, /* UNSIGNEDINT */
|
|
|
|
{STOPLEXEM, NODICT} /* HTMLENTITY */
|
2001-10-13 07:19:09 +08:00
|
|
|
};
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static bool inited = false;
|
|
|
|
|
|
|
|
void
|
|
|
|
initmorph(void)
|
|
|
|
{
|
|
|
|
int i,
|
|
|
|
j,
|
|
|
|
k;
|
|
|
|
MAPDICT *md;
|
|
|
|
bool needinit[lengthof(dicts)];
|
2002-09-06 04:51:39 +08:00
|
|
|
const char *curlocale;
|
2001-10-25 13:50:21 +08:00
|
|
|
int bylocaledict = NODICT;
|
2001-10-13 07:19:09 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
if (inited)
|
|
|
|
return;
|
|
|
|
for (i = 1; i < lengthof(dicts); i++)
|
2001-10-13 07:19:09 +08:00
|
|
|
needinit[i] = false;
|
2001-10-25 13:50:21 +08:00
|
|
|
|
2002-09-06 04:51:39 +08:00
|
|
|
curlocale = setlocale(LC_CTYPE, NULL);
|
|
|
|
if (curlocale)
|
|
|
|
{
|
2002-03-12 00:54:27 +08:00
|
|
|
for (i = 1; i < lengthof(dicts); i++)
|
2002-09-06 04:51:39 +08:00
|
|
|
if (strcmp(dicts[i].localename, curlocale) == 0)
|
2002-03-12 00:54:27 +08:00
|
|
|
{
|
|
|
|
bylocaledict = i;
|
|
|
|
break;
|
|
|
|
}
|
2002-09-06 04:51:39 +08:00
|
|
|
}
|
2001-10-13 07:19:09 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
for (i = 1; i < lengthof(mapdict); i++)
|
|
|
|
{
|
|
|
|
k = 0;
|
2001-10-13 07:19:09 +08:00
|
|
|
md = &mapdict[i];
|
2001-10-25 13:50:21 +08:00
|
|
|
for (j = 0; j < MAXNDICT; j++)
|
|
|
|
{
|
|
|
|
GETDICT(md, k) = GETDICT(md, j);
|
|
|
|
if (GETDICT(md, k) == NODICT)
|
2001-10-13 07:19:09 +08:00
|
|
|
break;
|
2001-10-25 13:50:21 +08:00
|
|
|
else if (GETDICT(md, k) == BYLOCALE)
|
|
|
|
{
|
|
|
|
if (bylocaledict == NODICT)
|
2001-10-13 07:19:09 +08:00
|
|
|
continue;
|
2001-10-25 13:50:21 +08:00
|
|
|
GETDICT(md, k) = bylocaledict;
|
2001-10-13 07:19:09 +08:00
|
|
|
}
|
2001-10-25 13:50:21 +08:00
|
|
|
if (GETDICT(md, k) >= (int2) lengthof(dicts))
|
2001-10-13 07:19:09 +08:00
|
|
|
continue;
|
2001-10-25 13:50:21 +08:00
|
|
|
needinit[GETDICT(md, k)] = true;
|
|
|
|
k++;
|
2001-10-13 07:19:09 +08:00
|
|
|
}
|
2001-10-25 13:50:21 +08:00
|
|
|
for (; k < MAXNDICT; k++)
|
|
|
|
if (GETDICT(md, k) != STOPLEXEM)
|
|
|
|
GETDICT(md, k) = NODICT;
|
2001-10-13 07:19:09 +08:00
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
for (i = 1; i < lengthof(dicts); i++)
|
|
|
|
if (needinit[i] && dicts[i].init)
|
|
|
|
dictobjs[i] = (*(dicts[i].init)) ();
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
inited = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
char *
|
|
|
|
lemmatize(char *word, int *len, int type)
|
|
|
|
{
|
|
|
|
int2 nd;
|
|
|
|
int i;
|
|
|
|
DICT *dict;
|
2001-10-13 07:19:09 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
for (i = 0; i < MAXNDICT; i++)
|
|
|
|
{
|
|
|
|
nd = GETDICT(&mapdict[type], i);
|
|
|
|
if (nd == NODICT)
|
|
|
|
{
|
|
|
|
/* there is no dictionary */
|
2001-10-13 07:19:09 +08:00
|
|
|
return word;
|
2001-10-25 13:50:21 +08:00
|
|
|
}
|
|
|
|
else if (nd == STOPLEXEM)
|
|
|
|
{
|
2001-10-13 07:19:09 +08:00
|
|
|
/* word is stopword */
|
|
|
|
return NULL;
|
2001-10-25 13:50:21 +08:00
|
|
|
}
|
2002-12-06 13:15:02 +08:00
|
|
|
else if (nd == BYLOCALE)
|
|
|
|
{
|
2003-08-04 08:43:34 +08:00
|
|
|
continue; /* no dict for current locale */
|
2002-12-06 13:15:02 +08:00
|
|
|
}
|
2001-10-25 13:50:21 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
dict = &dicts[nd];
|
|
|
|
if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
|
2001-10-13 07:19:09 +08:00
|
|
|
return NULL;
|
2001-10-25 13:50:21 +08:00
|
|
|
if (dict->lemmatize)
|
|
|
|
{
|
|
|
|
int oldlen = *len;
|
|
|
|
char *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
|
|
|
|
|
2003-03-11 06:28:22 +08:00
|
|
|
/* word is recognized by dictionary */
|
2001-10-25 13:50:21 +08:00
|
|
|
if (newword != word || *len != oldlen)
|
|
|
|
{
|
|
|
|
if (dict->is_stemstoplemm &&
|
|
|
|
(*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
|
|
|
|
{
|
|
|
|
if (newword != word && newword)
|
2001-10-13 07:19:09 +08:00
|
|
|
pfree(newword);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
return newword;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return word;
|
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
bool
|
|
|
|
is_stoptype(int type)
|
|
|
|
{
|
|
|
|
return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;
|
2001-10-13 07:19:09 +08:00
|
|
|
}
|