2003-07-21 18:27:44 +08:00
|
|
|
#include "postgres.h"
|
|
|
|
|
2006-07-14 13:28:29 +08:00
|
|
|
#include <ctype.h>
|
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
#include "spell.h"
|
2005-12-21 21:05:49 +08:00
|
|
|
#include "common.h"
|
2005-12-12 19:10:12 +08:00
|
|
|
#include "ts_locale.h"
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
#define MAX_NORM	1024
#define MAXNORMLEN	256

#define ERRSTRSIZE	1024

/*
 * Prefix test: compares s with p over the first strlen(p) bytes only.
 * Note that p is evaluated twice.
 */
#define STRNCMP(s,p)	strncmp( (s), (p), strlen(p) )

/*
 * Byte N of the L-byte string W: counted from the start when T is
 * FF_PREFIX, and from the end of the string otherwise.
 */
#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
/* Same as GETWCHAR, applied to the replacement string of the AFFIX at A */
#define GETCHAR(A,N,T)	  GETWCHAR( (A)->repl, (A)->replen, N, T )

/* Shared empty string used in place of allocating empty mask/find/repl copies */
static char *VoidString = "";

/*
 * Raise an out-of-memory error when X is false/NULL.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement;
 * the previous bare-"if" form silently captured a following "else".
 */
#define MEMOUT(X) \
	do { \
		if ( !(X) ) \
			ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); \
	} while (0)
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
static int
|
|
|
|
cmpspell(const void *s1, const void *s2)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
return (strcmp((*(const SPELL **) s1)->word, (*(const SPELL **) s2)->word));
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
static int
|
|
|
|
cmpspellaffix(const void *s1, const void *s2)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag));
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
/*
 * Return a palloc'd, NUL-terminated copy of the first len bytes of s.
 */
static char *
strnduplicate(char *s, int len)
{
	char	   *copy = (char *) palloc(len + 1);

	copy[len] = '\0';
	memcpy(copy, s, len);

	return copy;
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
/*
 * Walk str one character (pg_mblen() bytes) at a time and return a pointer
 * to the first character for which t_iseq(.., c) holds, or NULL when the
 * end of the string is reached without a match.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
|
|
|
|
|
|
|
|
|
2005-09-25 03:14:05 +08:00
|
|
|
/*
 * Backward string compare for suffix tree operations.
 *
 * Compares s1 and s2 byte by byte starting from their last bytes and moving
 * toward the front.  Returns -1, 0 or 1.  When one string is exhausted
 * first (i.e. it is a suffix of the other), the shorter string sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* all compared bytes matched: decide on what is left over */
	if (i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
|
2003-08-04 08:43:34 +08:00
|
|
|
/*
 * Bounded backward string compare.
 *
 * Like strbcmp(), but looks at no more than the last count bytes of each
 * string.  Returns 0 as soon as count bytes have matched; otherwise the
 * result follows the same rules as strbcmp().
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			left = count;

	for (; i1 >= 0 && i2 >= 0 && left > 0; i1--, i2--, left--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* budget used up, or both strings ran out together */
	if (left == 0 || i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
static int
|
|
|
|
cmpaffix(const void *s1, const void *s2)
|
|
|
|
{
|
2005-09-25 03:14:05 +08:00
|
|
|
const AFFIX *a1 = (const AFFIX *) s1;
|
|
|
|
const AFFIX *a2 = (const AFFIX *) s2;
|
|
|
|
|
|
|
|
if (a1->type < a2->type)
|
2003-08-04 08:43:34 +08:00
|
|
|
return -1;
|
2005-09-25 03:14:05 +08:00
|
|
|
if (a1->type > a2->type)
|
2003-08-04 08:43:34 +08:00
|
|
|
return 1;
|
2005-09-25 03:14:05 +08:00
|
|
|
if (a1->type == FF_PREFIX)
|
|
|
|
return strcmp(a1->repl, a2->repl);
|
2003-08-04 08:43:34 +08:00
|
|
|
else
|
2005-09-25 03:14:05 +08:00
|
|
|
return strbcmp((const unsigned char *) a1->repl,
|
|
|
|
(const unsigned char *) a2->repl);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
int
|
2003-11-18 01:34:35 +08:00
|
|
|
NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
if (Conf->nspell >= Conf->mspell)
|
|
|
|
{
|
|
|
|
if (Conf->mspell)
|
|
|
|
{
|
|
|
|
Conf->mspell += 1024 * 20;
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL*));
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Conf->mspell = 1024 * 20;
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Spell = (SPELL **) palloc(Conf->mspell * sizeof(SPELL*));
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
}
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Spell[Conf->nspell] = (SPELL*)palloc(SPELLHDRSZ + strlen(word) + 1);
|
|
|
|
strcpy( Conf->Spell[Conf->nspell]->word ,word );
|
|
|
|
strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
|
2003-07-21 18:27:44 +08:00
|
|
|
Conf->nspell++;
|
2003-08-04 08:43:34 +08:00
|
|
|
return (0);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
int
|
2003-11-18 01:34:35 +08:00
|
|
|
NIImportDictionary(IspellDict * Conf, const char *filename)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2005-09-25 03:14:05 +08:00
|
|
|
char str[BUFSIZ];
|
2003-08-04 08:43:34 +08:00
|
|
|
FILE *dict;
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
if (!(dict = fopen(filename, "r")))
|
|
|
|
return (1);
|
|
|
|
while (fgets(str, sizeof(str), dict))
|
|
|
|
{
|
2005-10-15 10:49:52 +08:00
|
|
|
char *s;
|
2005-09-25 03:14:05 +08:00
|
|
|
const char *flag;
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
pg_verifymbstr( str, strlen(str), false);
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
flag = NULL;
|
2005-12-21 21:05:49 +08:00
|
|
|
if ((s = findchar(str, '/')))
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2005-09-25 03:14:05 +08:00
|
|
|
*s++ = '\0';
|
2003-08-04 08:43:34 +08:00
|
|
|
flag = s;
|
|
|
|
while (*s)
|
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
/* we allow only single encoded flags for faster works */
|
|
|
|
if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
|
2003-07-21 18:27:44 +08:00
|
|
|
s++;
|
2003-08-04 08:43:34 +08:00
|
|
|
else
|
|
|
|
{
|
2005-09-25 03:14:05 +08:00
|
|
|
*s = '\0';
|
2003-07-21 18:27:44 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
else
|
|
|
|
flag = "";
|
2005-12-21 21:05:49 +08:00
|
|
|
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
s = str;
|
|
|
|
while (*s)
|
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
if (t_isspace(s)) {
|
2005-09-25 03:14:05 +08:00
|
|
|
*s = '\0';
|
2005-12-21 21:05:49 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
s+=pg_mblen(s);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2005-12-21 21:05:49 +08:00
|
|
|
lowerstr(str);
|
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
NIAddSpell(Conf, str, flag);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
fclose(dict);
|
2003-08-04 08:43:34 +08:00
|
|
|
return (0);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
static int
|
|
|
|
FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-08-29 13:07:03 +08:00
|
|
|
SPNode *node = Conf->Dictionary;
|
|
|
|
SPNodeData *StopLow,
|
|
|
|
*StopHigh,
|
|
|
|
*StopMiddle;
|
|
|
|
uint8 *ptr = (uint8 *) word;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
while (node && *ptr)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
StopLow = node->data;
|
2004-08-29 13:07:03 +08:00
|
|
|
StopHigh = node->data + node->length;
|
|
|
|
while (StopLow < StopHigh)
|
|
|
|
{
|
2004-06-23 19:06:11 +08:00
|
|
|
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
|
2004-08-29 13:07:03 +08:00
|
|
|
if (StopMiddle->val == *ptr)
|
|
|
|
{
|
|
|
|
if (*(ptr + 1) == '\0' && StopMiddle->isword)
|
|
|
|
{
|
|
|
|
if (compoundonly && !StopMiddle->compoundallow)
|
2003-11-18 01:34:35 +08:00
|
|
|
return 0;
|
2004-08-29 13:07:03 +08:00
|
|
|
if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
|
2003-11-18 01:34:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
node = StopMiddle->node;
|
2004-06-23 19:06:11 +08:00
|
|
|
ptr++;
|
2003-11-18 01:34:35 +08:00
|
|
|
break;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else if (StopMiddle->val < *ptr)
|
2003-11-18 01:34:35 +08:00
|
|
|
StopLow = StopMiddle + 1;
|
2004-08-29 13:07:03 +08:00
|
|
|
else
|
2003-11-18 01:34:35 +08:00
|
|
|
StopHigh = StopMiddle;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
if (StopLow >= StopHigh)
|
|
|
|
break;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
return 0;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
int
|
2003-11-18 01:34:35 +08:00
|
|
|
NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
if (Conf->naffixes >= Conf->maffixes)
|
|
|
|
{
|
|
|
|
if (Conf->maffixes)
|
|
|
|
{
|
|
|
|
Conf->maffixes += 16;
|
|
|
|
Conf->Affix = (AFFIX *) realloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Conf->maffixes = 16;
|
|
|
|
Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX));
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->Affix);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2004-06-23 19:06:11 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (strcmp(mask, ".") == 0)
|
|
|
|
{
|
|
|
|
Conf->Affix[Conf->naffixes].issimple = 1;
|
|
|
|
Conf->Affix[Conf->naffixes].isregis = 0;
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Affix[Conf->naffixes].mask = VoidString;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else if (RS_isRegis(mask))
|
|
|
|
{
|
|
|
|
Conf->Affix[Conf->naffixes].issimple = 0;
|
|
|
|
Conf->Affix[Conf->naffixes].isregis = 1;
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Affix[Conf->naffixes].mask = (mask && *mask) ? strdup(mask) : VoidString;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
int masklen = strlen(mask);
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->Affix[Conf->naffixes].issimple = 0;
|
|
|
|
Conf->Affix[Conf->naffixes].isregis = 0;
|
2005-12-21 21:05:49 +08:00
|
|
|
Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
|
|
|
|
if (type == FF_SUFFIX)
|
|
|
|
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
|
2004-08-29 13:07:03 +08:00
|
|
|
else
|
|
|
|
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
|
|
|
|
}
|
2005-01-12 00:07:55 +08:00
|
|
|
MEMOUT(Conf->Affix[Conf->naffixes].mask);
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->Affix[Conf->naffixes].compile = 1;
|
|
|
|
Conf->Affix[Conf->naffixes].flagflags = flagflags;
|
|
|
|
Conf->Affix[Conf->naffixes].flag = flag;
|
|
|
|
Conf->Affix[Conf->naffixes].type = type;
|
|
|
|
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Affix[Conf->naffixes].find = (find && *find) ? strdup(find) : VoidString;
|
2005-01-12 00:07:55 +08:00
|
|
|
MEMOUT(Conf->Affix[Conf->naffixes].find);
|
2006-02-10 02:04:20 +08:00
|
|
|
if ( (Conf->Affix[Conf->naffixes].replen = strlen(repl)) > 0 ) {
|
|
|
|
Conf->Affix[Conf->naffixes].repl = strdup(repl);
|
|
|
|
MEMOUT(Conf->Affix[Conf->naffixes].repl);
|
|
|
|
} else
|
|
|
|
Conf->Affix[Conf->naffixes].repl = VoidString;
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->naffixes++;
|
2003-08-04 08:43:34 +08:00
|
|
|
return (0);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
#define PAE_WAIT_MASK 0
|
|
|
|
#define PAE_INMASK 1
|
|
|
|
#define PAE_WAIT_FIND 2
|
|
|
|
#define PAE_INFIND 3
|
|
|
|
#define PAE_WAIT_REPL 4
|
|
|
|
#define PAE_INREPL 5
|
|
|
|
|
|
|
|
static bool
|
2006-02-10 20:56:14 +08:00
|
|
|
parse_affentry( char *str, char *mask, char *find, char *repl, int line ) {
|
2005-12-21 21:05:49 +08:00
|
|
|
int state = PAE_WAIT_MASK;
|
|
|
|
char *pmask=mask, *pfind=find, *prepl=repl;
|
|
|
|
|
|
|
|
*mask = *find = *repl = '\0';
|
|
|
|
|
|
|
|
while(*str) {
|
|
|
|
if ( state == PAE_WAIT_MASK ) {
|
|
|
|
if ( t_iseq(str,'#') )
|
|
|
|
return false;
|
|
|
|
else if (!t_isspace(str)) {
|
|
|
|
COPYCHAR(pmask, str);
|
|
|
|
pmask += pg_mblen(str);
|
|
|
|
state = PAE_INMASK;
|
|
|
|
}
|
|
|
|
} else if ( state == PAE_INMASK ) {
|
|
|
|
if ( t_iseq(str,'>') ) {
|
|
|
|
*pmask='\0';
|
|
|
|
state = PAE_WAIT_FIND;
|
|
|
|
} else if (!t_isspace(str)) {
|
|
|
|
COPYCHAR(pmask, str);
|
|
|
|
pmask += pg_mblen(str);
|
|
|
|
}
|
|
|
|
} else if ( state == PAE_WAIT_FIND ) {
|
|
|
|
if ( t_iseq(str,'-') ) {
|
|
|
|
state = PAE_INFIND;
|
2006-02-10 20:56:14 +08:00
|
|
|
} else if (t_isalpha(str) || t_iseq(str,'\'') /* english 's */) {
|
2005-12-21 21:05:49 +08:00
|
|
|
COPYCHAR(prepl,str);
|
|
|
|
prepl += pg_mblen(str);
|
|
|
|
state = PAE_INREPL;
|
|
|
|
} else if (!t_isspace(str))
|
2006-02-10 20:56:14 +08:00
|
|
|
ts_error(ERROR, "Affix parse error at %d line", line);
|
2005-12-21 21:05:49 +08:00
|
|
|
} else if ( state == PAE_INFIND ) {
|
|
|
|
if ( t_iseq(str,',') ) {
|
|
|
|
*pfind='\0';
|
|
|
|
state = PAE_WAIT_REPL;
|
|
|
|
} else if (t_isalpha(str)) {
|
|
|
|
COPYCHAR(pfind,str);
|
|
|
|
pfind += pg_mblen(str);
|
|
|
|
} else if (!t_isspace(str))
|
2006-02-10 20:56:14 +08:00
|
|
|
ts_error(ERROR, "Affix parse error at %d line", line);
|
2005-12-21 21:05:49 +08:00
|
|
|
} else if ( state == PAE_WAIT_REPL ) {
|
|
|
|
if ( t_iseq(str,'-') ) {
|
|
|
|
break; /* void repl */
|
|
|
|
} else if ( t_isalpha(str) ) {
|
|
|
|
COPYCHAR(prepl,str);
|
|
|
|
prepl += pg_mblen(str);
|
|
|
|
state = PAE_INREPL;
|
|
|
|
} else if (!t_isspace(str))
|
2006-02-10 20:56:14 +08:00
|
|
|
ts_error(ERROR, "Affix parse error at %d line", line);
|
2005-12-21 21:05:49 +08:00
|
|
|
} else if ( state == PAE_INREPL ) {
|
|
|
|
if ( t_iseq(str,'#') ) {
|
|
|
|
*prepl = '\0';
|
|
|
|
break;
|
|
|
|
} else if ( t_isalpha(str) ) {
|
|
|
|
COPYCHAR(prepl,str);
|
|
|
|
prepl += pg_mblen(str);
|
|
|
|
} else if (!t_isspace(str))
|
2006-02-10 20:56:14 +08:00
|
|
|
ts_error(ERROR, "Affix parse error at %d line", line);
|
2005-12-21 21:05:49 +08:00
|
|
|
} else
|
|
|
|
ts_error(ERROR, "Unknown state in parse_affentry: %d", state);
|
|
|
|
|
|
|
|
str += pg_mblen(str);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
*pmask = *pfind = *prepl = '\0';
|
|
|
|
|
|
|
|
return ( *mask && ( *find || *repl) ) ? true : false;
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
int
|
2003-11-18 01:34:35 +08:00
|
|
|
NIImportAffixes(IspellDict * Conf, const char *filename)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2005-09-25 03:14:05 +08:00
|
|
|
char str[BUFSIZ];
|
2005-12-21 21:05:49 +08:00
|
|
|
char tmpstr[BUFSIZ];
|
2005-09-25 03:14:05 +08:00
|
|
|
char mask[BUFSIZ];
|
|
|
|
char find[BUFSIZ];
|
|
|
|
char repl[BUFSIZ];
|
|
|
|
char *s;
|
2003-08-04 08:43:34 +08:00
|
|
|
int suffixes = 0;
|
|
|
|
int prefixes = 0;
|
2005-09-25 03:14:05 +08:00
|
|
|
int flag = 0;
|
|
|
|
char flagflags = 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
FILE *affix;
|
2006-02-10 20:56:14 +08:00
|
|
|
int line=0;
|
2006-06-09 21:25:59 +08:00
|
|
|
int oldformat = 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
|
|
|
if (!(affix = fopen(filename, "r")))
|
|
|
|
return (1);
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->compoundcontrol = '\t';
|
2003-08-04 08:43:34 +08:00
|
|
|
|
|
|
|
while (fgets(str, sizeof(str), affix))
|
|
|
|
{
|
2006-02-10 20:56:14 +08:00
|
|
|
line++;
|
2005-12-21 21:05:49 +08:00
|
|
|
pg_verifymbstr( str, strlen(str), false);
|
|
|
|
memcpy(tmpstr, str, 32); /* compoundwords... */
|
|
|
|
tmpstr[32]='\0';
|
|
|
|
lowerstr(tmpstr);
|
|
|
|
if (STRNCMP(tmpstr, "compoundwords") == 0)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
s = findchar(str, 'l');
|
2004-08-29 13:07:03 +08:00
|
|
|
if (s)
|
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
while (*s && !t_isspace(s)) s++;
|
|
|
|
while (*s && t_isspace(s)) s++;
|
|
|
|
if ( *s && pg_mblen(s) == 1 )
|
|
|
|
Conf->compoundcontrol = *s;
|
2006-06-09 21:25:59 +08:00
|
|
|
oldformat++;
|
2004-08-29 13:07:03 +08:00
|
|
|
continue;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
}
|
2005-12-21 21:05:49 +08:00
|
|
|
if (STRNCMP(tmpstr, "suffixes") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
suffixes = 1;
|
|
|
|
prefixes = 0;
|
2006-06-09 21:25:59 +08:00
|
|
|
oldformat++;
|
2003-07-21 18:27:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
2005-12-21 21:05:49 +08:00
|
|
|
if (STRNCMP(tmpstr, "prefixes") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
suffixes = 0;
|
|
|
|
prefixes = 1;
|
2006-06-09 21:25:59 +08:00
|
|
|
oldformat++;
|
2003-07-21 18:27:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
2005-12-21 21:05:49 +08:00
|
|
|
if (STRNCMP(tmpstr, "flag") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
s = str + 4;
|
2004-08-29 13:07:03 +08:00
|
|
|
flagflags = 0;
|
2005-12-21 21:05:49 +08:00
|
|
|
|
|
|
|
while (*s && t_isspace(s)) s++;
|
2006-06-09 21:25:59 +08:00
|
|
|
oldformat++;
|
2005-12-21 21:05:49 +08:00
|
|
|
|
|
|
|
/* allow only single-encoded flags */
|
2006-06-09 21:25:59 +08:00
|
|
|
if ( pg_mblen(s) != 1 )
|
|
|
|
elog(ERROR,"Multiencoded flag at line %d: %s", line, s);
|
2005-12-21 21:05:49 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (*s == '*')
|
|
|
|
{
|
|
|
|
flagflags |= FF_CROSSPRODUCT;
|
2003-11-18 01:34:35 +08:00
|
|
|
s++;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else if (*s == '~')
|
|
|
|
{
|
|
|
|
flagflags |= FF_COMPOUNDONLYAFX;
|
2003-07-21 18:27:44 +08:00
|
|
|
s++;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (*s == '\\')
|
|
|
|
s++;
|
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
/* allow only single-encoded flags */
|
|
|
|
if ( pg_mblen(s) != 1 ) {
|
|
|
|
flagflags = 0;
|
2006-06-09 21:25:59 +08:00
|
|
|
elog(ERROR,"Multiencoded flag at line %d: %s", line, s);
|
2005-12-21 21:05:49 +08:00
|
|
|
}
|
|
|
|
|
2005-09-25 03:14:05 +08:00
|
|
|
flag = (unsigned char) *s;
|
2003-07-21 18:27:44 +08:00
|
|
|
continue;
|
|
|
|
}
|
2006-06-09 21:25:59 +08:00
|
|
|
if ( STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
|
|
|
|
STRNCMP(str, "PFX")==0 || STRNCMP(str, "SFX")==0 ) {
|
|
|
|
|
|
|
|
if ( oldformat )
|
|
|
|
elog(ERROR,"Wrong affix file format");
|
|
|
|
|
|
|
|
fclose(affix);
|
|
|
|
return NIImportOOAffixes(Conf, filename);
|
|
|
|
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
if ((!suffixes) && (!prefixes))
|
|
|
|
continue;
|
2005-12-21 21:05:49 +08:00
|
|
|
|
2005-12-12 19:10:12 +08:00
|
|
|
lowerstr(str);
|
2006-02-10 20:56:14 +08:00
|
|
|
if ( !parse_affentry(str, mask, find, repl, line) )
|
2005-12-21 21:05:49 +08:00
|
|
|
continue;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2005-09-25 03:14:05 +08:00
|
|
|
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
fclose(affix);
|
2003-08-04 08:43:34 +08:00
|
|
|
|
|
|
|
return (0);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2006-06-09 21:25:59 +08:00
|
|
|
int
|
|
|
|
NIImportOOAffixes(IspellDict * Conf, const char *filename) {
|
|
|
|
char str[BUFSIZ];
|
|
|
|
char type[BUFSIZ];
|
|
|
|
char sflag[BUFSIZ];
|
|
|
|
char mask[BUFSIZ];
|
|
|
|
char find[BUFSIZ];
|
|
|
|
char repl[BUFSIZ];
|
|
|
|
bool isSuffix = false;
|
|
|
|
int flag = 0;
|
|
|
|
char flagflags = 0;
|
|
|
|
FILE *affix;
|
|
|
|
int line=0;
|
|
|
|
int scanread = 0;
|
|
|
|
char scanbuf[BUFSIZ];
|
|
|
|
|
|
|
|
sprintf(scanbuf,"%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ/5, BUFSIZ/5, BUFSIZ/5, BUFSIZ/5);
|
|
|
|
|
|
|
|
if (!(affix = fopen(filename, "r")))
|
|
|
|
return (1);
|
|
|
|
Conf->compoundcontrol = '\t';
|
|
|
|
|
|
|
|
while (fgets(str, sizeof(str), affix))
|
|
|
|
{
|
|
|
|
line++;
|
|
|
|
if ( *str == '\0' || t_isspace(str) || t_iseq(str,'#') )
|
|
|
|
continue;
|
|
|
|
pg_verifymbstr( str, strlen(str), false);
|
|
|
|
|
|
|
|
if ( STRNCMP(str, "COMPOUNDFLAG")==0 ) {
|
|
|
|
char *s = str+strlen("COMPOUNDFLAG");
|
|
|
|
while (*s && t_isspace(s)) s++;
|
|
|
|
if ( *s && pg_mblen(s) == 1 )
|
|
|
|
Conf->compoundcontrol = *s;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask);
|
|
|
|
|
|
|
|
lowerstr(type);
|
|
|
|
if ( scanread<4 || (STRNCMP(type,"sfx") && STRNCMP(type,"pfx")) )
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if ( scanread == 4 ) {
|
|
|
|
if ( strlen(sflag) != 1 )
|
|
|
|
continue;
|
|
|
|
flag = *sflag;
|
|
|
|
isSuffix = (STRNCMP(type,"sfx")==0) ? true : false;
|
|
|
|
lowerstr(find);
|
|
|
|
if ( t_iseq(find,'y') )
|
|
|
|
flagflags |= FF_CROSSPRODUCT;
|
|
|
|
else
|
|
|
|
flagflags = 0;
|
|
|
|
} else {
|
|
|
|
if ( strlen(sflag) != 1 || flag != *sflag || flag==0 )
|
|
|
|
continue;
|
|
|
|
lowerstr(repl);
|
|
|
|
lowerstr(find);
|
|
|
|
lowerstr(mask);
|
|
|
|
if ( t_iseq(find,'0') )
|
|
|
|
*find = '\0';
|
|
|
|
if ( t_iseq(repl,'0') )
|
|
|
|
*repl = '\0';
|
|
|
|
|
|
|
|
NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? FF_SUFFIX : FF_PREFIX);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fclose(affix);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static int
|
|
|
|
MergeAffix(IspellDict * Conf, int a1, int a2)
|
|
|
|
{
|
|
|
|
int naffix = 0;
|
|
|
|
char **ptr = Conf->AffixData;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
while (*ptr)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
naffix++;
|
|
|
|
ptr++;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
Conf->AffixData = (char **) realloc(Conf->AffixData, (naffix + 2) * sizeof(char *));
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->AffixData);
|
|
|
|
ptr = Conf->AffixData + naffix;
|
2004-08-29 13:07:03 +08:00
|
|
|
*ptr = malloc(strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) + 1 /* space */ + 1 /* \0 */ );
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(ptr);
|
|
|
|
sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
|
|
|
|
ptr++;
|
2004-08-29 13:07:03 +08:00
|
|
|
*ptr = '\0';
|
|
|
|
return naffix;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static SPNode *
|
|
|
|
mkSPNode(IspellDict * Conf, int low, int high, int level)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int nchar = 0;
|
|
|
|
char lastchar = '\0';
|
|
|
|
SPNode *rs;
|
2003-11-18 01:34:35 +08:00
|
|
|
SPNodeData *data;
|
2004-08-29 13:07:03 +08:00
|
|
|
int lownew = low;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = low; i < high; i++)
|
2006-02-10 02:04:20 +08:00
|
|
|
if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
nchar++;
|
2006-02-10 02:04:20 +08:00
|
|
|
lastchar = Conf->Spell[i]->word[level];
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!nchar)
|
|
|
|
return NULL;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
rs = (SPNode *) malloc(SPNHRDSZ + nchar * sizeof(SPNodeData));
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(rs);
|
2004-08-29 13:07:03 +08:00
|
|
|
memset(rs, 0, SPNHRDSZ + nchar * sizeof(SPNodeData));
|
2003-11-18 01:34:35 +08:00
|
|
|
rs->length = nchar;
|
2004-08-29 13:07:03 +08:00
|
|
|
data = rs->data;
|
|
|
|
|
|
|
|
lastchar = '\0';
|
|
|
|
for (i = low; i < high; i++)
|
2006-02-10 02:04:20 +08:00
|
|
|
if (Conf->Spell[i]->p.d.len > level)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (lastchar != Conf->Spell[i]->word[level])
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
|
|
|
if (lastchar)
|
|
|
|
{
|
|
|
|
data->node = mkSPNode(Conf, lownew, i, level + 1);
|
|
|
|
lownew = i;
|
2003-11-18 01:34:35 +08:00
|
|
|
data++;
|
|
|
|
}
|
2006-02-10 02:04:20 +08:00
|
|
|
lastchar = Conf->Spell[i]->word[level];
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2006-02-10 02:04:20 +08:00
|
|
|
data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
|
|
|
|
if (Conf->Spell[i]->p.d.len == level + 1)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
|
|
|
/*
|
2005-10-15 10:49:52 +08:00
|
|
|
* fprintf(stderr,"Word already exists: %s (affixes: '%s'
|
2006-02-10 02:04:20 +08:00
|
|
|
* and '%s')\n", Conf->Spell[i]->word,
|
2004-08-29 13:07:03 +08:00
|
|
|
* Conf->AffixData[data->affix],
|
2006-02-10 02:04:20 +08:00
|
|
|
* Conf->AffixData[Conf->Spell[i]->p.d.affix] );
|
2004-08-29 13:07:03 +08:00
|
|
|
*/
|
2003-11-18 01:34:35 +08:00
|
|
|
/* MergeAffix called a few times */
|
2006-02-10 02:04:20 +08:00
|
|
|
data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
2006-02-10 02:04:20 +08:00
|
|
|
data->affix = Conf->Spell[i]->p.d.affix;
|
2004-08-29 13:07:03 +08:00
|
|
|
data->isword = 1;
|
|
|
|
if (strchr(Conf->AffixData[data->affix], Conf->compoundcontrol))
|
|
|
|
data->compoundallow = 1;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
data->node = mkSPNode(Conf, lownew, high, level + 1);
|
2003-11-18 01:34:35 +08:00
|
|
|
|
|
|
|
return rs;
|
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
void
|
2003-11-18 01:34:35 +08:00
|
|
|
NISortDictionary(IspellDict * Conf)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
size_t i;
|
2004-08-29 13:07:03 +08:00
|
|
|
int naffix = 3;
|
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
/* compress affixes */
|
2006-02-10 02:04:20 +08:00
|
|
|
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL*), cmpspellaffix);
|
2003-11-18 01:34:35 +08:00
|
|
|
for (i = 1; i < Conf->nspell; i++)
|
2006-02-10 02:04:20 +08:00
|
|
|
if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
|
2003-11-18 01:34:35 +08:00
|
|
|
naffix++;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->AffixData = (char **) malloc(naffix * sizeof(char *));
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->AffixData);
|
2004-08-29 13:07:03 +08:00
|
|
|
memset(Conf->AffixData, 0, naffix * sizeof(char *));
|
|
|
|
naffix = 1;
|
|
|
|
Conf->AffixData[0] = strdup("");
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->AffixData[0]);
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->AffixData[1] = strdup(Conf->Spell[0]->p.flag);
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->AffixData[1]);
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Spell[0]->p.d.affix = 1;
|
|
|
|
Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = 1; i < Conf->nspell; i++)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
naffix++;
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->AffixData[naffix] = strdup(Conf->Spell[i]->p.flag);
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->AffixData[naffix]);
|
|
|
|
}
|
2006-02-10 02:04:20 +08:00
|
|
|
Conf->Spell[i]->p.d.affix = naffix;
|
|
|
|
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2006-02-10 02:04:20 +08:00
|
|
|
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL*), cmpspell);
|
2003-11-18 01:34:35 +08:00
|
|
|
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
for (i = 0; i < Conf->nspell; i++)
|
2006-02-10 02:04:20 +08:00
|
|
|
pfree(Conf->Spell[i]);
|
|
|
|
pfree(Conf->Spell);
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->Spell = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static AffixNode *
|
|
|
|
mkANode(IspellDict * Conf, int low, int high, int level, int type)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int nchar = 0;
|
|
|
|
uint8 lastchar = '\0';
|
|
|
|
AffixNode *rs;
|
2003-11-18 01:34:35 +08:00
|
|
|
AffixNodeData *data;
|
2004-08-29 13:07:03 +08:00
|
|
|
int lownew = low;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = low; i < high; i++)
|
|
|
|
if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
nchar++;
|
2004-08-29 13:07:03 +08:00
|
|
|
lastchar = GETCHAR(Conf->Affix + i, level, type);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
if (!nchar)
|
|
|
|
return NULL;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
rs = (AffixNode *) malloc(ANHRDSZ + nchar * sizeof(AffixNodeData));
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(rs);
|
2004-08-29 13:07:03 +08:00
|
|
|
memset(rs, 0, ANHRDSZ + nchar * sizeof(AffixNodeData));
|
2003-11-18 01:34:35 +08:00
|
|
|
rs->length = nchar;
|
2004-08-29 13:07:03 +08:00
|
|
|
data = rs->data;
|
|
|
|
|
|
|
|
lastchar = '\0';
|
|
|
|
for (i = low; i < high; i++)
|
|
|
|
if (Conf->Affix[i].replen > level)
|
|
|
|
{
|
|
|
|
if (lastchar != GETCHAR(Conf->Affix + i, level, type))
|
|
|
|
{
|
|
|
|
if (lastchar)
|
|
|
|
{
|
|
|
|
data->node = mkANode(Conf, lownew, i, level + 1, type);
|
|
|
|
lownew = i;
|
2003-11-18 01:34:35 +08:00
|
|
|
data++;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
lastchar = GETCHAR(Conf->Affix + i, level, type);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
data->val = GETCHAR(Conf->Affix + i, level, type);
|
|
|
|
if (Conf->Affix[i].replen == level + 1)
|
|
|
|
{ /* affix stopped */
|
|
|
|
if (!data->naff)
|
|
|
|
{
|
|
|
|
data->aff = (AFFIX **) malloc(sizeof(AFFIX *) * (high - i + 1));
|
2003-12-19 03:27:53 +08:00
|
|
|
MEMOUT(data->aff);
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
data->aff[data->naff] = Conf->Affix + i;
|
2003-11-18 01:34:35 +08:00
|
|
|
data->naff++;
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
data->node = mkANode(Conf, lownew, high, level + 1, type);
|
2003-11-18 01:34:35 +08:00
|
|
|
|
|
|
|
return rs;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2004-06-23 19:06:11 +08:00
|
|
|
static void
|
2004-08-29 13:07:03 +08:00
|
|
|
mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
|
|
|
|
{
|
|
|
|
int i,
|
|
|
|
cnt = 0;
|
|
|
|
int start = (issuffix) ? startsuffix : 0;
|
|
|
|
int end = (issuffix) ? Conf->naffixes : startsuffix;
|
|
|
|
AffixNode *Affix = (AffixNode *) malloc(ANHRDSZ + sizeof(AffixNodeData));
|
2004-06-23 19:06:11 +08:00
|
|
|
|
|
|
|
MEMOUT(Affix);
|
2004-08-29 13:07:03 +08:00
|
|
|
memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData));
|
|
|
|
Affix->length = 1;
|
|
|
|
Affix->isvoid = 1;
|
2004-06-23 19:06:11 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (issuffix)
|
|
|
|
{
|
|
|
|
Affix->data->node = Conf->Suffix;
|
|
|
|
Conf->Suffix = Affix;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Affix->data->node = Conf->Prefix;
|
|
|
|
Conf->Prefix = Affix;
|
|
|
|
}
|
2004-06-23 19:06:11 +08:00
|
|
|
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = start; i < end; i++)
|
|
|
|
if (Conf->Affix[i].replen == 0)
|
|
|
|
cnt++;
|
2004-06-23 19:06:11 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (cnt == 0)
|
|
|
|
return;
|
2004-06-23 19:06:11 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
Affix->data->aff = (AFFIX **) malloc(sizeof(AFFIX *) * cnt);
|
2004-06-23 19:06:11 +08:00
|
|
|
MEMOUT(Affix->data->aff);
|
2004-08-29 13:07:03 +08:00
|
|
|
Affix->data->naff = (uint32) cnt;
|
|
|
|
|
|
|
|
cnt = 0;
|
|
|
|
for (i = start; i < end; i++)
|
|
|
|
if (Conf->Affix[i].replen == 0)
|
|
|
|
{
|
|
|
|
Affix->data->aff[cnt] = Conf->Affix + i;
|
|
|
|
cnt++;
|
|
|
|
}
|
2004-06-23 19:06:11 +08:00
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
void
|
2003-11-18 01:34:35 +08:00
|
|
|
NISortAffixes(IspellDict * Conf)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
AFFIX *Affix;
|
|
|
|
size_t i;
|
2004-08-29 13:07:03 +08:00
|
|
|
CMPDAffix *ptr;
|
|
|
|
int firstsuffix = -1;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2006-02-06 23:45:34 +08:00
|
|
|
if (Conf->naffixes==0)
|
|
|
|
return;
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
if (Conf->naffixes > 1)
|
|
|
|
qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->CompoundAffix = ptr = (CMPDAffix *) malloc(sizeof(CMPDAffix) * Conf->naffixes);
|
2003-11-18 01:34:35 +08:00
|
|
|
MEMOUT(Conf->CompoundAffix);
|
2004-08-29 13:07:03 +08:00
|
|
|
ptr->affix = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = 0; i < Conf->naffixes; i++)
|
|
|
|
{
|
2003-08-04 08:43:34 +08:00
|
|
|
Affix = &(((AFFIX *) Conf->Affix)[i]);
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Affix->type == FF_SUFFIX)
|
|
|
|
{
|
|
|
|
if (firstsuffix < 0)
|
|
|
|
firstsuffix = i;
|
2006-02-21 01:51:05 +08:00
|
|
|
if ((Affix->flagflags & FF_COMPOUNDONLYAFX) && Affix->replen>0 )
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
if (ptr == Conf->CompoundAffix ||
|
2005-09-25 03:14:05 +08:00
|
|
|
strbncmp((const unsigned char *) (ptr - 1)->affix,
|
|
|
|
(const unsigned char *) Affix->repl,
|
|
|
|
(ptr - 1)->len))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* leave only unique and minimals suffixes */
|
2004-08-29 13:07:03 +08:00
|
|
|
ptr->affix = Affix->repl;
|
|
|
|
ptr->len = Affix->replen;
|
2003-11-18 01:34:35 +08:00
|
|
|
ptr++;
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
ptr->affix = NULL;
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->CompoundAffix = (CMPDAffix *) realloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
|
2004-06-23 19:29:58 +08:00
|
|
|
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
|
2004-08-29 13:07:03 +08:00
|
|
|
mkVoidAffix(Conf, 1, firstsuffix);
|
|
|
|
mkVoidAffix(Conf, 0, firstsuffix);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static AffixNodeData *
|
|
|
|
FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
|
|
|
|
{
|
|
|
|
AffixNodeData *StopLow,
|
|
|
|
*StopHigh,
|
|
|
|
*StopMiddle;
|
|
|
|
uint8 symbol;
|
|
|
|
|
|
|
|
if (node->isvoid)
|
|
|
|
{ /* search void affixes */
|
|
|
|
if (node->data->naff)
|
|
|
|
return node->data;
|
|
|
|
node = node->data->node;
|
|
|
|
}
|
2004-06-23 19:06:11 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
while (node && *level < wrdlen)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
StopLow = node->data;
|
2004-08-29 13:07:03 +08:00
|
|
|
StopHigh = node->data + node->length;
|
|
|
|
while (StopLow < StopHigh)
|
|
|
|
{
|
2004-06-23 19:06:11 +08:00
|
|
|
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
|
2004-08-29 13:07:03 +08:00
|
|
|
symbol = GETWCHAR(word, wrdlen, *level, type);
|
|
|
|
if (StopMiddle->val == symbol)
|
|
|
|
{
|
2004-06-23 19:06:11 +08:00
|
|
|
(*level)++;
|
2004-08-29 13:07:03 +08:00
|
|
|
if (StopMiddle->naff)
|
2003-11-18 01:34:35 +08:00
|
|
|
return StopMiddle;
|
2004-08-29 13:07:03 +08:00
|
|
|
node = StopMiddle->node;
|
2003-11-18 01:34:35 +08:00
|
|
|
break;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else if (StopMiddle->val < symbol)
|
2003-11-18 01:34:35 +08:00
|
|
|
StopLow = StopMiddle + 1;
|
2004-08-29 13:07:03 +08:00
|
|
|
else
|
2003-11-18 01:34:35 +08:00
|
|
|
StopHigh = StopMiddle;
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
if (StopLow >= StopHigh)
|
|
|
|
break;
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
return NULL;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
static char *
|
2006-02-10 02:04:20 +08:00
|
|
|
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword, int *baselen)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (flagflags & FF_COMPOUNDONLYAFX)
|
|
|
|
{
|
|
|
|
if ((Affix->flagflags & FF_COMPOUNDONLYAFX) == 0)
|
2003-11-18 01:34:35 +08:00
|
|
|
return NULL;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (Affix->flagflags & FF_COMPOUNDONLYAFX)
|
2003-11-18 01:34:35 +08:00
|
|
|
return NULL;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Affix->type == FF_SUFFIX)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
strcpy(newword, word);
|
|
|
|
strcpy(newword + len - Affix->replen, Affix->find);
|
2006-02-10 02:04:20 +08:00
|
|
|
if ( baselen ) /* store length of non-changed part of word */
|
|
|
|
*baselen = len - Affix->replen;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
/* if prefix is a all non-chaged part's length then all word contains only prefix and suffix,
|
|
|
|
so out */
|
|
|
|
if ( baselen && *baselen + strlen(Affix->find) <= Affix->replen )
|
|
|
|
return NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
strcpy(newword, Affix->find);
|
|
|
|
strcat(newword, word + Affix->replen);
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Affix->issimple)
|
|
|
|
return newword;
|
|
|
|
else if (Affix->isregis)
|
|
|
|
{
|
|
|
|
if (Affix->compile)
|
|
|
|
{
|
2005-12-21 21:05:49 +08:00
|
|
|
RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
|
2004-08-29 13:07:03 +08:00
|
|
|
Affix->compile = 0;
|
|
|
|
}
|
2005-12-21 21:05:49 +08:00
|
|
|
if (RS_execute(&(Affix->reg.regis), newword))
|
2004-08-29 13:07:03 +08:00
|
|
|
return newword;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2004-06-23 19:06:11 +08:00
|
|
|
int err;
|
|
|
|
pg_wchar *data;
|
|
|
|
size_t data_len;
|
2005-12-21 21:05:49 +08:00
|
|
|
int newword_len;
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2004-06-23 19:06:11 +08:00
|
|
|
if (Affix->compile)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-08-29 13:07:03 +08:00
|
|
|
int wmasklen,
|
|
|
|
masklen = strlen(Affix->mask);
|
|
|
|
pg_wchar *mask;
|
2004-06-23 19:06:11 +08:00
|
|
|
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
|
2004-08-29 13:07:03 +08:00
|
|
|
wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
|
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
|
2004-06-23 19:06:11 +08:00
|
|
|
pfree(mask);
|
|
|
|
if (err)
|
|
|
|
{
|
2005-10-15 10:49:52 +08:00
|
|
|
char regerrstr[ERRSTRSIZE];
|
|
|
|
|
2005-01-12 00:07:55 +08:00
|
|
|
pg_regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE);
|
2006-03-01 14:30:32 +08:00
|
|
|
elog(ERROR, "regex error in '%s': %s", Affix->mask, regerrstr);
|
2004-06-23 19:06:11 +08:00
|
|
|
}
|
|
|
|
Affix->compile = 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2004-05-31 21:29:43 +08:00
|
|
|
|
2004-06-23 19:06:11 +08:00
|
|
|
/* Convert data string to wide characters */
|
2005-12-21 21:05:49 +08:00
|
|
|
newword_len = strlen(newword);
|
|
|
|
data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
|
|
|
|
data_len = pg_mb2wchar_with_len(newword, data, newword_len);
|
2004-05-31 21:29:43 +08:00
|
|
|
|
2005-12-21 21:05:49 +08:00
|
|
|
if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
|
|
|
pfree(data);
|
|
|
|
return newword;
|
2004-06-23 19:06:11 +08:00
|
|
|
}
|
|
|
|
pfree(data);
|
2004-05-31 21:55:19 +08:00
|
|
|
}
|
2004-05-31 21:29:43 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
return NULL;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static char **
|
|
|
|
NormalizeSubWord(IspellDict * Conf, char *word, char flag)
|
|
|
|
{
|
|
|
|
AffixNodeData *suffix = NULL,
|
|
|
|
*prefix = NULL;
|
|
|
|
int slevel = 0,
|
|
|
|
plevel = 0;
|
|
|
|
int wrdlen = strlen(word),
|
|
|
|
swrdlen;
|
2003-11-18 01:34:35 +08:00
|
|
|
char **forms;
|
|
|
|
char **cur;
|
2003-08-04 08:43:34 +08:00
|
|
|
char newword[2 * MAXNORMLEN] = "";
|
2003-11-18 01:34:35 +08:00
|
|
|
char pnewword[2 * MAXNORMLEN] = "";
|
2004-08-29 13:07:03 +08:00
|
|
|
AffixNode *snode = Conf->Suffix,
|
|
|
|
*pnode;
|
|
|
|
int i,
|
|
|
|
j;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (wrdlen > MAXNORMLEN)
|
|
|
|
return NULL;
|
2005-12-12 19:10:12 +08:00
|
|
|
lowerstr(word);
|
2003-11-18 01:34:35 +08:00
|
|
|
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
|
|
|
|
*cur = NULL;
|
|
|
|
|
|
|
|
|
|
|
|
/* Check that the word itself is normal form */
|
2004-08-29 13:07:03 +08:00
|
|
|
if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD))
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
*cur = pstrdup(word);
|
|
|
|
cur++;
|
|
|
|
*cur = NULL;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
/* Find all other NORMAL forms of the 'word' (check only prefix) */
|
|
|
|
pnode = Conf->Prefix;
|
|
|
|
plevel = 0;
|
|
|
|
while (pnode)
|
|
|
|
{
|
|
|
|
prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
|
|
|
|
if (!prefix)
|
|
|
|
break;
|
|
|
|
for (j = 0; j < prefix->naff; j++)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* prefix success */
|
2004-08-29 13:07:03 +08:00
|
|
|
if (FindWord(Conf, newword, prefix->aff[j]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* word search success */
|
|
|
|
*cur = pstrdup(newword);
|
|
|
|
cur++;
|
2004-08-29 13:07:03 +08:00
|
|
|
*cur = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
pnode = prefix->node;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Find all other NORMAL forms of the 'word' (check suffix and then
|
|
|
|
* prefix)
|
|
|
|
*/
|
|
|
|
while (snode)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
int baselen=0;
|
|
|
|
|
2003-11-18 01:34:35 +08:00
|
|
|
/* find possible suffix */
|
2004-06-23 19:29:58 +08:00
|
|
|
suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
|
2004-08-29 13:07:03 +08:00
|
|
|
if (!suffix)
|
|
|
|
break;
|
2003-11-18 01:34:35 +08:00
|
|
|
/* foreach suffix check affix */
|
2004-08-29 13:07:03 +08:00
|
|
|
for (i = 0; i < suffix->naff; i++)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* suffix success */
|
2004-08-29 13:07:03 +08:00
|
|
|
if (FindWord(Conf, newword, suffix->aff[i]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* word search success */
|
|
|
|
*cur = pstrdup(newword);
|
|
|
|
cur++;
|
2004-08-29 13:07:03 +08:00
|
|
|
*cur = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
/* now we will look changed word with prefixes */
|
2004-08-29 13:07:03 +08:00
|
|
|
pnode = Conf->Prefix;
|
|
|
|
plevel = 0;
|
|
|
|
swrdlen = strlen(newword);
|
|
|
|
while (pnode)
|
|
|
|
{
|
|
|
|
prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
|
|
|
|
if (!prefix)
|
|
|
|
break;
|
|
|
|
for (j = 0; j < prefix->naff; j++)
|
|
|
|
{
|
2006-02-10 02:04:20 +08:00
|
|
|
if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* prefix success */
|
2004-08-29 13:07:03 +08:00
|
|
|
int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
|
|
|
|
0 : prefix->aff[j]->flag;
|
|
|
|
|
|
|
|
if (FindWord(Conf, pnewword, ff, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* word search success */
|
|
|
|
*cur = pstrdup(pnewword);
|
|
|
|
cur++;
|
2004-08-29 13:07:03 +08:00
|
|
|
*cur = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pnode = prefix->node;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
snode = suffix->node;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (cur == forms)
|
|
|
|
{
|
2003-11-28 20:09:02 +08:00
|
|
|
pfree(forms);
|
2003-08-04 08:43:34 +08:00
|
|
|
return (NULL);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
return (forms);
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
/*
 * One candidate split of a compound word into stems.  Candidates are kept
 * in a singly linked list.
 */
typedef struct SplitVar
{
	int			nstem;			/* number of used entries in stem[] */
	char	  **stem;			/* stems collected so far; CopyVar allocates
								 * room for MAX_NORM pointers */
	struct SplitVar *next;		/* next candidate, or NULL */
} SplitVar;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static int
|
2006-02-21 01:51:05 +08:00
|
|
|
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len, bool CheckInPlace)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
if ( CheckInPlace ) {
|
|
|
|
while ((*ptr)->affix)
|
|
|
|
{
|
|
|
|
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
|
|
|
|
{
|
|
|
|
len = (*ptr)->len;
|
|
|
|
(*ptr)++;
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
(*ptr)++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
char *affbegin;
|
|
|
|
while ((*ptr)->affix)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
|
|
|
|
{
|
|
|
|
len = (*ptr)->len + (affbegin-word);
|
|
|
|
(*ptr)++;
|
|
|
|
return len;
|
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
(*ptr)++;
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
/*
 * CopyVar
 *		Allocate a new SplitVar, optionally initialised from 's'.
 *
 * s	   - variant to copy, or NULL for an empty variant
 * makedup - non-zero: every stem string is duplicated with pstrdup;
 *			 zero: the new variant shares the stem pointers of 's'
 *
 * The stem array always has room for MAX_NORM pointers and 'next' is NULL.
 */
static SplitVar *
CopyVar(SplitVar * s, int makedup)
{
	SplitVar   *copy = (SplitVar *) palloc(sizeof(SplitVar));
	int			k;

	copy->stem = (char **) palloc(sizeof(char *) * (MAX_NORM));
	copy->next = NULL;

	if (!s)
	{
		copy->nstem = 0;
		return copy;
	}

	copy->nstem = s->nstem;
	for (k = 0; k < s->nstem; k++)
	{
		if (makedup)
			copy->stem[k] = pstrdup(s->stem[k]);
		else
			copy->stem[k] = s->stem[k];
	}

	return copy;
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static SplitVar *
|
|
|
|
SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos)
|
|
|
|
{
|
|
|
|
SplitVar *var = NULL;
|
|
|
|
SPNodeData *StopLow,
|
|
|
|
*StopHigh,
|
|
|
|
*StopMiddle = NULL;
|
|
|
|
SPNode *node = (snode) ? snode : Conf->Dictionary;
|
|
|
|
int level = (snode) ? minpos : startpos; /* recursive
|
|
|
|
* minpos==level */
|
|
|
|
int lenaff;
|
|
|
|
CMPDAffix *caff;
|
|
|
|
char *notprobed;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-04-02 07:44:38 +08:00
|
|
|
notprobed = (char *) palloc(wordlen);
|
2004-08-29 13:07:03 +08:00
|
|
|
memset(notprobed, 1, wordlen);
|
|
|
|
var = CopyVar(orig, 1);
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2006-02-21 01:51:05 +08:00
|
|
|
while (level < wordlen)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
/* find word with epenthetic or/and compound suffix */
|
2003-11-18 01:34:35 +08:00
|
|
|
caff = Conf->CompoundAffix;
|
2006-02-21 01:51:05 +08:00
|
|
|
while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) > 0)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
|
|
|
/*
|
2005-10-15 10:49:52 +08:00
|
|
|
* there is one of compound suffixes, so check word for existings
|
2004-08-29 13:07:03 +08:00
|
|
|
*/
|
|
|
|
char buf[MAXNORMLEN];
|
|
|
|
char **subres;
|
|
|
|
|
|
|
|
lenaff = level - startpos + lenaff;
|
|
|
|
|
|
|
|
if (!notprobed[startpos + lenaff - 1])
|
2003-11-18 01:34:35 +08:00
|
|
|
continue;
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
if (level + lenaff - 1 <= minpos)
|
2003-11-18 01:34:35 +08:00
|
|
|
continue;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
memcpy(buf, word + startpos, lenaff);
|
|
|
|
buf[lenaff] = '\0';
|
2003-11-18 01:34:35 +08:00
|
|
|
|
|
|
|
subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX);
|
2004-08-29 13:07:03 +08:00
|
|
|
if (subres)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
/* Yes, it was a word from dictionary */
|
2004-08-29 13:07:03 +08:00
|
|
|
SplitVar *new = CopyVar(var, 0);
|
|
|
|
SplitVar *ptr = var;
|
|
|
|
char **sptr = subres;
|
|
|
|
|
|
|
|
notprobed[startpos + lenaff - 1] = 0;
|
|
|
|
|
|
|
|
while (*sptr)
|
|
|
|
{
|
|
|
|
new->stem[new->nstem] = *sptr;
|
2003-11-18 01:34:35 +08:00
|
|
|
new->nstem++;
|
|
|
|
sptr++;
|
|
|
|
}
|
2003-11-28 20:09:02 +08:00
|
|
|
pfree(subres);
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
while (ptr->next)
|
2003-11-18 01:34:35 +08:00
|
|
|
ptr = ptr->next;
|
2004-08-29 13:07:03 +08:00
|
|
|
ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
|
|
|
|
|
2003-11-28 20:09:02 +08:00
|
|
|
pfree(new->stem);
|
|
|
|
pfree(new);
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2006-02-21 01:51:05 +08:00
|
|
|
if ( !node )
|
|
|
|
break;
|
|
|
|
|
|
|
|
StopLow = node->data;
|
|
|
|
StopHigh = node->data + node->length;
|
|
|
|
while (StopLow < StopHigh)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
|
|
|
|
if (StopMiddle->val == ((uint8 *) (word))[level])
|
|
|
|
break;
|
|
|
|
else if (StopMiddle->val < ((uint8 *) (word))[level])
|
|
|
|
StopLow = StopMiddle + 1;
|
|
|
|
else
|
|
|
|
StopHigh = StopMiddle;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (StopLow < StopHigh) {
|
|
|
|
|
|
|
|
/* find infinitive */
|
|
|
|
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
/* ok, we found full compoundallowed word */
|
|
|
|
if (level > minpos)
|
2004-08-29 13:07:03 +08:00
|
|
|
{
|
2006-02-21 01:51:05 +08:00
|
|
|
/* and its length more than minimal */
|
|
|
|
if (wordlen == level + 1)
|
|
|
|
{
|
|
|
|
/* well, it was last word */
|
|
|
|
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
|
|
|
|
var->nstem++;
|
|
|
|
pfree(notprobed);
|
|
|
|
return var;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* then we will search more big word at the same point */
|
|
|
|
SplitVar *ptr = var;
|
|
|
|
|
|
|
|
while (ptr->next)
|
|
|
|
ptr = ptr->next;
|
|
|
|
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
|
|
|
|
/* we can find next word */
|
|
|
|
level++;
|
|
|
|
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
|
|
|
|
var->nstem++;
|
|
|
|
node = Conf->Dictionary;
|
|
|
|
startpos = level;
|
|
|
|
continue;
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
}
|
2006-02-21 01:51:05 +08:00
|
|
|
node = StopMiddle->node;
|
|
|
|
} else
|
|
|
|
node = NULL;
|
2003-11-18 01:34:35 +08:00
|
|
|
level++;
|
|
|
|
}
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
|
2003-11-18 01:34:35 +08:00
|
|
|
var->nstem++;
|
2004-04-02 07:44:38 +08:00
|
|
|
pfree(notprobed);
|
2003-11-18 01:34:35 +08:00
|
|
|
return var;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
|
2005-01-25 23:24:38 +08:00
|
|
|
TSLexeme *
|
2004-08-29 13:07:03 +08:00
|
|
|
NINormalizeWord(IspellDict * Conf, char *word)
|
|
|
|
{
|
|
|
|
char **res = NormalizeSubWord(Conf, word, 0);
|
2005-10-15 10:49:52 +08:00
|
|
|
TSLexeme *lcur = NULL,
|
|
|
|
*lres = NULL;
|
|
|
|
uint16 NVariant = 1;
|
|
|
|
|
|
|
|
if (res)
|
|
|
|
{
|
|
|
|
char **ptr = res;
|
|
|
|
|
|
|
|
lcur = lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
|
|
|
|
while (*ptr)
|
|
|
|
{
|
|
|
|
lcur->lexeme = *ptr;
|
|
|
|
lcur->flags = 0;
|
2005-01-25 23:24:38 +08:00
|
|
|
lcur->nvariant = NVariant++;
|
|
|
|
lcur++;
|
|
|
|
ptr++;
|
|
|
|
}
|
2005-10-15 10:49:52 +08:00
|
|
|
lcur->lexeme = NULL;
|
2005-01-25 23:24:38 +08:00
|
|
|
pfree(res);
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
if (Conf->compoundcontrol != '\t')
|
|
|
|
{
|
|
|
|
int wordlen = strlen(word);
|
|
|
|
SplitVar *ptr,
|
|
|
|
*var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
while (var)
|
|
|
|
{
|
|
|
|
if (var->nstem > 1)
|
|
|
|
{
|
|
|
|
char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDWORD);
|
|
|
|
|
|
|
|
if (subres)
|
|
|
|
{
|
2005-01-25 23:24:38 +08:00
|
|
|
char **subptr = subres;
|
|
|
|
|
2005-10-15 10:49:52 +08:00
|
|
|
if (!lcur)
|
|
|
|
lcur = lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
|
|
|
|
|
|
|
|
while (*subptr)
|
|
|
|
{
|
|
|
|
for (i = 0; i < var->nstem - 1; i++)
|
|
|
|
{
|
|
|
|
lcur->lexeme = (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]);
|
|
|
|
lcur->flags = 0;
|
2005-01-25 23:24:38 +08:00
|
|
|
lcur->nvariant = NVariant;
|
|
|
|
lcur++;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2005-10-15 10:49:52 +08:00
|
|
|
lcur->lexeme = *subptr;
|
|
|
|
lcur->flags = 0;
|
2005-01-25 23:24:38 +08:00
|
|
|
lcur->nvariant = NVariant;
|
|
|
|
lcur++;
|
|
|
|
subptr++;
|
|
|
|
NVariant++;
|
2005-10-15 10:49:52 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2005-10-15 10:49:52 +08:00
|
|
|
lcur->lexeme = NULL;
|
2003-11-28 20:09:02 +08:00
|
|
|
pfree(subres);
|
2004-08-29 13:07:03 +08:00
|
|
|
var->stem[0] = NULL;
|
2005-10-15 10:49:52 +08:00
|
|
|
pfree(var->stem[var->nstem - 1]);
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
for (i = 0; i < var->nstem && var->stem[i]; i++)
|
|
|
|
pfree(var->stem[i]);
|
2003-11-18 01:34:35 +08:00
|
|
|
ptr = var->next;
|
2003-11-28 20:09:02 +08:00
|
|
|
pfree(var->stem);
|
2004-08-29 13:07:03 +08:00
|
|
|
pfree(var);
|
|
|
|
var = ptr;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
}
|
2005-01-25 23:24:38 +08:00
|
|
|
return lres;
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
static void
|
|
|
|
freeSPNode(SPNode * node)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
SPNodeData *data;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (!node)
|
|
|
|
return;
|
|
|
|
data = node->data;
|
|
|
|
while (node->length)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
freeSPNode(data->node);
|
|
|
|
data++;
|
|
|
|
node->length--;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2003-11-18 01:34:35 +08:00
|
|
|
free(node);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
static void
|
|
|
|
freeANode(AffixNode * node)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
AffixNodeData *data;
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (!node)
|
|
|
|
return;
|
|
|
|
data = node->data;
|
|
|
|
while (node->length)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
freeANode(data->node);
|
|
|
|
if (data->naff)
|
2004-08-29 13:07:03 +08:00
|
|
|
free(data->aff);
|
2003-11-18 01:34:35 +08:00
|
|
|
data++;
|
|
|
|
node->length--;
|
|
|
|
}
|
|
|
|
free(node);
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
void
|
2003-11-18 01:34:35 +08:00
|
|
|
NIFree(IspellDict * Conf)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
AFFIX *Affix = (AFFIX *) Conf->Affix;
|
2004-08-29 13:07:03 +08:00
|
|
|
char **aff = Conf->AffixData;
|
2003-11-18 01:34:35 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (aff)
|
|
|
|
{
|
|
|
|
while (*aff)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
free(*aff);
|
|
|
|
aff++;
|
|
|
|
}
|
|
|
|
free(Conf->AffixData);
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
for (i = 0; i < Conf->naffixes; i++)
|
|
|
|
{
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Affix[i].compile == 0)
|
|
|
|
{
|
|
|
|
if (Affix[i].isregis)
|
|
|
|
RS_free(&(Affix[i].reg.regis));
|
|
|
|
else
|
2004-06-23 19:06:11 +08:00
|
|
|
pg_regfree(&(Affix[i].reg.regex));
|
|
|
|
}
|
2006-02-10 02:04:20 +08:00
|
|
|
if ( Affix[i].mask != VoidString ) free(Affix[i].mask);
|
|
|
|
if ( Affix[i].find != VoidString ) free(Affix[i].find);
|
|
|
|
if ( Affix[i].repl != VoidString ) free(Affix[i].repl);
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Conf->Spell)
|
|
|
|
{
|
2003-11-18 01:34:35 +08:00
|
|
|
for (i = 0; i < Conf->nspell; i++)
|
2006-02-10 02:04:20 +08:00
|
|
|
pfree(Conf->Spell[i]->word);
|
|
|
|
pfree(Conf->Spell);
|
2003-11-18 01:34:35 +08:00
|
|
|
}
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (Conf->Affix)
|
|
|
|
free(Conf->Affix);
|
|
|
|
if (Conf->CompoundAffix)
|
|
|
|
free(Conf->CompoundAffix);
|
2003-11-18 01:34:35 +08:00
|
|
|
freeSPNode(Conf->Dictionary);
|
|
|
|
freeANode(Conf->Suffix);
|
|
|
|
freeANode(Conf->Prefix);
|
2003-08-04 08:43:34 +08:00
|
|
|
memset((void *) Conf, 0, sizeof(IspellDict));
|
|
|
|
return;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|