/* contrib/tsearch2/ispell/spell.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "postgres.h"
#include "spell.h"
/* Capacity (in entries) of the arrays that hold normal forms of a word. */
#define MAX_NORM 1024
/* Words longer than this many bytes are not normalized. */
#define MAXNORMLEN 256

/* Case-insensitive check that x begins with the string y. */
#define STRNCASECMP(x,y) pg_strncasecmp(x, y, strlen(y))

/*
 * Fetch byte N of W (length L): counted from the start when T is FF_PREFIX,
 * counted from the end for any other affix type.
 */
#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
/* Same as GETWCHAR, applied to the replacement string of affix A. */
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )

/*
 * Raise an out-of-memory error when X is a null/zero value.
 * Wrapped in do/while(0) so it behaves as a single statement, e.g. when it
 * is used as the body of an if/else.
 */
#define MEMOUT(X) \
    do { \
        if ( !(X) ) \
            ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); \
    } while (0)
static int
cmpspell(const void *s1, const void *s2)
{
return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word));
2003-07-21 18:27:44 +08:00
}
static int
cmpspellaffix(const void *s1, const void *s2)
{
return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
}
/*
 * Lower-case a NUL-terminated string in place, byte by byte, using
 * tolower() from the current C locale.
 */
static void
strlower(char *str)
{
    unsigned char *p;

    /* walk as unsigned char so tolower() never sees a negative value */
    for (p = (unsigned char *) str; *p != '\0'; p++)
        *p = (unsigned char) tolower(*p);
}
/*
 * Return a palloc'd, NUL-terminated copy of the first len bytes of s.
 */
static char *
strnduplicate(char *s, int len)
{
    char       *copy = (char *) palloc(len + 1);

    copy[len] = '\0';
    memcpy(copy, s, len);
    return copy;
}

/* backward string compare for suffix tree operations */
/*
 * Compare two NUL-terminated strings starting from their last bytes and
 * moving toward the front.
 *
 * Returns -1, 0 or 1.  If one string runs out first while all compared
 * bytes were equal, the shorter string sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
    /* index of the last byte of each string; -1 for an empty string */
    int         l1 = (int) strlen((const char *) s1) - 1;
    int         l2 = (int) strlen((const char *) s2) - 1;

    while (l1 >= 0 && l2 >= 0)
    {
        if (s1[l1] != s2[l2])
            return (s1[l1] < s2[l2]) ? -1 : 1;
        l1--;
        l2--;
    }
    if (l1 < l2)
        return -1;
    if (l1 > l2)
        return 1;
    return 0;
}
/*
 * Like strbcmp(), but compares at most count bytes, counted from the end
 * of both strings.
 *
 * Returns 0 as soon as count trailing bytes matched; otherwise -1/1 on the
 * first differing byte, or by remaining length when one string runs out.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
    int         l1 = (int) strlen((const char *) s1) - 1;
    int         l2 = (int) strlen((const char *) s2) - 1;
    int         left = (int) count;     /* bytes still to be compared */

    while (l1 >= 0 && l2 >= 0 && left > 0)
    {
        if (s1[l1] != s2[l2])
            return (s1[l1] < s2[l2]) ? -1 : 1;
        l1--;
        l2--;
        left--;
    }
    if (left == 0)
        return 0;
    if (l1 < l2)
        return -1;
    if (l1 > l2)
        return 1;
    return 0;
}
static int
cmpaffix(const void *s1, const void *s2)
{
if (((const AFFIX *) s1)->type < ((const AFFIX *) s2)->type)
return -1;
if (((const AFFIX *) s1)->type > ((const AFFIX *) s2)->type)
return 1;
2004-06-23 19:29:58 +08:00
if (((const AFFIX *) s1)->type == FF_PREFIX)
2003-08-04 08:43:34 +08:00
return (strcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
else
return (strbcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
2003-07-21 18:27:44 +08:00
}
/*
 * Append one word with its flag string to Conf->Spell, growing the array
 * in steps of 20K entries when it is full.
 *
 * The word is copied with strdup(); the flag is copied into the fixed
 * 16-byte flag field and is truncated to 15 bytes if longer.
 * Raises an out-of-memory error via MEMOUT on allocation failure.
 * Always returns 0.
 */
int
NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
{
    SPELL      *entry;

    if (Conf->nspell >= Conf->mspell)
    {
        int         newsize = Conf->mspell + 1024 * 20;
        SPELL      *grown;

        if (Conf->mspell)
            grown = (SPELL *) realloc(Conf->Spell, newsize * sizeof(SPELL));
        else
            grown = (SPELL *) malloc(newsize * sizeof(SPELL));

        /*
         * Commit the new array and capacity only after the allocation
         * succeeded, so Conf stays consistent if MEMOUT raises an error.
         */
        MEMOUT(grown);
        Conf->Spell = grown;
        Conf->mspell = newsize;
    }

    entry = &Conf->Spell[Conf->nspell];
    entry->word = strdup(word);
    MEMOUT(entry->word);

    /* strncpy() does not terminate when flag has 16 or more bytes */
    strncpy(entry->p.flag, flag, 16);
    entry->p.flag[15] = '\0';

    Conf->nspell++;
    return (0);
}
/*
 * Load a dictionary file: one entry per line in the form "word" or
 * "word/flags".
 *
 * The flag string ends at the first non-printable or whitespace byte.  The
 * word is lower-cased and cut at the first CR or LF, then stored with
 * NIAddSpell().
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
NIImportDictionary(IspellDict * Conf, const char *filename)
{
    char        str[BUFSIZ];
    FILE       *dict;

    if (!(dict = fopen(filename, "r")))
        return (1);

    while (fgets(str, sizeof(str), dict))
    {
        char       *s;
        const char *flag = "";

        if ((s = strchr(str, '/')))
        {
            /* split the line at '/': word before it, flags after it */
            *s++ = '\0';
            flag = s;
            while (*s)
            {
                if (isprint((unsigned char) *s) && !isspace((unsigned char) *s))
                    s++;
                else
                {
                    *s = '\0';
                    break;
                }
            }
        }

        strlower(str);

        /* drop the line terminator (only present when there was no '/') */
        str[strcspn(str, "\r\n")] = '\0';

        NIAddSpell(Conf, str, flag);
    }
    fclose(dict);
    return (0);
}
/*
 * Look up word in the dictionary tree rooted at Conf->Dictionary.
 *
 * Each tree level is a sorted array searched by binary search on one byte
 * of the word.  Returns 1 when the full word is present and
 *   - compoundonly is zero or the entry allows compounding, and
 *   - affixflag is 0 or occurs in the entry's affix flag string;
 * returns 0 otherwise.
 */
static int
FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
{
    SPNode     *node = Conf->Dictionary;
    SPNodeData *StopLow,
               *StopHigh,
               *StopMiddle;
    uint8      *ptr = (uint8 *) word;

    while (node && *ptr)
    {
        StopLow = node->data;
        StopHigh = node->data + node->length;
        while (StopLow < StopHigh)
        {
            StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
            if (StopMiddle->val == *ptr)
            {
                /* last byte of the word: decide here if it is an entry */
                if (*(ptr + 1) == '\0' && StopMiddle->isword)
                {
                    if (compoundonly && !StopMiddle->compoundallow)
                        return 0;
                    if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
                        return 1;
                }
                /* descend one level and move on to the next byte */
                node = StopMiddle->node;
                ptr++;
                break;
            }
            else if (StopMiddle->val < *ptr)
                StopLow = StopMiddle + 1;
            else
                StopHigh = StopMiddle;
        }
        /* binary search exhausted without a match at this level */
        if (StopLow >= StopHigh)
            break;
    }
    return 0;
}
/*
 * Append one affix rule to Conf->Affix, growing the array by 16 entries
 * when it is full.
 *
 * mask "." marks a simple rule (no condition).  A mask accepted by
 * RS_isRegis() is stored as is.  Any other mask is stored as a regular
 * expression anchored with '$' (suffix) or '^' (prefix).  The mask is
 * compiled lazily; compile = 1 marks it as not yet compiled.
 *
 * Raises an out-of-memory error via MEMOUT on allocation failure.
 * Always returns 0.
 */
int
NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
{
    AFFIX      *aff;

    if (Conf->naffixes >= Conf->maffixes)
    {
        int         newsize = Conf->maffixes + 16;
        AFFIX      *grown;

        if (Conf->maffixes)
            grown = (AFFIX *) realloc((void *) Conf->Affix, newsize * sizeof(AFFIX));
        else
            grown = (AFFIX *) malloc(newsize * sizeof(AFFIX));

        /* commit only after success so Conf stays consistent on error */
        MEMOUT(grown);
        Conf->Affix = grown;
        Conf->maffixes = newsize;
    }

    aff = ((AFFIX *) Conf->Affix) + Conf->naffixes;

    /*
     * NOTE(review): the copies below into mask/find/repl are unbounded; the
     * field sizes are declared in spell.h and are not checked here. Confirm
     * that callers cannot pass longer strings.
     */
    if (strcmp(mask, ".") == 0)
    {
        aff->issimple = 1;
        aff->isregis = 0;
        *(aff->mask) = '\0';
    }
    else if (RS_isRegis(mask))
    {
        aff->issimple = 0;
        aff->isregis = 1;
        strcpy(aff->mask, mask);
    }
    else
    {
        aff->issimple = 0;
        aff->isregis = 0;
        if (type == FF_SUFFIX)
            sprintf(aff->mask, "%s$", mask);
        else
            sprintf(aff->mask, "^%s", mask);
    }

    aff->compile = 1;
    aff->flagflags = flagflags;
    aff->flag = flag;
    aff->type = type;
    strcpy(aff->find, find);
    strcpy(aff->repl, repl);
    aff->replen = strlen(repl);

    Conf->naffixes++;
    return (0);
}
/*
 * Copy src to dist, dropping every space, tab and '-' character.
 * dist must have room for strlen(src) + 1 bytes.  Returns dist.
 */
static char *
remove_spaces(char *dist, char *src)
{
    char       *out = dist;
    const char *in;

    for (in = src; *in != '\0'; in++)
    {
        if (*in == ' ' || *in == '-' || *in == '\t')
            continue;
        *out++ = *in;
    }
    *out = '\0';
    return (dist);
}
/*
 * Load an affix file.
 *
 * Recognized lines (keywords are matched case-insensitively):
 *   "compoundwords ... X"  sets the compound control character to X
 *   "suffixes" / "prefixes" select the section for the following rules
 *   "flag [*|~][\]C"       sets the flag character C for following rules;
 *                          '*' adds FF_CROSSPRODUCT, '~' FF_COMPOUNDONLYAFX
 *   "mask > find , repl"   an affix rule, registered with NIAddAffix()
 * Text after '#' on a rule line is ignored.  Rule lines are lower-cased and
 * stripped of spaces, tabs and '-'.
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
NIImportAffixes(IspellDict * Conf, const char *filename)
{
    char        str[BUFSIZ];
    char        mask[BUFSIZ] = "";
    char        find[BUFSIZ] = "";
    char        repl[BUFSIZ] = "";
    unsigned char flag = 0;
    unsigned char flagflags = 0;
    char       *s;
    int         i;
    int         suffixes = 0;
    int         prefixes = 0;
    FILE       *affix;

    if (!(affix = fopen(filename, "r")))
        return (1);

    /* '\t' means "no compound control character configured" */
    Conf->compoundcontrol = '\t';

    while (fgets(str, sizeof(str), affix))
    {
        if (STRNCASECMP(str, "compoundwords") == 0)
        {
            s = strchr(str, 'l');
            if (s)
            {
                /* skip the rest of the keyword, then the blanks after it */
                while (*s && *s != ' ')
                    s++;
                while (*s == ' ')
                    s++;
                /* nothing follows the keyword: keep the current setting */
                if (*s)
                    Conf->compoundcontrol = (unsigned char) *s;
                continue;
            }
        }
        if (STRNCASECMP(str, "suffixes") == 0)
        {
            suffixes = 1;
            prefixes = 0;
            continue;
        }
        if (STRNCASECMP(str, "prefixes") == 0)
        {
            suffixes = 0;
            prefixes = 1;
            continue;
        }
        if (STRNCASECMP(str, "flag ") == 0)
        {
            s = str + 5;
            flagflags = 0;
            while (*s == ' ')
                s++;
            if (*s == '*')
            {
                flagflags |= FF_CROSSPRODUCT;
                s++;
            }
            else if (*s == '~')
            {
                flagflags |= FF_COMPOUNDONLYAFX;
                s++;
            }
            /* a backslash escapes the flag character itself */
            if (*s == '\\')
                s++;
            flag = (unsigned char) *s;
            continue;
        }

        /* rules are only meaningful inside a suffixes/prefixes section */
        if ((!suffixes) && (!prefixes))
            continue;
        if ((s = strchr(str, '#')))
            *s = 0;
        if (!*str)
            continue;

        strlower(str);
        strcpy(mask, "");
        strcpy(find, "");
        strcpy(repl, "");
        i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);

        /* str is no longer needed: reuse it as scratch space */
        remove_spaces(str, repl);
        strcpy(repl, str);
        remove_spaces(str, find);
        strcpy(find, str);
        remove_spaces(str, mask);
        strcpy(mask, str);

        switch (i)
        {
            case 3:
                break;
            case 2:
                /* "mask > repl": the single field is the replacement */
                if (*find != '\0')
                {
                    strcpy(repl, find);
                    strcpy(find, "");
                }
                break;
            default:
                continue;
        }

        NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
    }
    fclose(affix);
    return (0);
}
/*
 * Append a new entry "<AffixData[a1]> <AffixData[a2]>" to the
 * NULL-terminated Conf->AffixData array and return its index.
 *
 * Raises an out-of-memory error via MEMOUT on allocation failure.
 */
static int
MergeAffix(IspellDict * Conf, int a1, int a2)
{
    int         naffix = 0;
    char      **grown;
    char       *merged;

    /* count existing entries up to the NULL terminator */
    while (Conf->AffixData[naffix])
        naffix++;

    /* room for the new entry plus the terminator */
    grown = (char **) realloc(Conf->AffixData, (naffix + 2) * sizeof(char *));
    MEMOUT(grown);
    Conf->AffixData = grown;

    merged = malloc(strlen(grown[a1]) + strlen(grown[a2]) + 1 /* space */ + 1 /* \0 */ );
    /* check the new string itself, not the slot it will be stored in */
    MEMOUT(merged);
    sprintf(merged, "%s %s", grown[a1], grown[a2]);

    grown[naffix] = merged;
    grown[naffix + 1] = NULL;
    return naffix;
}
/*
 * Recursively build one level of the dictionary tree from the entries
 * Conf->Spell[low .. high-1], keyed on byte number 'level' of each word.
 *
 * The entries must already be sorted by word so that equal bytes at this
 * level are adjacent.  Returns NULL when no entry is longer than 'level'
 * bytes.  When the same word occurs with different affix sets, the sets
 * are combined with MergeAffix().
 */
static SPNode *
mkSPNode(IspellDict * Conf, int low, int high, int level)
{
    int         i;
    int         nchar = 0;
    char        lastchar = '\0';
    SPNode     *rs;
    SPNodeData *data;
    int         lownew = low;

    /* first pass: count the distinct bytes at this level */
    for (i = low; i < high; i++)
        if (Conf->Spell[i].p.d.len > level && lastchar != Conf->Spell[i].word[level])
        {
            nchar++;
            lastchar = Conf->Spell[i].word[level];
        }

    if (!nchar)
        return NULL;

    rs = (SPNode *) malloc(SPNHRDSZ + nchar * sizeof(SPNodeData));
    MEMOUT(rs);
    memset(rs, 0, SPNHRDSZ + nchar * sizeof(SPNodeData));
    rs->length = nchar;
    data = rs->data;

    /* second pass: fill one SPNodeData per distinct byte */
    lastchar = '\0';
    for (i = low; i < high; i++)
        if (Conf->Spell[i].p.d.len > level)
        {
            if (lastchar != Conf->Spell[i].word[level])
            {
                if (lastchar)
                {
                    /* close the previous group and build its subtree */
                    data->node = mkSPNode(Conf, lownew, i, level + 1);
                    lownew = i;
                    data++;
                }
                lastchar = Conf->Spell[i].word[level];
            }
            data->val = ((uint8 *) (Conf->Spell[i].word))[level];
            if (Conf->Spell[i].p.d.len == level + 1)
            {
                /* this entry's word ends exactly here */
                if (data->isword && data->affix != Conf->Spell[i].p.d.affix)
                {
                    /* same word seen before with other flags: merge them */
                    data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix);
                }
                else
                    data->affix = Conf->Spell[i].p.d.affix;
                data->isword = 1;
                if (strchr(Conf->AffixData[data->affix], Conf->compoundcontrol))
                    data->compoundallow = 1;
            }
        }

    /* subtree for the last group */
    data->node = mkSPNode(Conf, lownew, high, level + 1);

    return rs;
}
/*
 * Turn the flat Conf->Spell list into the search structures:
 *   1. sort by flag string and store each distinct flag string once in
 *      Conf->AffixData (index 0 is always the empty string);
 *   2. replace each entry's flag string by the index of that string and
 *      the length of its word;
 *   3. sort by word and build the tree in Conf->Dictionary;
 *   4. free Conf->Spell.
 *
 * An empty word list yields an AffixData array holding only "" and a NULL
 * Dictionary.  Raises an out-of-memory error via MEMOUT on allocation
 * failure.
 */
void
NISortDictionary(IspellDict * Conf)
{
    size_t      i;
    int         naffix = 3;

    if (Conf->nspell == 0)
    {
        /* nothing loaded: Conf->Spell[0] below would not exist */
        Conf->AffixData = (char **) malloc(2 * sizeof(char *));
        MEMOUT(Conf->AffixData);
        memset(Conf->AffixData, 0, 2 * sizeof(char *));
        Conf->AffixData[0] = strdup("");
        MEMOUT(Conf->AffixData[0]);
        Conf->Dictionary = NULL;
        free(Conf->Spell);
        Conf->Spell = NULL;
        return;
    }

    /* compress affixes */
    qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix);
    for (i = 1; i < Conf->nspell; i++)
        if (strcmp(Conf->Spell[i].p.flag, Conf->Spell[i - 1].p.flag))
            naffix++;

    /* distinct flag strings + "" + NULL terminator */
    Conf->AffixData = (char **) malloc(naffix * sizeof(char *));
    MEMOUT(Conf->AffixData);
    memset(Conf->AffixData, 0, naffix * sizeof(char *));

    naffix = 1;
    Conf->AffixData[0] = strdup("");
    MEMOUT(Conf->AffixData[0]);
    Conf->AffixData[1] = strdup(Conf->Spell[0].p.flag);
    MEMOUT(Conf->AffixData[1]);

    /*
     * p.flag is read (compared/copied) before p.d is written for the same
     * entry; keep that order.  NOTE(review): p.d presumably shares storage
     * with p.flag -- confirm in spell.h.
     */
    Conf->Spell[0].p.d.affix = 1;
    Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word);
    for (i = 1; i < Conf->nspell; i++)
    {
        if (strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix]))
        {
            naffix++;
            Conf->AffixData[naffix] = strdup(Conf->Spell[i].p.flag);
            MEMOUT(Conf->AffixData[naffix]);
        }
        Conf->Spell[i].p.d.affix = naffix;
        Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word);
    }

    qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);
    Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

    for (i = 0; i < Conf->nspell; i++)
        free(Conf->Spell[i].word);
    free(Conf->Spell);
    Conf->Spell = NULL;
}
/*
 * Recursively build one level of an affix tree from Conf->Affix[low ..
 * high-1], keyed on character number 'level' of each affix's repl string
 * (GETCHAR reads it from the front for prefixes, from the back otherwise).
 *
 * Returns NULL when no affix in the range is longer than 'level'.  Affixes
 * whose repl ends at this level are collected in the node's aff[] array.
 */
static AffixNode *
mkANode(IspellDict * Conf, int low, int high, int level, int type)
{
    int         i;
    int         nchar = 0;
    uint8       lastchar = '\0';
    AffixNode  *rs;
    AffixNodeData *data;
    int         lownew = low;

    /* first pass: count the distinct characters at this level */
    for (i = low; i < high; i++)
        if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
        {
            nchar++;
            lastchar = GETCHAR(Conf->Affix + i, level, type);
        }

    if (!nchar)
        return NULL;

    rs = (AffixNode *) malloc(ANHRDSZ + nchar * sizeof(AffixNodeData));
    MEMOUT(rs);
    memset(rs, 0, ANHRDSZ + nchar * sizeof(AffixNodeData));
    rs->length = nchar;
    data = rs->data;

    /* second pass: fill one AffixNodeData per distinct character */
    lastchar = '\0';
    for (i = low; i < high; i++)
        if (Conf->Affix[i].replen > level)
        {
            if (lastchar != GETCHAR(Conf->Affix + i, level, type))
            {
                if (lastchar)
                {
                    /* close the previous group and build its subtree */
                    data->node = mkANode(Conf, lownew, i, level + 1, type);
                    lownew = i;
                    data++;
                }
                lastchar = GETCHAR(Conf->Affix + i, level, type);
            }
            data->val = GETCHAR(Conf->Affix + i, level, type);
            if (Conf->Affix[i].replen == level + 1)
            {                   /* affix stopped */
                if (!data->naff)
                {
                    /* sized for every remaining affix in the range */
                    data->aff = (AFFIX **) malloc(sizeof(AFFIX *) * (high - i + 1));
                    MEMOUT(data->aff);
                }
                data->aff[data->naff] = Conf->Affix + i;
                data->naff++;
            }
        }

    /* subtree for the last group */
    data->node = mkANode(Conf, lownew, high, level + 1, type);

    return rs;
}
static void
2004-08-29 13:07:03 +08:00
mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
{
int i,
cnt = 0;
int start = (issuffix) ? startsuffix : 0;
int end = (issuffix) ? Conf->naffixes : startsuffix;
AffixNode *Affix = (AffixNode *) malloc(ANHRDSZ + sizeof(AffixNodeData));
MEMOUT(Affix);
2004-08-29 13:07:03 +08:00
memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData));
Affix->length = 1;
Affix->isvoid = 1;
2004-08-29 13:07:03 +08:00
if (issuffix)
{
Affix->data->node = Conf->Suffix;
Conf->Suffix = Affix;
}
else
{
Affix->data->node = Conf->Prefix;
Conf->Prefix = Affix;
}
2004-08-29 13:07:03 +08:00
for (i = start; i < end; i++)
if (Conf->Affix[i].replen == 0)
cnt++;
2004-08-29 13:07:03 +08:00
if (cnt == 0)
return;
2004-08-29 13:07:03 +08:00
Affix->data->aff = (AFFIX **) malloc(sizeof(AFFIX *) * cnt);
MEMOUT(Affix->data->aff);
2004-08-29 13:07:03 +08:00
Affix->data->naff = (uint32) cnt;
cnt = 0;
for (i = start; i < end; i++)
if (Conf->Affix[i].replen == 0)
{
Affix->data->aff[cnt] = Conf->Affix + i;
cnt++;
}
}
/*
 * Sort the loaded affixes and build the search structures from them:
 *   - Conf->CompoundAffix: array of compound-only suffixes, ended by an
 *     entry whose affix is NULL;
 *   - Conf->Prefix / Conf->Suffix: affix trees, each fronted by a void
 *     node (see mkVoidAffix).
 *
 * Raises an out-of-memory error via MEMOUT on allocation failure.
 */
void
NISortAffixes(IspellDict * Conf)
{
    AFFIX      *Affix;
    size_t      i;
    CMPDAffix  *ptr;
    CMPDAffix  *shrunk;
    int         firstsuffix = -1;

    if (Conf->naffixes > 1)
        qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);

    /* one slot per affix plus one for the terminating entry */
    Conf->CompoundAffix = ptr = (CMPDAffix *) malloc(sizeof(CMPDAffix) * (Conf->naffixes + 1));
    MEMOUT(Conf->CompoundAffix);
    ptr->affix = NULL;

    for (i = 0; i < Conf->naffixes; i++)
    {
        Affix = &(((AFFIX *) Conf->Affix)[i]);
        if (Affix->type == FF_SUFFIX)
        {
            if (firstsuffix < 0)
                firstsuffix = i;
            if (Affix->flagflags & FF_COMPOUNDONLYAFX)
            {
                /*
                 * Test "no entry stored yet" by position: slots after the
                 * first are uninitialized and must not be read.
                 */
                if (ptr == Conf->CompoundAffix ||
                    strbncmp((const unsigned char *) (ptr - 1)->affix,
                             (const unsigned char *) Affix->repl,
                             (ptr - 1)->len))
                {
                    /* leave only unique and minimals suffixes */
                    ptr->affix = Affix->repl;
                    ptr->len = Affix->replen;
                    ptr++;
                }
            }
        }
    }
    ptr->affix = NULL;

    /* give back unused slots; on failure the original block stays valid */
    shrunk = (CMPDAffix *) realloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
    if (shrunk)
        Conf->CompoundAffix = shrunk;

    /* no suffix at all: every affix is a prefix, the suffix range is empty */
    if (firstsuffix < 0)
        firstsuffix = Conf->naffixes;

    Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
    Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
    mkVoidAffix(Conf, 1, firstsuffix);
    mkVoidAffix(Conf, 0, firstsuffix);
}
/*
 * Walk an affix tree along the characters of word and return the next
 * node data that has affixes attached (naff != 0), or NULL if there is
 * none.
 *
 * *level is the number of word characters already consumed; it is advanced
 * as characters are matched so the caller can resume from the returned
 * node's subtree.  type selects the reading direction through GETWCHAR.
 * A void root node is returned as is when it carries affixes.
 */
static AffixNodeData *
FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
{
    AffixNodeData *StopLow,
               *StopHigh,
               *StopMiddle;
    uint8       symbol;

    if (node->isvoid)
    {                           /* search void affixes */
        if (node->data->naff)
            return node->data;
        node = node->data->node;
    }

    while (node && *level < wrdlen)
    {
        StopLow = node->data;
        StopHigh = node->data + node->length;
        while (StopLow < StopHigh)
        {
            StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
            symbol = GETWCHAR(word, wrdlen, *level, type);
            if (StopMiddle->val == symbol)
            {
                (*level)++;
                if (StopMiddle->naff)
                    return StopMiddle;
                node = StopMiddle->node;
                break;
            }
            else if (StopMiddle->val < symbol)
                StopLow = StopMiddle + 1;
            else
                StopHigh = StopMiddle;
        }
        /* binary search exhausted without a match at this level */
        if (StopLow >= StopHigh)
            break;
    }
    return NULL;
}
static char *
2004-08-29 13:07:03 +08:00
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword)
{
2003-08-04 08:43:34 +08:00
2004-08-29 13:07:03 +08:00
if (flagflags & FF_COMPOUNDONLYAFX)
{
if ((Affix->flagflags & FF_COMPOUNDONLYAFX) == 0)
return NULL;
2004-08-29 13:07:03 +08:00
}
else
{
if (Affix->flagflags & FF_COMPOUNDONLYAFX)
return NULL;
2004-08-29 13:07:03 +08:00
}
2004-08-29 13:07:03 +08:00
if (Affix->type == FF_SUFFIX)
{
strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find);
2004-08-29 13:07:03 +08:00
}
else
{
strcpy(newword, Affix->find);
strcat(newword, word + Affix->replen);
}
2003-08-04 08:43:34 +08:00
2004-08-29 13:07:03 +08:00
if (Affix->issimple)
return newword;
else if (Affix->isregis)
{
if (Affix->compile)
{
RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask);
Affix->compile = 0;
}
if (RS_execute(&(Affix->reg.regis), newword, -1))
return newword;
}
else
{
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
2004-08-29 13:07:03 +08:00
int dat_len;
if (Affix->compile)
2003-08-04 08:43:34 +08:00
{
2004-08-29 13:07:03 +08:00
int wmasklen,
masklen = strlen(Affix->mask);
pg_wchar *mask;
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
2004-08-29 13:07:03 +08:00
wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
pfree(mask);
if (err)
{
2004-08-29 13:07:03 +08:00
/*
* regerror(err, &(Affix->reg.regex), regerrstr,
* ERRSTRSIZE);
*/
pg_regfree(&(Affix->reg.regex));
return (NULL);
}
Affix->compile = 0;
2003-08-04 08:43:34 +08:00
}
/* Convert data string to wide characters */
dat_len = strlen(newword);
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len);
2004-08-29 13:07:03 +08:00
if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, NULL, 1, subs, 0)))
{
pfree(data);
return newword;
}
pfree(data);
2004-05-31 21:55:19 +08:00
}
2003-08-04 08:43:34 +08:00
return NULL;
2003-07-21 18:27:44 +08:00
}
/*
 * Collect the dictionary forms of word.
 *
 * word is lower-cased in place.  Candidates are: the word itself, the word
 * with one prefix undone, the word with one suffix undone, and the word
 * with a suffix and then a prefix undone.  Each candidate is kept when
 * FindWord() accepts it.
 *
 * Returns a palloc'd, NULL-terminated array of palloc'd strings (at most
 * MAX_NORM - 1 of them), or NULL when nothing was found or the word is
 * longer than MAXNORMLEN.
 */
static char **
NormalizeSubWord(IspellDict * Conf, char *word, char flag)
{
    AffixNodeData *suffix = NULL,
               *prefix = NULL;
    int         slevel = 0,
                plevel = 0;
    int         wrdlen = strlen(word),
                swrdlen;
    char      **forms;
    char      **cur;
    char        newword[2 * MAXNORMLEN] = "";
    char        pnewword[2 * MAXNORMLEN] = "";
    AffixNode  *snode = Conf->Suffix,
               *pnode;
    int         i,
                j;

    if (wrdlen > MAXNORMLEN)
        return NULL;
    strlower(word);
    cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
    *cur = NULL;

    /* Check that the word itself is normal form */
    if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD))
    {
        *cur = pstrdup(word);
        cur++;
        *cur = NULL;
    }

    /* Find all other NORMAL forms of the 'word' (check only prefix) */
    pnode = Conf->Prefix;
    plevel = 0;
    while (pnode)
    {
        prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
        if (!prefix)
            break;
        for (j = 0; j < prefix->naff; j++)
        {
            if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword))
            {
                /* prefix success */
                if (FindWord(Conf, newword, prefix->aff[j]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
                {
                    /* word search success */
                    *cur = pstrdup(newword);
                    cur++;
                    *cur = NULL;
                }
            }
        }
        pnode = prefix->node;
    }

    /*
     * Find all other NORMAL forms of the 'word' (check suffix and then
     * prefix)
     */
    while (snode)
    {
        /* find possible suffix */
        suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
        if (!suffix)
            break;
        /* foreach suffix check affix */
        for (i = 0; i < suffix->naff; i++)
        {
            if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword))
            {
                /* suffix success */
                if (FindWord(Conf, newword, suffix->aff[i]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
                {
                    /* word search success */
                    *cur = pstrdup(newword);
                    cur++;
                    *cur = NULL;
                }
                /* now we will look changed word with prefixes */
                pnode = Conf->Prefix;
                plevel = 0;
                swrdlen = strlen(newword);
                while (pnode)
                {
                    prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                    if (!prefix)
                        break;
                    for (j = 0; j < prefix->naff; j++)
                    {
                        if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword))
                        {
                            /*
                             * prefix success: when both affixes carry
                             * FF_CROSSPRODUCT the word is looked up with
                             * affix flag 0, otherwise with the prefix flag
                             */
                            int         ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
                            0 : prefix->aff[j]->flag;

                            if (FindWord(Conf, pnewword, ff, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1))
                            {
                                /* word search success */
                                *cur = pstrdup(pnewword);
                                cur++;
                                *cur = NULL;
                            }
                        }
                    }
                    pnode = prefix->node;
                }
            }
        }

        snode = suffix->node;
    }

    if (cur == forms)
    {
        pfree(forms);
        return (NULL);
    }
    return (forms);
}
/*
 * One way of splitting a compound word into stems; the variants form a
 * singly linked list.
 */
struct SplitVar
{
    int         nstem;          /* number of entries used in stem[] */
    char      **stem;           /* the stems of this variant */
    struct SplitVar *next;      /* next variant, or NULL */
};

typedef struct SplitVar SplitVar;
/*
 * Scan the compound affix list from *ptr for an entry that word starts
 * with and that is shorter than len.
 *
 * Returns the length of the matching affix, or 0 when the list is
 * exhausted.  *ptr is left just past the last examined entry, so calling
 * again continues the scan.
 */
static int
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len)
{
    while ((*ptr)->affix)
    {
        if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
        {
            len = (*ptr)->len;
            (*ptr)++;
            return len;
        }
        (*ptr)++;
    }
    return 0;
}
/*
 * Allocate a new SplitVar with room for MAX_NORM stems.
 *
 * When s is not NULL its stems are carried over: duplicated with pstrdup()
 * if makedup is non-zero, otherwise shared by pointer.  With s == NULL the
 * new variant is empty.  The next link is always NULL.
 */
static SplitVar *
CopyVar(SplitVar * s, int makedup)
{
    SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));

    v->stem = (char **) palloc(sizeof(char *) * (MAX_NORM));
    v->next = NULL;
    v->nstem = 0;

    if (s)
    {
        int         i;

        for (i = 0; i < s->nstem; i++)
            v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
        v->nstem = s->nstem;
    }
    return v;
}
/*
 * Build the list of ways to split word (length wordlen) into compound
 * parts, starting at byte startpos.
 *
 * orig holds the stems found so far (may be NULL); it is duplicated, so the
 * caller keeps ownership of it.  snode/minpos are used by the recursive
 * call that looks for a longer first part: the walk then resumes in snode
 * at level minpos instead of restarting from the dictionary root.
 *
 * Returns a linked list of SplitVar; the remainder of the word that could
 * not be split further is always appended as the last stem.
 *
 * NOTE(review): stem[] arrays hold MAX_NORM entries and appends here are
 * not bounds-checked.
 */
static SplitVar *
SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos)
{
    SplitVar   *var = NULL;
    SPNodeData *StopLow,
               *StopHigh,
               *StopMiddle = NULL;
    SPNode     *node = (snode) ? snode : Conf->Dictionary;
    int         level = (snode) ? minpos : startpos;    /* recursive
                                                         * minpos==level */
    int         lenaff;
    CMPDAffix  *caff;
    char       *notprobed;

    /* notprobed[i] != 0: no sub-word ending at byte i has been tried yet */
    notprobed = (char *) palloc(wordlen);
    memset(notprobed, 1, wordlen);
    var = CopyVar(orig, 1);

    while (node && level < wordlen)
    {
        StopLow = node->data;
        StopHigh = node->data + node->length;
        while (StopLow < StopHigh)
        {
            StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
            if (StopMiddle->val == ((uint8 *) (word))[level])
                break;
            else if (StopMiddle->val < ((uint8 *) (word))[level])
                StopLow = StopMiddle + 1;
            else
                StopHigh = StopMiddle;
        }
        if (StopLow >= StopHigh)
            break;

        /* find word with epenthetic */
        caff = Conf->CompoundAffix;
        while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level)) > 0)
        {
            /*
             * there is one of compound suffixes, so check word for
             * existings
             */
            char        buf[MAXNORMLEN];
            char      **subres;

            /* length of the candidate sub-word including the affix */
            lenaff = level - startpos + lenaff;

            /* candidate plus its terminator would not fit into buf */
            if (lenaff >= MAXNORMLEN)
                continue;

            if (!notprobed[startpos + lenaff - 1])
                continue;

            if (level + lenaff - 1 <= minpos)
                continue;

            memcpy(buf, word + startpos, lenaff);
            buf[lenaff] = '\0';

            subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX);
            if (subres)
            {
                /* Yes, it was a word from dictionary */
                SplitVar   *new = CopyVar(var, 0);
                SplitVar   *ptr = var;
                char      **sptr = subres;

                notprobed[startpos + lenaff - 1] = 0;

                while (*sptr)
                {
                    new->stem[new->nstem] = *sptr;
                    new->nstem++;
                    sptr++;
                }
                pfree(subres);

                /* append the variants of the rest of the word to the list */
                while (ptr->next)
                    ptr = ptr->next;
                ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);

                pfree(new->stem);
                pfree(new);
            }
        }

        /* find infinitive */
        if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
        {
            /* ok, we found full compoundallowed word */
            if (level > minpos)
            {
                /* and its length more than minimal */
                if (wordlen == level + 1)
                {
                    /* well, it was last word */
                    var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
                    var->nstem++;
                    pfree(notprobed);
                    return var;
                }
                else
                {
                    /* then we will search more big word at the same point */
                    SplitVar   *ptr = var;

                    while (ptr->next)
                        ptr = ptr->next;
                    ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                    /* we can find next word */
                    level++;
                    var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
                    var->nstem++;
                    node = Conf->Dictionary;
                    startpos = level;
                    continue;
                }
            }
        }
        level++;
        node = StopMiddle->node;
    }

    /* whatever is left becomes the last stem */
    var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
    var->nstem++;
    pfree(notprobed);
    return var;
}
/*
 * Return the normal forms of word as a palloc'd, NULL-terminated array of
 * strings, or NULL when there are none.
 *
 * The word is first normalized as a whole.  When a compound control
 * character is configured (anything but '\t'), the word is also split into
 * compound variants; for each variant with more than one stem whose last
 * stem normalizes, the leading stems and the normal forms of the last stem
 * are appended to the result.  A variant that would not fit into the
 * MAX_NORM-entry result array is skipped.
 */
char **
NINormalizeWord(IspellDict * Conf, char *word)
{
    char      **res = NormalizeSubWord(Conf, word, 0);

    if (Conf->compoundcontrol != '\t')
    {
        int         wordlen = strlen(word);
        SplitVar   *next,
                   *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
        char      **cur = res;
        int         i;

        while (var)
        {
            if (var->nstem > 1)
            {
                char      **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDWORD);

                if (subres)
                {
                    char      **sptr;
                    int         nsub = 0;
                    int         used = 0;

                    while (subres[nsub])
                        nsub++;

                    /* move to the end of what has been collected so far */
                    if (cur)
                    {
                        while (*cur)
                            cur++;
                        used = cur - res;
                    }

                    /* keep one slot free for the NULL terminator */
                    if (used + (var->nstem - 1) + nsub < MAX_NORM)
                    {
                        if (!cur)
                            res = cur = (char **) palloc(MAX_NORM * sizeof(char *));

                        for (i = 0; i < var->nstem - 1; i++)
                        {
                            *cur = var->stem[i];
                            cur++;
                        }
                        for (sptr = subres; *sptr; sptr++)
                        {
                            *cur = *sptr;
                            cur++;
                        }
                        *cur = NULL;

                        /*
                         * the stems now belong to the result: stop the
                         * cleanup loop below from freeing them
                         */
                        var->stem[0] = NULL;
                    }
                    else
                    {
                        /* no room: drop this variant's forms */
                        for (sptr = subres; *sptr; sptr++)
                            pfree(*sptr);
                    }
                    pfree(subres);
                }
            }

            for (i = 0; i < var->nstem && var->stem[i]; i++)
                pfree(var->stem[i]);
            next = var->next;
            pfree(var->stem);
            pfree(var);
            var = next;
        }
    }
    return res;
}
/*
 * Recursively free a dictionary tree node and everything below it.
 * A NULL node is ignored.
 */
static void
freeSPNode(SPNode * node)
{
    SPNodeData *data;

    if (!node)
        return;

    /* length is counted down while data walks the child array */
    for (data = node->data; node->length; data++, node->length--)
        freeSPNode(data->node);

    free(node);
}
/*
 * Recursively free an affix tree node, its children and the aff[] arrays
 * attached to its entries.  The AFFIX structures the arrays point to are
 * not freed here.  A NULL node is ignored.
 */
static void
freeANode(AffixNode * node)
{
    AffixNodeData *data;

    if (!node)
        return;

    /* length is counted down while data walks the child array */
    for (data = node->data; node->length; data++, node->length--)
    {
        freeANode(data->node);
        if (data->naff)
            free(data->aff);
    }

    free(node);
}
/*
 * Release everything owned by Conf (affix data strings, compiled masks,
 * word list, affix array, compound affix list and the three trees) and
 * reset the structure to all zeroes.
 */
void
NIFree(IspellDict * Conf)
{
    int         i;
    AFFIX      *Affix = (AFFIX *) Conf->Affix;
    char      **aff = Conf->AffixData;

    if (aff)
    {
        /* NULL-terminated array of malloc'd strings */
        for (; *aff; aff++)
            free(*aff);
        free(Conf->AffixData);
    }

    for (i = 0; i < Conf->naffixes; i++)
    {
        /* compile == 0 means the mask was compiled and must be released */
        if (Affix[i].compile == 0)
        {
            if (Affix[i].isregis)
                RS_free(&(Affix[i].reg.regis));
            else
                pg_regfree(&(Affix[i].reg.regex));
        }
    }

    if (Conf->Spell)
    {
        for (i = 0; i < Conf->nspell; i++)
            free(Conf->Spell[i].word);
        free(Conf->Spell);
    }

    /* free(NULL) is a no-op, so no guards are needed here */
    free(Conf->Affix);
    free(Conf->CompoundAffix);

    freeSPNode(Conf->Dictionary);
    freeANode(Conf->Suffix);
    freeANode(Conf->Prefix);

    memset((void *) Conf, 0, sizeof(IspellDict));
    return;
}