mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-12 18:34:36 +08:00
1010 lines
22 KiB
C
1010 lines
22 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "spell.h"
|
|
|
|
/* Number of slots allocated for a NULL-terminated array of normal forms. */
#define MAX_NORM 1024
/* Words longer than this many bytes are rejected by NormalizeSubWord. */
#define MAXNORMLEN 256

/* Case-insensitive "does x start with y": evaluates to 0 when it does. */
#define STRNCASECMP(x,y) (strncasecmp((x), (y), strlen(y)))
/*
 * N-th byte of W (whose length is L), read as an unsigned value.
 * For T == 'p' the index counts from the front, otherwise from the back.
 */
#define GETWCHAR(W,L,N,T) ( ((u_int8_t*)(W))[ ((T)=='p') ? (N) : ( (L) - 1 - (N) ) ] )
/* Same as GETWCHAR, applied to the replacement string of affix A. */
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, (N), T )

/*
 * Report an out-of-memory error when X is false/NULL.
 * Wrapped in do/while(0) so that the macro behaves as a single statement
 * and cannot capture an "else" that follows it.
 */
#define MEMOUT(X) do { if ( !(X) ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } while (0)
|
|
|
|
static int
|
|
cmpspell(const void *s1, const void *s2)
|
|
{
|
|
return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word));
|
|
}
|
|
static int
|
|
cmpspellaffix(const void *s1, const void *s2)
|
|
{
|
|
return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
|
|
}
|
|
|
|
/*
 * Lower-case a NUL-terminated string in place, one byte at a time, using
 * tolower() on each byte taken as an unsigned char.
 */
static void
strlower(char *str)
{
    unsigned char *p;

    for (p = (unsigned char *) str; *p != '\0'; p++)
        *p = tolower(*p);
}
|
|
|
|
/*
 * Return a palloc'd, NUL-terminated copy of exactly the first len bytes
 * of s.  The bytes are copied with memcpy(), so s must provide at least
 * len readable bytes.
 *
 * NOTE(review): this static function shares its name with the POSIX
 * strndup(); it may clash where <string.h> declares that function - confirm.
 */
static char*
strndup(char *s, int len) {
    char *copy;

    copy = (char *) palloc(len + 1);
    memcpy(copy, s, len);
    copy[len] = '\0';

    return copy;
}
|
|
/*
 * Backward string comparison for suffix tree operations.
 *
 * Compares s1 and s2 starting from their last bytes and moving towards the
 * front.  Returns -1, 0 or 1.  When one string is a tail of the other, the
 * shorter one sorts first.
 *
 * Bytes are compared as unsigned char.  This matches the ordering strcmp()
 * gives the prefix affixes in cmpaffix() and the unsigned byte values that
 * the affix tree stores and binary-searches (GETWCHAR/GETCHAR), so bytes
 * >= 0x80 sort the same way everywhere regardless of whether plain char
 * is signed.
 */
static int
strbcmp(const char *s1, const char *s2)
{
    const unsigned char *u1 = (const unsigned char *) s1;
    const unsigned char *u2 = (const unsigned char *) s2;
    int         l1 = (int) strlen(s1) - 1;
    int         l2 = (int) strlen(s2) - 1;

    while (l1 >= 0 && l2 >= 0)
    {
        if (u1[l1] != u2[l2])
            return (u1[l1] < u2[l2]) ? -1 : 1;
        l1--;
        l2--;
    }

    /* common tail exhausted: the string with bytes left over is greater */
    if (l1 < l2)
        return -1;
    if (l1 > l2)
        return 1;

    return 0;
}
|
|
/*
 * Backward comparison of at most count trailing bytes of s1 and s2.
 *
 * Works like strbcmp() but stops after count bytes have been compared;
 * returns 0 if all of them were equal.  If either string runs out before
 * count bytes were compared, the string with bytes left over is greater.
 *
 * Bytes are compared as unsigned char so that the result agrees with
 * strcmp() ordering and with the unsigned byte values used by the affix
 * tree, independent of the signedness of plain char.
 */
static int
strbncmp(const char *s1, const char *s2, size_t count)
{
    const unsigned char *u1 = (const unsigned char *) s1;
    const unsigned char *u2 = (const unsigned char *) s2;
    int         l1 = (int) strlen(s1) - 1;
    int         l2 = (int) strlen(s2) - 1;
    int         l = count;

    while (l1 >= 0 && l2 >= 0 && l > 0)
    {
        if (u1[l1] != u2[l2])
            return (u1[l1] < u2[l2]) ? -1 : 1;
        l1--;
        l2--;
        l--;
    }

    /* all requested bytes matched */
    if (l == 0)
        return 0;
    if (l1 < l2)
        return -1;
    if (l1 > l2)
        return 1;
    return 0;
}
|
|
|
|
static int
|
|
cmpaffix(const void *s1, const void *s2)
|
|
{
|
|
if (((const AFFIX *) s1)->type < ((const AFFIX *) s2)->type)
|
|
return -1;
|
|
if (((const AFFIX *) s1)->type > ((const AFFIX *) s2)->type)
|
|
return 1;
|
|
if (((const AFFIX *) s1)->type == 'p')
|
|
return (strcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
|
|
else
|
|
return (strbcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
|
|
}
|
|
|
|
int
|
|
NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
|
|
{
|
|
if (Conf->nspell >= Conf->mspell)
|
|
{
|
|
if (Conf->mspell)
|
|
{
|
|
Conf->mspell += 1024 * 20;
|
|
Conf->Spell = (SPELL *) realloc(Conf->Spell, Conf->mspell * sizeof(SPELL));
|
|
}
|
|
else
|
|
{
|
|
Conf->mspell = 1024 * 20;
|
|
Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL));
|
|
}
|
|
MEMOUT(Conf->Spell);
|
|
}
|
|
Conf->Spell[Conf->nspell].word = strdup(word);
|
|
MEMOUT(Conf->Spell[Conf->nspell].word);
|
|
strncpy(Conf->Spell[Conf->nspell].p.flag, flag, 16);
|
|
Conf->nspell++;
|
|
return (0);
|
|
}
|
|
|
|
|
|
int
|
|
NIImportDictionary(IspellDict * Conf, const char *filename)
|
|
{
|
|
unsigned char str[BUFSIZ];
|
|
FILE *dict;
|
|
|
|
if (!(dict = fopen(filename, "r")))
|
|
return (1);
|
|
while (fgets(str, sizeof(str), dict))
|
|
{
|
|
unsigned char *s;
|
|
const unsigned char *flag;
|
|
|
|
flag = NULL;
|
|
if ((s = strchr(str, '/')))
|
|
{
|
|
*s = 0;
|
|
s++;
|
|
flag = s;
|
|
while (*s)
|
|
{
|
|
if (isprint(*s) && !isspace(*s))
|
|
s++;
|
|
else
|
|
{
|
|
*s = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
flag = "";
|
|
strlower(str);
|
|
/* Dont load words if first letter is not required */
|
|
/* It allows to optimize loading at search time */
|
|
s = str;
|
|
while (*s)
|
|
{
|
|
if (*s == '\r')
|
|
*s = 0;
|
|
if (*s == '\n')
|
|
*s = 0;
|
|
s++;
|
|
}
|
|
NIAddSpell(Conf, str, flag);
|
|
}
|
|
fclose(dict);
|
|
return (0);
|
|
}
|
|
|
|
|
|
static int
|
|
FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
|
|
{
|
|
SPNode *node = Conf->Dictionary;
|
|
SPNodeData *StopLow, *StopHigh, *StopMiddle;
|
|
int level=0, wrdlen=strlen(word);
|
|
|
|
while( node && level<wrdlen) {
|
|
StopLow = node->data;
|
|
StopHigh = node->data+node->length;
|
|
while (StopLow < StopHigh) {
|
|
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
|
|
if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
|
|
if ( wrdlen==level+1 && StopMiddle->isword ) {
|
|
if ( compoundonly && !StopMiddle->compoundallow )
|
|
return 0;
|
|
if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
|
|
return 1;
|
|
}
|
|
node=StopMiddle->node;
|
|
level++;
|
|
break;
|
|
} else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
|
|
StopLow = StopMiddle + 1;
|
|
} else {
|
|
StopHigh = StopMiddle;
|
|
}
|
|
}
|
|
if ( StopLow >= StopHigh )
|
|
break;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
|
|
{
|
|
if (Conf->naffixes >= Conf->maffixes)
|
|
{
|
|
if (Conf->maffixes)
|
|
{
|
|
Conf->maffixes += 16;
|
|
Conf->Affix = (AFFIX *) realloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
|
|
}
|
|
else
|
|
{
|
|
Conf->maffixes = 16;
|
|
Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX));
|
|
}
|
|
MEMOUT(Conf->Affix);
|
|
}
|
|
if (type == 's')
|
|
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
|
|
else
|
|
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
|
|
Conf->Affix[Conf->naffixes].compile = 1;
|
|
Conf->Affix[Conf->naffixes].flagflags = flagflags;
|
|
Conf->Affix[Conf->naffixes].flag = flag;
|
|
Conf->Affix[Conf->naffixes].type = type;
|
|
|
|
strcpy(Conf->Affix[Conf->naffixes].find, find);
|
|
strcpy(Conf->Affix[Conf->naffixes].repl, repl);
|
|
Conf->Affix[Conf->naffixes].replen = strlen(repl);
|
|
Conf->naffixes++;
|
|
return (0);
|
|
}
|
|
|
|
/*
 * Copy src into dist, dropping every space, '-' and tab character.
 * dist is NUL-terminated and returned.
 */
static char *
remove_spaces(char *dist, char *src)
{
    char       *out = dist;
    const char *in;

    for (in = src; *in != '\0'; in++)
    {
        switch (*in)
        {
            case ' ':
            case '-':
            case '\t':
                /* skipped */
                break;
            default:
                *out++ = *in;
                break;
        }
    }
    *out = '\0';

    return (dist);
}
|
|
|
|
|
|
int
|
|
NIImportAffixes(IspellDict * Conf, const char *filename)
|
|
{
|
|
unsigned char str[BUFSIZ];
|
|
unsigned char flag = 0;
|
|
unsigned char mask[BUFSIZ] = "";
|
|
unsigned char find[BUFSIZ] = "";
|
|
unsigned char repl[BUFSIZ] = "";
|
|
unsigned char *s;
|
|
int i;
|
|
int suffixes = 0;
|
|
int prefixes = 0;
|
|
unsigned char flagflags = 0;
|
|
FILE *affix;
|
|
|
|
if (!(affix = fopen(filename, "r")))
|
|
return (1);
|
|
Conf->compoundcontrol='\t';
|
|
|
|
while (fgets(str, sizeof(str), affix))
|
|
{
|
|
if (STRNCASECMP(str, "compoundwords")==0) {
|
|
s=strchr(str, 'l');
|
|
if ( s ) {
|
|
while( *s!=' ' ) s++;
|
|
while( *s==' ' ) s++;
|
|
Conf->compoundcontrol = *s;
|
|
continue;
|
|
}
|
|
}
|
|
if (!STRNCASECMP(str, "suffixes"))
|
|
{
|
|
suffixes = 1;
|
|
prefixes = 0;
|
|
continue;
|
|
}
|
|
if (!STRNCASECMP(str, "prefixes"))
|
|
{
|
|
suffixes = 0;
|
|
prefixes = 1;
|
|
continue;
|
|
}
|
|
if (!STRNCASECMP(str, "flag "))
|
|
{
|
|
s = str + 5;
|
|
flagflags=0;
|
|
while( *s==' ' ) s++;
|
|
if ( *s=='*' ) {
|
|
flagflags|=FF_CROSSPRODUCT;
|
|
s++;
|
|
} else if ( *s=='~' ) {
|
|
flagflags|=FF_COMPOUNDONLYAFX;
|
|
s++;
|
|
}
|
|
|
|
if ( *s=='\\' ) s++;
|
|
|
|
flag = *s;
|
|
continue;
|
|
}
|
|
if ((!suffixes) && (!prefixes))
|
|
continue;
|
|
if ((s = strchr(str, '#')))
|
|
*s = 0;
|
|
if (!*str)
|
|
continue;
|
|
strlower(str);
|
|
strcpy(mask, "");
|
|
strcpy(find, "");
|
|
strcpy(repl, "");
|
|
i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
|
|
remove_spaces(str, repl);
|
|
strcpy(repl, str);
|
|
remove_spaces(str, find);
|
|
strcpy(find, str);
|
|
remove_spaces(str, mask);
|
|
strcpy(mask, str);
|
|
switch (i)
|
|
{
|
|
case 3:
|
|
break;
|
|
case 2:
|
|
if (*find != '\0')
|
|
{
|
|
strcpy(repl, find);
|
|
strcpy(find, "");
|
|
}
|
|
break;
|
|
default:
|
|
continue;
|
|
}
|
|
|
|
NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
|
|
|
|
}
|
|
fclose(affix);
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
MergeAffix(IspellDict *Conf, int a1, int a2) {
|
|
int naffix=0;
|
|
char **ptr=Conf->AffixData;
|
|
|
|
while(*ptr) {
|
|
naffix++;
|
|
ptr++;
|
|
}
|
|
|
|
Conf->AffixData=(char**)realloc( Conf->AffixData, (naffix+2)*sizeof(char*) );
|
|
MEMOUT(Conf->AffixData);
|
|
ptr = Conf->AffixData + naffix;
|
|
*ptr=malloc( strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) + 1 /* space */ + 1 /* \0 */ );
|
|
MEMOUT(ptr);
|
|
sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
|
|
ptr++;
|
|
*ptr='\0';
|
|
return naffix;
|
|
}
|
|
|
|
|
|
static SPNode*
|
|
mkSPNode(IspellDict *Conf, int low, int high, int level) {
|
|
int i;
|
|
int nchar=0;
|
|
char lastchar='\0';
|
|
SPNode *rs;
|
|
SPNodeData *data;
|
|
int lownew=low;
|
|
|
|
for(i=low; i<high; i++)
|
|
if ( Conf->Spell[i].p.d.len>level && lastchar!=Conf->Spell[i].word[level] ) {
|
|
nchar++;
|
|
lastchar=Conf->Spell[i].word[level];
|
|
}
|
|
|
|
if (!nchar)
|
|
return NULL;
|
|
|
|
rs=(SPNode*)malloc(SPNHRDSZ+nchar*sizeof(SPNodeData));
|
|
MEMOUT(rs);
|
|
memset(rs,0,SPNHRDSZ+nchar*sizeof(SPNodeData));
|
|
rs->length = nchar;
|
|
data=rs->data;
|
|
|
|
lastchar='\0';
|
|
for(i=low; i<high; i++)
|
|
if ( Conf->Spell[i].p.d.len>level ) {
|
|
if ( lastchar!=Conf->Spell[i].word[level] ) {
|
|
if ( lastchar ) {
|
|
data->node = mkSPNode(Conf, lownew, i, level+1);
|
|
lownew=i;
|
|
data++;
|
|
}
|
|
lastchar=Conf->Spell[i].word[level];
|
|
}
|
|
data->val=((u_int8_t*)(Conf->Spell[i].word))[level];
|
|
if ( Conf->Spell[i].p.d.len == level+1 ) {
|
|
if ( data->isword && data->affix!=Conf->Spell[i].p.d.affix) {
|
|
/*
|
|
fprintf(stderr,"Word already exists: %s (affixes: '%s' and '%s')\n",
|
|
Conf->Spell[i].word,
|
|
Conf->AffixData[data->affix],
|
|
Conf->AffixData[Conf->Spell[i].p.d.affix]
|
|
);
|
|
*/
|
|
/* MergeAffix called a few times */
|
|
data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix);
|
|
} else
|
|
data->affix = Conf->Spell[i].p.d.affix;
|
|
data->isword=1;
|
|
if ( strchr( Conf->AffixData[ data->affix ], Conf->compoundcontrol ) )
|
|
data->compoundallow=1;
|
|
}
|
|
}
|
|
|
|
data->node = mkSPNode(Conf, lownew, high, level+1);
|
|
|
|
return rs;
|
|
}
|
|
|
|
|
|
|
|
void
|
|
NISortDictionary(IspellDict * Conf)
|
|
{
|
|
size_t i;
|
|
int naffix=3;
|
|
|
|
/* compress affixes */
|
|
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix);
|
|
for (i = 1; i < Conf->nspell; i++)
|
|
if ( strcmp(Conf->Spell[i].p.flag,Conf->Spell[i-1].p.flag) )
|
|
naffix++;
|
|
|
|
Conf->AffixData=(char**)malloc( naffix*sizeof(char*) );
|
|
MEMOUT(Conf->AffixData);
|
|
memset(Conf->AffixData, 0, naffix*sizeof(char*));
|
|
naffix=1;
|
|
Conf->AffixData[0]=strdup("");
|
|
MEMOUT(Conf->AffixData[0]);
|
|
Conf->AffixData[1]=strdup( Conf->Spell[0].p.flag );
|
|
MEMOUT(Conf->AffixData[1]);
|
|
Conf->Spell[0].p.d.affix = 1;
|
|
Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word);
|
|
for (i = 1; i < Conf->nspell; i++) {
|
|
if ( strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix]) ) {
|
|
naffix++;
|
|
Conf->AffixData[naffix] = strdup( Conf->Spell[i].p.flag );
|
|
MEMOUT(Conf->AffixData[naffix]);
|
|
}
|
|
Conf->Spell[i].p.d.affix = naffix;
|
|
Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word);
|
|
}
|
|
|
|
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);
|
|
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
|
|
|
|
for (i = 0; i < Conf->nspell; i++)
|
|
free( Conf->Spell[i].word );
|
|
free( Conf->Spell );
|
|
Conf->Spell=NULL;
|
|
}
|
|
|
|
static AffixNode*
|
|
mkANode(IspellDict *Conf, int low, int high, int level, int type) {
|
|
int i;
|
|
int nchar=0;
|
|
u_int8_t lastchar='\0';
|
|
AffixNode *rs;
|
|
AffixNodeData *data;
|
|
int lownew=low;
|
|
|
|
for(i=low; i<high; i++)
|
|
if ( Conf->Affix[i].replen>level && lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
|
|
nchar++;
|
|
lastchar=GETCHAR( Conf->Affix + i, level, type );
|
|
}
|
|
|
|
if (!nchar)
|
|
return NULL;
|
|
|
|
rs=(AffixNode*)malloc(ANHRDSZ+nchar*sizeof(AffixNodeData));
|
|
MEMOUT(rs);
|
|
memset(rs,0,ANHRDSZ+nchar*sizeof(AffixNodeData));
|
|
rs->length = nchar;
|
|
data=rs->data;
|
|
|
|
lastchar='\0';
|
|
for(i=low; i<high; i++)
|
|
if ( Conf->Affix[i].replen>level ) {
|
|
if ( lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
|
|
if ( lastchar ) {
|
|
data->node = mkANode(Conf, lownew, i, level+1, type);
|
|
lownew=i;
|
|
data++;
|
|
}
|
|
lastchar=GETCHAR( Conf->Affix + i, level, type );
|
|
}
|
|
data->val=GETCHAR( Conf->Affix + i, level, type );
|
|
if ( Conf->Affix[i].replen == level+1 ) { /* affix stopped */
|
|
if ( !data->naff )
|
|
data->aff=(AFFIX**)malloc(sizeof(AFFIX*)*(high-i+1));
|
|
MEMOUT(data);
|
|
data->aff[ data->naff ] = Conf->Affix + i;
|
|
data->naff++;
|
|
}
|
|
}
|
|
|
|
data->node = mkANode(Conf, lownew, high, level+1, type);
|
|
|
|
return rs;
|
|
}
|
|
|
|
void
|
|
NISortAffixes(IspellDict * Conf)
|
|
{
|
|
AFFIX *Affix;
|
|
size_t i;
|
|
CMPDAffix* ptr;
|
|
int firstsuffix=-1;
|
|
|
|
if (Conf->naffixes > 1)
|
|
qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
|
|
|
|
Conf->CompoundAffix = ptr = (CMPDAffix*)malloc( sizeof(CMPDAffix) * Conf->naffixes );
|
|
MEMOUT(Conf->CompoundAffix);
|
|
ptr->affix=NULL;
|
|
|
|
for (i = 0; i < Conf->naffixes; i++) {
|
|
Affix = &(((AFFIX *) Conf->Affix)[i]);
|
|
if ( Affix->type == 's' ) {
|
|
if ( firstsuffix<0 ) firstsuffix=i;
|
|
if ( Affix->flagflags & FF_COMPOUNDONLYAFX ) {
|
|
if ( !ptr->affix || strbncmp((ptr-1)->affix, Affix->repl, (ptr-1)->len) ) {
|
|
/* leave only unique and minimals suffixes */
|
|
ptr->affix=Affix->repl;
|
|
ptr->len=Affix->replen;
|
|
ptr++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
ptr->affix = NULL;
|
|
Conf->CompoundAffix = (CMPDAffix*)realloc( Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr-Conf->CompoundAffix+1) );
|
|
|
|
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
|
|
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
|
|
}
|
|
|
|
static AffixNodeData*
|
|
FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) {
|
|
AffixNodeData *StopLow, *StopHigh, *StopMiddle;
|
|
u_int8_t symbol;
|
|
|
|
while( node && *level<wrdlen) {
|
|
StopLow = node->data;
|
|
StopHigh = node->data+node->length;
|
|
while (StopLow < StopHigh) {
|
|
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
|
|
symbol = GETWCHAR(word,wrdlen,*level,type);
|
|
if ( StopMiddle->val == symbol ) {
|
|
if ( StopMiddle->naff )
|
|
return StopMiddle;
|
|
node=StopMiddle->node;
|
|
(*level)++;
|
|
break;
|
|
} else if ( StopMiddle->val < symbol ) {
|
|
StopLow = StopMiddle + 1;
|
|
} else {
|
|
StopHigh = StopMiddle;
|
|
}
|
|
}
|
|
if ( StopLow >= StopHigh )
|
|
break;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static char *
|
|
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
|
|
regmatch_t subs[2]; /* workaround for apache&linux */
|
|
int err;
|
|
|
|
if ( flagflags & FF_COMPOUNDONLYAFX ) {
|
|
if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
|
|
return NULL;
|
|
} else {
|
|
if ( Affix->flagflags & FF_COMPOUNDONLYAFX )
|
|
return NULL;
|
|
}
|
|
|
|
if ( Affix->type=='s' ) {
|
|
strcpy(newword, word);
|
|
strcpy(newword + len - Affix->replen, Affix->find);
|
|
} else {
|
|
strcpy(newword, Affix->find);
|
|
strcat(newword, word + Affix->replen);
|
|
}
|
|
|
|
if (Affix->compile)
|
|
{
|
|
err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB);
|
|
if (err)
|
|
{
|
|
/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
|
|
regfree(&(Affix->reg));
|
|
return (NULL);
|
|
}
|
|
Affix->compile = 0;
|
|
}
|
|
if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0)))
|
|
return newword;
|
|
return NULL;
|
|
}
|
|
|
|
|
|
static char **
|
|
NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
|
|
AffixNodeData *suffix=NULL, *prefix=NULL;
|
|
int slevel=0, plevel=0;
|
|
int wrdlen = strlen(word), swrdlen;
|
|
char **forms;
|
|
char **cur;
|
|
char newword[2 * MAXNORMLEN] = "";
|
|
char pnewword[2 * MAXNORMLEN] = "";
|
|
AffixNode *snode = Conf->Suffix, *pnode;
|
|
int i,j;
|
|
|
|
if (wrdlen > MAXNORMLEN) return NULL;
|
|
strlower(word);
|
|
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
|
|
*cur = NULL;
|
|
|
|
|
|
/* Check that the word itself is normal form */
|
|
if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD)) {
|
|
*cur = pstrdup(word);
|
|
cur++;
|
|
*cur = NULL;
|
|
}
|
|
|
|
/* Find all other NORMAL forms of the 'word' (check only prefix)*/
|
|
pnode=Conf->Prefix;
|
|
plevel=0;
|
|
while(pnode) {
|
|
prefix=FinfAffixes(pnode, word, wrdlen, &plevel,'p');
|
|
if (!prefix) break;
|
|
for(j=0;j<prefix->naff;j++) {
|
|
if ( CheckAffix(word,wrdlen,prefix->aff[j], flag, newword) ) {
|
|
/* prefix success */
|
|
if ( FindWord(Conf, newword, prefix->aff[j]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
|
|
/* word search success */
|
|
*cur = pstrdup(newword);
|
|
cur++;
|
|
*cur=NULL;
|
|
}
|
|
}
|
|
}
|
|
pnode = prefix->node;
|
|
plevel++;
|
|
}
|
|
|
|
/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
|
|
while( snode ) {
|
|
/* find possible suffix */
|
|
suffix = FinfAffixes(snode, word, wrdlen, &slevel, 's');
|
|
if (!suffix) break;
|
|
/* foreach suffix check affix */
|
|
for(i=0;i<suffix->naff;i++) {
|
|
if ( CheckAffix(word, wrdlen, suffix->aff[i], flag, newword) ) {
|
|
/* suffix success */
|
|
if ( FindWord(Conf, newword, suffix->aff[i]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
|
|
/* word search success */
|
|
*cur = pstrdup(newword);
|
|
cur++;
|
|
*cur=NULL;
|
|
}
|
|
/* now we will look changed word with prefixes */
|
|
pnode=Conf->Prefix;
|
|
plevel=0;
|
|
swrdlen=strlen(newword);
|
|
while(pnode) {
|
|
prefix=FinfAffixes(pnode, newword, swrdlen, &plevel,'p');
|
|
if (!prefix) break;
|
|
for(j=0;j<prefix->naff;j++) {
|
|
if ( CheckAffix(newword,swrdlen,prefix->aff[j], flag, pnewword) ) {
|
|
/* prefix success */
|
|
int ff=( prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT ) ?
|
|
0 : prefix->aff[j]->flag;
|
|
if ( FindWord(Conf, pnewword, ff, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
|
|
/* word search success */
|
|
*cur = pstrdup(pnewword);
|
|
cur++;
|
|
*cur=NULL;
|
|
}
|
|
}
|
|
}
|
|
pnode = prefix->node;
|
|
plevel++;
|
|
}
|
|
}
|
|
}
|
|
|
|
snode=suffix->node;
|
|
slevel++;
|
|
}
|
|
|
|
if (cur == forms) {
|
|
free(forms);
|
|
return (NULL);
|
|
}
|
|
return (forms);
|
|
}
|
|
|
|
/*
 * One way of splitting a compound word into stems.  Alternative splits
 * are chained through next.
 */
typedef struct SplitVar
{
    int         nstem;          /* number of entries used in stem[] */
    char      **stem;           /* the stems collected so far */
    struct SplitVar *next;      /* next variant, or NULL */
} SplitVar;
|
|
|
|
static int
|
|
CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len) {
|
|
while( (*ptr)->affix ) {
|
|
if ( len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len)==0 ) {
|
|
len = (*ptr)->len;
|
|
(*ptr)++;
|
|
return len;
|
|
}
|
|
(*ptr)++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Allocate a new SplitVar with room for MAX_NORM stems and next == NULL.
 *
 * If s is NULL the new variant has no stems.  Otherwise it receives the
 * stems of s: as pstrdup'd copies when makedup is non-zero, or as the very
 * same pointers when makedup is zero.
 */
static SplitVar*
CopyVar(SplitVar *s, int makedup) {
    SplitVar   *copy;
    int         idx;

    copy = (SplitVar *) palloc(sizeof(SplitVar));
    copy->stem = (char **) palloc(sizeof(char *) * (MAX_NORM));
    copy->next = NULL;

    if (!s)
    {
        copy->nstem = 0;
        return copy;
    }

    copy->nstem = s->nstem;
    for (idx = 0; idx < s->nstem; idx++)
    {
        if (makedup)
            copy->stem[idx] = pstrdup(s->stem[idx]);
        else
            copy->stem[idx] = s->stem[idx];
    }

    return copy;
}
|
|
|
|
|
|
static SplitVar*
|
|
SplitToVariants( IspellDict * Conf, SPNode *snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos ) {
|
|
SplitVar *var=NULL;
|
|
SPNodeData *StopLow, *StopHigh, *StopMiddle;
|
|
SPNode *node = (snode) ? snode : Conf->Dictionary;
|
|
int level=(snode) ? minpos : startpos; /* recursive minpos==level*/
|
|
int lenaff;
|
|
CMPDAffix *caff;
|
|
char notprobed[wordlen];
|
|
|
|
memset(notprobed,1,wordlen);
|
|
var = CopyVar(orig,1);
|
|
|
|
while( node && level<wordlen) {
|
|
StopLow = node->data;
|
|
StopHigh = node->data+node->length;
|
|
while (StopLow < StopHigh) {
|
|
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
|
|
if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
|
|
break;
|
|
} else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
|
|
StopLow = StopMiddle + 1;
|
|
} else {
|
|
StopHigh = StopMiddle;
|
|
}
|
|
}
|
|
if ( StopLow >= StopHigh )
|
|
break;
|
|
|
|
/* find word with epenthetic */
|
|
caff = Conf->CompoundAffix;
|
|
while ( level>startpos && (lenaff=CheckCompoundAffixes( &caff, word + level, wordlen - level ))>0 ) {
|
|
/* there is one of compound suffixes, so check word for existings */
|
|
char buf[MAXNORMLEN];
|
|
char **subres;
|
|
|
|
lenaff=level-startpos+lenaff;
|
|
|
|
if ( !notprobed[startpos+lenaff-1] )
|
|
continue;
|
|
|
|
if ( level+lenaff-1 <= minpos )
|
|
continue;
|
|
|
|
memcpy(buf, word+startpos, lenaff);
|
|
buf[lenaff]='\0';
|
|
|
|
subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX);
|
|
if ( subres ) {
|
|
/* Yes, it was a word from dictionary */
|
|
SplitVar *new=CopyVar(var,0);
|
|
SplitVar *ptr=var;
|
|
char **sptr=subres;
|
|
|
|
notprobed[startpos+lenaff-1]=0;
|
|
|
|
while(*sptr) {
|
|
new->stem[ new->nstem ] = *sptr;
|
|
new->nstem++;
|
|
sptr++;
|
|
}
|
|
free(subres);
|
|
|
|
while( ptr->next )
|
|
ptr = ptr->next;
|
|
ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos+lenaff, startpos+lenaff);
|
|
|
|
free(new->stem);
|
|
free(new);
|
|
}
|
|
}
|
|
|
|
/* find infinitive */
|
|
if ( StopMiddle->isword && StopMiddle->compoundallow && notprobed[level] ) {
|
|
/* ok, we found full compoundallowed word*/
|
|
if ( level>minpos ) {
|
|
/* and its length more than minimal */
|
|
if ( wordlen==level+1 ) {
|
|
/* well, it was last word */
|
|
var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
|
|
var->nstem++;
|
|
return var;
|
|
} else {
|
|
/* then we will search more big word at the same point */
|
|
SplitVar *ptr=var;
|
|
while( ptr->next )
|
|
ptr = ptr->next;
|
|
ptr->next=SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
|
|
/* we can find next word */
|
|
level++;
|
|
var->stem[ var->nstem ] = strndup(word + startpos, level - startpos);
|
|
var->nstem++;
|
|
node = Conf->Dictionary;
|
|
startpos=level;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
level++;
|
|
node=StopMiddle->node;
|
|
}
|
|
|
|
var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
|
|
var->nstem++;
|
|
return var;
|
|
}
|
|
|
|
char **
|
|
NINormalizeWord(IspellDict * Conf, char *word) {
|
|
char **res= NormalizeSubWord(Conf, word, 0);
|
|
|
|
if ( Conf->compoundcontrol != '\t' ) {
|
|
int wordlen=strlen(word);
|
|
SplitVar *ptr, *var = SplitToVariants(Conf,NULL,NULL, word, wordlen, 0, -1);
|
|
char **cur=res;
|
|
int i;
|
|
|
|
while(var) {
|
|
if ( var->nstem > 1 ) {
|
|
char **subres = NormalizeSubWord(Conf, var->stem[ var->nstem-1 ], FF_COMPOUNDWORD);
|
|
if ( subres ) {
|
|
char **ptr=subres;
|
|
|
|
if ( cur ) {
|
|
while(*cur)
|
|
cur++;
|
|
} else {
|
|
res=cur=(char **) palloc(MAX_NORM * sizeof(char *));
|
|
}
|
|
|
|
for(i=0;i<var->nstem-1;i++) {
|
|
*cur=var->stem[ i ];
|
|
cur++;
|
|
}
|
|
while(*ptr) {
|
|
*cur=*ptr;
|
|
cur++; ptr++;
|
|
}
|
|
*cur=NULL;
|
|
free(subres);
|
|
var->stem[ 0 ] = NULL;
|
|
}
|
|
}
|
|
|
|
for(i=0;i<var->nstem && var->stem[ i ];i++)
|
|
free( var->stem[i] );
|
|
ptr = var->next;
|
|
free(var->stem);
|
|
free(var);
|
|
var=ptr;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|
|
/*
 * Recursively free a dictionary tree node: first the child node of each of
 * its length entries, then the node itself.  A NULL node is ignored.
 */
static void freeSPNode(SPNode *node) {
    SPNodeData *entry;
    SPNodeData *end;

    if (node == NULL)
        return;

    end = node->data + node->length;
    for (entry = node->data; entry < end; entry++)
        freeSPNode(entry->node);

    free(node);
}
|
|
|
|
/*
 * Recursively free an affix tree node: for each of its length entries the
 * child node and, if the entry holds affixes (naff != 0), its aff array;
 * finally the node itself.  A NULL node is ignored.
 */
static void freeANode(AffixNode *node) {
    AffixNodeData *entry;
    AffixNodeData *end;

    if (node == NULL)
        return;

    end = node->data + node->length;
    for (entry = node->data; entry < end; entry++)
    {
        freeANode(entry->node);
        if (entry->naff)
            free(entry->aff);
    }

    free(node);
}
|
|
|
|
|
|
void
|
|
NIFree(IspellDict * Conf)
|
|
{
|
|
int i;
|
|
AFFIX *Affix = (AFFIX *) Conf->Affix;
|
|
char** aff = Conf->AffixData;
|
|
|
|
if ( aff ) {
|
|
while(*aff) {
|
|
free(*aff);
|
|
aff++;
|
|
}
|
|
free(Conf->AffixData);
|
|
}
|
|
|
|
|
|
for (i = 0; i < Conf->naffixes; i++)
|
|
{
|
|
if (Affix[i].compile == 0)
|
|
regfree(&(Affix[i].reg));
|
|
}
|
|
if (Conf->Spell) {
|
|
for (i = 0; i < Conf->nspell; i++)
|
|
free(Conf->Spell[i].word);
|
|
free(Conf->Spell);
|
|
}
|
|
|
|
if (Conf->Affix) free(Conf->Affix);
|
|
if ( Conf->CompoundAffix ) free(Conf->CompoundAffix);
|
|
freeSPNode(Conf->Dictionary);
|
|
freeANode(Conf->Suffix);
|
|
freeANode(Conf->Prefix);
|
|
memset((void *) Conf, 0, sizeof(IspellDict));
|
|
return;
|
|
}
|