/* postgresql/contrib/tsearch2/ispell/spell.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "postgres.h"

#include "spell.h"

/*
 * Longest word, in bytes, that NormalizeWord() will try to normalize.
 * CheckSuffix() and CheckPrefix() size their scratch buffers as
 * 2 * MAXNORMLEN.
 */
#define MAXNORMLEN 56

/*
 * Case-insensitive prefix test: compares only the first strlen(y) bytes of
 * x against y and yields 0 when they match.  y is evaluated twice.
 */
#define STRNCASECMP(x,y)		(strncasecmp(x,y,strlen(y)))

static int
cmpspell(const void *s1, const void *s2)
{
	return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word));
}

/*
 * Lower-case a NUL-terminated string in place, byte by byte, using
 * tolower().  Bytes are handled as unsigned char so that values above 127
 * are passed to tolower() safely.
 */
static void
strlower(char *str)
{
	unsigned char *p;

	for (p = (unsigned char *) str; *p != '\0'; p++)
		*p = tolower(*p);
}

/*
 * Backward string comparison for suffix tree operations.
 *
 * Compares s1 and s2 byte by byte starting from their last characters and
 * returns -1, 0 or 1.  If one string runs out first (it is a tail of the
 * other), the shorter string sorts first.
 */
static int
strbcmp(const char *s1, const char *s2)
{
	int			i1 = strlen(s1) - 1;
	int			i2 = strlen(s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* at least one string is exhausted; the one with bytes left is greater */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}
/*
 * Bounded backward string comparison: like comparing s1 and s2 from their
 * last characters toward the front, but looking at no more than "count"
 * bytes.  Returns 0 as soon as "count" bytes have matched; otherwise -1 or 1
 * on the first differing byte, or according to which string ran out first.
 */
static int
strbncmp(const char *s1, const char *s2, size_t count)
{
	int			i1 = strlen(s1) - 1;
	int			i2 = strlen(s2) - 1;
	int			left = count;

	while (i1 >= 0 && i2 >= 0 && left > 0)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
		i1--;
		i2--;
		left--;
	}

	/* the requested number of bytes all matched */
	if (left == 0)
		return 0;

	/* a string ran out before "count" bytes were compared */
	if (i1 != i2)
		return (i1 < i2) ? -1 : 1;

	return 0;
}

static int
cmpaffix(const void *s1, const void *s2)
{
	if (((const AFFIX *) s1)->type < ((const AFFIX *) s2)->type)
		return -1;
	if (((const AFFIX *) s1)->type > ((const AFFIX *) s2)->type)
		return 1;
	if (((const AFFIX *) s1)->type == 'p')
		return (strcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
	else
		return (strbcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl));
}

/*
 * Append one dictionary entry (a word plus its affix flags) to Conf->Spell.
 *
 * The Spell array grows in steps of 1024 * 20 entries when it is full.  The
 * word is copied with strdup().  The flag string is copied into the entry's
 * inline flag buffer and is always NUL-terminated, because FindWord()
 * searches it with strchr(); flag characters that do not fit are dropped.
 *
 * Raises ERROR (out of memory) if an allocation fails.  When growing the
 * array fails, Conf->Spell and Conf->mspell keep their previous values.
 * Returns 0 otherwise.
 */
int
AddSpell(IspellDict * Conf, const char *word, const char *flag)
{
	SPELL	   *entry;

	if (Conf->nspell >= Conf->mspell)
	{
		const int	step = 1024 * 20;
		SPELL	   *grown;

		if (Conf->mspell)
			grown = (SPELL *) realloc(Conf->Spell,
									  (Conf->mspell + step) * sizeof(SPELL));
		else
			grown = (SPELL *) malloc(step * sizeof(SPELL));

		/* only commit the new array and capacity once we know we have it */
		if (grown == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		Conf->Spell = grown;
		Conf->mspell += step;
	}

	entry = &Conf->Spell[Conf->nspell];
	entry->word = strdup(word);
	if (!entry->word)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/*
	 * strncpy() does not terminate the destination when the source fills
	 * it, and the array memory is not zeroed, so terminate explicitly.
	 */
	strncpy(entry->flag, flag, sizeof(entry->flag) - 1);
	entry->flag[sizeof(entry->flag) - 1] = '\0';

	Conf->nspell++;
	return (0);
}


/*
 * Load a word list from "filename" into Conf.
 *
 * Each line is read as "word" or "word/FLAGS".  FLAGS is the run of ASCII
 * letters that follows the slash; anything after that run is discarded.
 * The word part is lower-cased, cut at the first CR or LF, and handed to
 * AddSpell() together with the flags (an empty string when there is no
 * slash).
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
ImportDictionary(IspellDict * Conf, const char *filename)
{
	char		str[BUFSIZ];
	FILE	   *dict;

	if (!(dict = fopen(filename, "r")))
		return (1);
	while (fgets(str, sizeof(str), dict))
	{
		char	   *s;
		const char *flag = "";

		if ((s = strchr(str, '/')))
		{
			/* split the line at the slash: word before, flags after */
			*s++ = '\0';
			flag = s;

			/* keep the leading run of ASCII letters, cut off the rest */
			while ((*s >= 'A' && *s <= 'Z') || (*s >= 'a' && *s <= 'z'))
				s++;
			*s = '\0';
		}

		/* only the word is lower-cased; it already ends at the slash */
		strlower(str);

		/* strip the line terminator from a word that had no flags */
		for (s = str; *s; s++)
		{
			if (*s == '\r' || *s == '\n')
			{
				*s = '\0';
				break;
			}
		}
		AddSpell(Conf, str, flag);
	}
	fclose(dict);
	return (0);
}


/*
 * Look up "word" in the sorted Conf->Spell array.
 *
 * The search is confined to the [Left, Right] index range stored in
 * Conf->SpellTree for the first byte of the word.  On every step the middle,
 * left and right entries of the current range are probed, in that order.
 * An entry is accepted when its word equals "word" and either affixflag is
 * 0 or the entry's flag string contains affixflag.  If none is accepted, the
 * range is cut at the middle according to the middle comparison and also
 * shrunk by one entry at the other end.
 *
 * Returns the accepted entry, or NULL if there is none.
 */
static SPELL *
FindWord(IspellDict * Conf, const char *word, int affixflag)
{
	int			bucket = (int) (*word) & 255;
	int			lo = Conf->SpellTree.Left[bucket];
	int			hi = Conf->SpellTree.Right[bucket];

	if (lo == -1)
		return NULL;

	while (lo <= hi)
	{
		int			mid = (lo + hi) >> 1;
		int			probe[3];
		int			mid_cmp = 0;
		int			k;

		probe[0] = mid;
		probe[1] = lo;
		probe[2] = hi;

		for (k = 0; k < 3; k++)
		{
			SPELL	   *cand = &Conf->Spell[probe[k]];
			int			cmp = strcmp(cand->word, word);

			if (k == 0)
				mid_cmp = cmp;	/* steers the narrowing below */

			if (cmp == 0 &&
				(affixflag == 0 || strchr(cand->flag, affixflag) != NULL))
				return cand;
		}

		if (mid_cmp < 0)
		{
			lo = mid + 1;
			hi--;
		}
		else if (mid_cmp > 0)
		{
			hi = mid - 1;
			lo++;
		}
		else
		{
			/* same word but wrong flags: step inward from both ends */
			lo++;
			hi--;
		}
	}
	return NULL;
}

/*
 * Append one affix rule to Conf->Affix.
 *
 *	flag - affix flag the rule belongs to
 *	mask - regular expression text; stored as "mask$" when type is 's' and
 *		   as "^mask" otherwise
 *	find - text stored in the rule's find field
 *	repl - text stored in the rule's repl field; its length is cached in
 *		   replen
 *	type - 's' selects the end-anchored form of the mask
 *
 * The regular expression is not compiled here: the rule is stored with
 * compile = 1, and CheckSuffix()/CheckPrefix() compile it when they first
 * use the rule.
 *
 * Returns 0 on success.  Returns 1, adding nothing, when mask, find or repl
 * would not fit into the fixed-size fields of AFFIX (the strings come from a
 * file and may be up to BUFSIZ bytes long).  Raises ERROR (out of memory) if
 * the array cannot be grown; Conf->Affix and Conf->maffixes then keep their
 * previous values.
 */
int
AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type)
{
	AFFIX	   *affix;

	/* the stored mask needs one extra byte for the '^' or '$' anchor */
	if (strlen(mask) + 2 > sizeof(Conf->Affix->mask) ||
		strlen(find) + 1 > sizeof(Conf->Affix->find) ||
		strlen(repl) + 1 > sizeof(Conf->Affix->repl))
		return (1);

	if (Conf->naffixes >= Conf->maffixes)
	{
		const int	step = 16;
		AFFIX	   *grown;

		if (Conf->maffixes)
			grown = (AFFIX *) realloc((void *) Conf->Affix,
									  (Conf->maffixes + step) * sizeof(AFFIX));
		else
			grown = (AFFIX *) malloc(step * sizeof(AFFIX));

		/* only commit the new array and capacity once we know we have it */
		if (grown == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		Conf->Affix = grown;
		Conf->maffixes += step;
	}

	affix = &Conf->Affix[Conf->naffixes];

	if (type == 's')
		snprintf(affix->mask, sizeof(affix->mask), "%s$", mask);
	else
		snprintf(affix->mask, sizeof(affix->mask), "^%s", mask);
	affix->compile = 1;
	affix->flag = flag;
	affix->type = type;

	snprintf(affix->find, sizeof(affix->find), "%s", find);
	snprintf(affix->repl, sizeof(affix->repl), "%s", repl);
	affix->replen = strlen(repl);

	Conf->naffixes++;
	return (0);
}

/*
 * Copy src into dist, leaving out every space, tab and '-' character, and
 * NUL-terminate the result.  Returns dist.
 */
static char *
remove_spaces(char *dist, char *src)
{
	char	   *out = dist;
	const char *in;

	for (in = src; *in != '\0'; in++)
	{
		switch (*in)
		{
			case ' ':
			case '-':
			case '\t':
				/* dropped */
				break;
			default:
				*out++ = *in;
				break;
		}
	}
	*out = '\0';

	return dist;
}


/*
 * Load affix rules from "filename" into Conf.
 *
 * The file is processed line by line:
 *	- a line starting with "suffixes" or "prefixes" (any case) selects which
 *	  kind of rule the following lines define;
 *	- a line starting with "flag " sets the current flag to the first
 *	  character after any '*' and space characters;
 *	- other lines are ignored until a section has been selected; after that
 *	  everything from '#' on is dropped, the line is lower-cased and parsed
 *	  as "mask > find , repl".  When there is no comma, the single field
 *	  after '>' is used as repl and find is left empty.  Spaces, tabs and
 *	  '-' characters are removed from all three fields.  Lines that do not
 *	  yield at least a mask and one more field are skipped.
 *
 * Each accepted rule is passed to AddAffix() with type 's' in the suffixes
 * section and 'p' otherwise.
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
ImportAffixes(IspellDict * Conf, const char *filename)
{
	char		str[BUFSIZ];
	char		mask[BUFSIZ] = "";
	char		find[BUFSIZ] = "";
	char		repl[BUFSIZ] = "";
	unsigned char flag = 0;
	char	   *s;
	int			i;
	int			suffixes = 0;
	int			prefixes = 0;
	FILE	   *affix;

	if (!(affix = fopen(filename, "r")))
		return (1);

	while (fgets(str, sizeof(str), affix))
	{
		if (!STRNCASECMP(str, "suffixes"))
		{
			suffixes = 1;
			prefixes = 0;
			continue;
		}
		if (!STRNCASECMP(str, "prefixes"))
		{
			suffixes = 0;
			prefixes = 1;
			continue;
		}
		if (!STRNCASECMP(str, "flag "))
		{
			s = str + 5;

			/*
			 * strchr() treats the terminating NUL as part of the string it
			 * searches, so test for end of line explicitly; otherwise the
			 * scan would run past the end of the buffer.
			 */
			while (*s && strchr("* ", *s))
				s++;
			flag = (unsigned char) *s;
			continue;
		}
		if ((!suffixes) && (!prefixes))
			continue;
		if ((s = strchr(str, '#')))
			*s = 0;
		if (!*str)
			continue;
		strlower(str);
		strcpy(mask, "");
		strcpy(find, "");
		strcpy(repl, "");

		/*
		 * All three targets are as large as the source buffer, so the
		 * unbounded %[ conversions cannot overflow them.
		 */
		i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);

		/* str is no longer needed and serves as scratch space from here on */
		remove_spaces(str, repl);
		strcpy(repl, str);
		remove_spaces(str, find);
		strcpy(find, str);
		remove_spaces(str, mask);
		strcpy(mask, str);
		switch (i)
		{
			case 3:
				break;
			case 2:
				/* no comma: the single field is the replacement text */
				if (*find != '\0')
				{
					strcpy(repl, find);
					strcpy(find, "");
				}
				break;
			default:
				continue;
		}

		AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p');

	}
	fclose(affix);

	return (0);
}

/*
 * Sort Conf->Spell by word and rebuild Conf->SpellTree.
 *
 * After the call, SpellTree.Left[c] and SpellTree.Right[c] hold the first
 * and last index of the entries whose word starts with byte c, or -1 for
 * both when there is no such entry.  FindWord() reads both arrays for the
 * first byte of the word it looks up, so both are initialised here.
 */
void
SortDictionary(IspellDict * Conf)
{
	int			CurLet = -1,
				Let;
	size_t		i;

	/* nothing to sort (and possibly no array at all) with fewer than two */
	if (Conf->nspell > 1)
		qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);

	for (i = 0; i < 256; i++)
		Conf->SpellTree.Left[i] = Conf->SpellTree.Right[i] = -1;

	for (i = 0; i < Conf->nspell; i++)
	{
		Let = (int) (*(Conf->Spell[i].word)) & 255;
		if (CurLet != Let)
		{
			/* first entry of a new leading byte */
			Conf->SpellTree.Left[Let] = i;
			CurLet = Let;
		}
		/* overwritten until the last entry with this leading byte */
		Conf->SpellTree.Right[Let] = i;
	}
}

/*
 * Sort Conf->Affix with cmpaffix() and rebuild the prefix and suffix index
 * tables.
 *
 * Rules of type 'p' are indexed in Conf->PrefixTree by the first byte of
 * repl; all other rules are indexed in Conf->SuffixTree by the last byte of
 * repl (slot 0 when repl is empty).  Left[c]/Right[c] receive the first and
 * last array index seen for byte c, and stay -1 when there is none.
 */
void
SortAffixes(IspellDict * Conf)
{
	int			prevPrefix = -1;
	int			prevSuffix = -1;
	size_t		idx;

	if (Conf->naffixes > 1)
		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);

	for (idx = 0; idx < 256; idx++)
	{
		Conf->PrefixTree.Right[idx] = -1;
		Conf->PrefixTree.Left[idx] = -1;
		Conf->SuffixTree.Right[idx] = -1;
		Conf->SuffixTree.Left[idx] = -1;
	}

	for (idx = 0; idx < Conf->naffixes; idx++)
	{
		AFFIX	   *rule = &(((AFFIX *) Conf->Affix)[idx]);
		int			key;

		if (rule->type == 'p')
		{
			key = (int) (rule->repl[0]) & 255;
			if (key != prevPrefix)
			{
				Conf->PrefixTree.Left[key] = idx;
				prevPrefix = key;
			}
			Conf->PrefixTree.Right[key] = idx;
			continue;
		}

		/* suffix rule: keyed by the last byte of repl */
		if (rule->replen)
			key = (int) (rule->repl[rule->replen - 1]) & 255;
		else
			key = 0;

		if (key != prevSuffix)
		{
			Conf->SuffixTree.Left[key] = idx;
			prevSuffix = key;
		}
		Conf->SuffixTree.Right[key] = idx;
	}
}

/*
 * Try to undo one suffix rule on "word" (whose length is "len").
 *
 * *res receives the result of comparing the tail of the word with the
 * rule's repl text; anything but 0 means the rule does not apply and NULL is
 * returned.  Otherwise the last replen bytes of the word are replaced by the
 * rule's find text, the rule's regular expression is compiled if that has
 * not happened yet, and the candidate is accepted when it matches the
 * expression and FindWord() finds it with the rule's flag.
 *
 * Returns a pstrdup()'ed copy of the accepted candidate, or NULL.
 */
static char *
CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * Conf)
{
	regmatch_t	subs[2];		/* workaround for apache&linux */
	char		newword[2 * MAXNORMLEN] = "";
	int			err;

	*res = strbncmp(word, Affix->repl, Affix->replen);
	if (*res != 0)
		return NULL;

	/* build the candidate stem: word minus repl, plus find */
	strcpy(newword, word);
	strcpy(newword + len - Affix->replen, Affix->find);

	if (Affix->compile)
	{
		err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB);
		if (err)
		{
			/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
			regfree(&(Affix->reg));
			return (NULL);
		}
		Affix->compile = 0;
	}

	err = regexec(&(Affix->reg), newword, 1, subs, 0);
	if (err != 0)
		return NULL;

	if (FindWord(Conf, newword, Affix->flag) == NULL)
		return NULL;

	return pstrdup(newword);
}

/* CheckPrefix() sizes its regmatch_t array as NS * 2 */
#define NS 1
/*
 * Number of char * slots in the forms array allocated by NormalizeWord();
 * one slot is always kept for the terminating NULL.
 */
#define MAX_NORM 512
/*
 * Try to undo one prefix rule on "word" and append any normal forms found
 * to the forms array.
 *
 * The start of the word is compared with the rule's repl text; a non-zero
 * strncmp() result is returned as is so that the caller can steer its
 * search.  Otherwise the first replen bytes of the word are replaced by the
 * rule's find text and the candidate is matched against the rule's regular
 * expression (compiled here on first use).  On a match:
 *	- if FindWord() finds the candidate with the rule's flag, a copy of it is
 *	  appended to the forms array;
 *	- the first suffix rule indexed under "pi" in Conf->SuffixTree, if any,
 *	  is tried on the candidate with CheckSuffix(), and its result is
 *	  appended as well.
 *
 * "forms" is the start of the result array and "*cur" its current write
 * position; nothing is appended once MAX_NORM - 1 entries are in use, and
 * the array is kept NULL-terminated.  Returns 0 in every case except the
 * strncmp() mismatch described above.  "len" is not used.
 */
static int
CheckPrefix(const char *word, size_t len, AFFIX * Affix, IspellDict * Conf, int pi,
			char **forms, char ***cur)
{
	regmatch_t	subs[NS * 2];
	char		newword[2 * MAXNORMLEN] = "";
	AFFIX	   *rules = Conf->Affix;
	int			cmp;
	int			first_suffix;
	int			suffix_res;

	cmp = strncmp(word, Affix->repl, Affix->replen);
	if (cmp != 0)
		return cmp;

	/* build the candidate: find text followed by the word minus repl */
	strcpy(newword, Affix->find);
	strcat(newword, word + Affix->replen);

	if (Affix->compile)
	{
		if (regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB))
		{
			/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
			regfree(&(Affix->reg));
			return (0);
		}
		Affix->compile = 0;
	}

	if (regexec(&(Affix->reg), newword, 1, subs, 0) != 0)
		return 0;

	/* the candidate itself may be a dictionary word */
	if (FindWord(Conf, newword, Affix->flag) != NULL &&
		(*cur - forms) < (MAX_NORM - 1))
	{
		**cur = pstrdup(newword);
		(*cur)++;
		**cur = NULL;
	}

	/* or it may still carry a suffix */
	first_suffix = Conf->SuffixTree.Left[pi];
	if (first_suffix >= 0 && (*cur - forms) < (MAX_NORM - 1))
	{
		char	   *stem = CheckSuffix(newword, strlen(newword),
									   &rules[first_suffix], &suffix_res, Conf);

		**cur = stem;
		if (stem != NULL)
		{
			(*cur)++;
			**cur = NULL;
		}
	}
	return 0;
}


/*
 * Find the normal (dictionary) forms of "word".
 *
 * The word is lower-cased in place.  The result collects, in this order:
 *	- the word itself, if FindWord() finds it;
 *	- for two passes (suffix slot 0, then the slot of the word's last byte):
 *	  forms produced by CheckPrefix() for the prefix rules indexed under the
 *	  word's first byte, followed by forms produced by CheckSuffix() for the
 *	  suffix rules indexed under the current slot.
 * The same form may appear more than once.
 *
 * Returns a palloc()'ed, NULL-terminated array of at most MAX_NORM - 1
 * pstrdup()'ed strings, or NULL when the word is empty, longer than
 * MAXNORMLEN bytes, or has no normal form.
 */
char	  **
NormalizeWord(IspellDict * Conf, char *word)
{
	size_t		len;
	char	  **forms;
	char	  **cur;
	AFFIX	   *Affix;
	int			ri,
				pi,
				ipi,
				lp,
				rp,
				cp,
				ls,
				rs;
	int			cres;
	int			sres;			/* required by CheckSuffix(), unused here */

	len = strlen(word);

	/*
	 * An empty word has no last byte to index the suffix table with, and a
	 * zero "pi" would keep the "ipi += pi" loop below from ever advancing.
	 */
	if (len == 0 || len > MAXNORMLEN)
		return (NULL);

	strlower(word);

	forms = (char **) palloc(MAX_NORM * sizeof(char *));
	cur = forms;
	*cur = NULL;

	ri = (int) (*word) & 255;
	pi = (int) (word[len - 1]) & 255;
	Affix = (AFFIX *) Conf->Affix;

	/* Check that the word itself is normal form */
	if (FindWord(Conf, word, 0))
	{
		*cur = pstrdup(word);
		cur++;
		*cur = NULL;
	}

	/* Find all other NORMAL forms of the 'word' */

	/* two passes: ipi == 0, then ipi == pi (pi is non-zero here) */
	for (ipi = 0; ipi <= pi; ipi += pi)
	{

		/* check prefix */
		lp = Conf->PrefixTree.Left[ri];
		rp = Conf->PrefixTree.Right[ri];
		while (lp >= 0 && lp <= rp)
		{
			cp = (lp + rp) >> 1;
			cres = 0;
			if ((cur - forms) < (MAX_NORM - 1))
				cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur);
			if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1)))
				(void) CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur);
			if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1)))
				(void) CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur);

			/* only the middle comparison steers the narrowing */
			if (cres < 0)
			{
				rp = cp - 1;
				lp++;
			}
			else if (cres > 0)
			{
				lp = cp + 1;
				rp--;
			}
			else
			{
				lp++;
				rp--;
			}
		}

		/* check suffix */
		ls = Conf->SuffixTree.Left[ipi];
		rs = Conf->SuffixTree.Right[ipi];
		while (ls >= 0 && ls <= rs)
		{
			if (((cur - forms) < (MAX_NORM - 1)))
			{
				*cur = CheckSuffix(word, len, &Affix[ls], &sres, Conf);
				if (*cur)
				{
					cur++;
					*cur = NULL;
				}
			}
			if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1)))
			{
				*cur = CheckSuffix(word, len, &Affix[rs], &sres, Conf);
				if (*cur)
				{
					cur++;
					*cur = NULL;
				}
			}
			ls++;
			rs--;
		}						/* end while */

	}							/* for ipi */

	if (cur == forms)
	{
		pfree(forms);
		return (NULL);
	}
	return (forms);
}

/*
 * Release everything owned by Conf and reset it to all zeroes.
 *
 * Frees the compiled regular expression of every affix rule that has been
 * compiled (compile == 0), the word string of every dictionary entry, and
 * the Affix and Spell arrays themselves.
 */
void
FreeIspell(IspellDict * Conf)
{
	int			i;
	AFFIX	   *Affix = (AFFIX *) Conf->Affix;

	for (i = 0; i < Conf->naffixes; i++)
	{
		if (Affix[i].compile == 0)
			regfree(&(Affix[i].reg));
	}

	/* the Spell array holds nspell entries, not naffixes */
	for (i = 0; i < Conf->nspell; i++)
		free(Conf->Spell[i].word);

	free(Conf->Affix);
	free(Conf->Spell);
	memset((void *) Conf, 0, sizeof(IspellDict));
}