postgresql/contrib/tsearch/dict/porter_english.dct
Bruce Momjian 8fdc7814d0 Please apply the attached patch for contrib/tsearch to 7.2.1 and current
CVS. It fixes the English stemmer's problem with words ending like
'technology'.

We have found one more bug in the English stemmer. The bug is with
'irregular' English words like 'skies' -> 'sky'. Please apply the attached
cumulative patch to 7.2.1 and current CVS instead of the previous one.

Thanks to Thomas T. Thai <tom@minnesota.com> for thorough testing. This kind
of bug is significant only for dump/reload of a database and for viewing;
searching/indexing works correctly.

Teodor Sigaev
2002-03-05 06:10:28 +00:00

1286 lines
27 KiB
Plaintext

/*
* ----START-LICENCE----
* Copyright 1999,2000 BrightStation PLC
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
* -----END-LICENCE-----
*/
/* Version 1: see http://open.muscat.com/ for further information */
#ifdef DICT_BODY
#include <ctype.h> /* tolower */
/* Forward declarations of the three stemmer entry points that are defined
   further down in this DICT_BODY section. */
static void * setup_english_stemmer();
static const char * english_stem(void * z, const char * q, int i0, int i1);
static void closedown_english_stemmer(void * z);
/* To set up the english stemming process:
void * z = setup_english_stemmer();
to use it:
const char * p = english_stem(z, q, i0, i1);
The word to be stemmed is in byte address q offsets i0 to i1
inclusive (i.e. from q[i0] to q[i1]). The stemmed result is the
C string at address p.
To close down the stemming process:
closedown_english_stemmer(z);
*/
/* The English stemming algorithm is essentially the Porter stemming
* algorithm, and has been coded up by its author. It follows the algorithm
* presented in
*
* Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
* no. 3, pp 130-137,
*
* only differing from it at the points marked -DEPARTURE- and -NEW-
* below.
*
* For a more faithful version of the Porter algorithm, see
*
* http://www.muscat.com/~martin/stem.html
*
*/
/* Later additions:
June 2000
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
This follows a suggestion of Barry Wilkins, research student at
Birmingham.
February 2000
the cvc test for not dropping final -e now looks after vc at the
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
test c is any consonant, including w, x and y. This extension was
suggested by Chris Emerson.
-fully -> -ful treated like -fulness -> -ful, and
-tionally -> -tion treated like -tional -> -tion
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* A lookup pool: an array of pool_entry records together with its element
   count. create_pool() builds and sorts the array, search_pool() searches
   it and free_pool() releases it. */
struct pool {
int size; /* number of elements in entries */
struct pool_entry * entries; /* array allocated in create_pool() */
};
/* This is used as a library to resolve exceptions in the various
stemming algorithms. Typical use is,
struct pool * p = create_pool(t);
char * s_translated = search_pool(p, strlen(s), s);
...
free_pool(p);
t is an array of strings, e.g.
static char * t[] = {
"sky", "sky/skies/",
"die", "dying/",
"lie", "lying/",
"tie", "tying/",
....
0, 0
};
if s is "sky", "skies", "dying" etc., s_translated becomes "sky",
"sky", "die" etc.
The code includes a sort/merge capability which may be turned into
(or replaced by) something more general later on.
*/
/* merge(n, p, q, r, l, k, f) repeatedly merges n-byte sequences of items of
size k from addresses p and q into r. f is the comparison routine and
l is the limit point for q.
*/
/* One merge pass, as used by sort().
   n - length in bytes of each run being merged
   p - start of the first half of the input; that half ends where q starts
   q - start of the second half of the input
   r - address the merged output is written to
   l - address just past the end of the input
   k - size in bytes of one item
   f - f(a, b) is non-zero when the item at a is to be emitted ahead of
       the item at b
*/
static void merge(int n, char * p, char * q, char * r, char * l, int k,
int (*f)(char *, char *))
{ char * q0 = q; /* end of the first half: p is never advanced past this */
if (q0 > l) { memmove(r, p, l-p); return; } /* second half starts beyond the input: copy everything unchanged */
while (p < q0)
{ char * pl = n+p; /* end of the current run taken from the first half */
char * ql = n+q; /* end of the current run taken from the second half ... */
if (ql > l) ql = l; /* ... clipped to the end of the input */
while(true)
{ if (p >= pl) { memmove(r, q, ql-q); r += ql-q; q = ql; break; } /* first run used up: flush the rest of the second */
if (q >= ql) { memmove(r, p, pl-p); r += pl-p; p = pl; break; } /* second run used up: flush the rest of the first */
if (f(p, q)) { memmove(r, p, k); p += k; }
else { memmove(r, q, k); q += k; }
r += k; /* exactly one item was emitted by the if/else above */
}
}
memmove(r, q, l-q); /* copy whatever is left of the second half */
}
/* In sort(p, c, l, k, f), p+c is a byte address at which begins a sequence of
items of size k to be sorted. p+l is the address of the byte after the
last of these items, so l - c is divisible by k. f is a comparison function
for a pair of these items: f(p+i, q+j) is true if the item at p+i is before
the item at q+j, false if it is equal to or after it.
*/
/* Sorts in place the k-byte items stored from p+c up to (not including)
   p+l, by repeated merge passes with doubling run length.
   f(a, b) must be non-zero when the item at a belongs before the item at b.
   A work buffer of l-c bytes is allocated for the duration of the call; if
   that allocation fails a message is written to stderr and the process
   exits with status 1, instead of handing a null pointer to merge().
*/
static void sort(char * p, int c, int l, int k,
                 int (*f)(char *, char *))
{
    int w = l - c;      /* number of bytes to sort */
    int j = k;          /* current run length in bytes, doubled after each pass */
    char * q;           /* temporary work space */

    if (j >= w) return; /* at most one item: no pass would run, so skip the allocation */

    q = malloc(w);
    if (q == 0)
    {
        fprintf(stderr, "sort: out of memory\n");
        exit(1);
    }
    while (j < w)
    {
        int cycle;
        /* passes come in pairs (p -> q, then q -> p) so the result always
           finishes back in p */
        for (cycle = 1; cycle <= 2; cycle++)
        {
            int h = (w+j-1) / j / 2 * j; /* half way */
            if (cycle == 1) merge(j, p+c, p+c+h, q, p+l, k, f);
            else merge(j, q, q+h, p+c, q+w, k, f);
            j *= 2;
        }
    }
    free(q);
}
/* One searchable form held in a pool. */
struct pool_entry {
const char * translation; /* string returned by search_pool() when this entry matches */
const char * pointer; /* first character of the form, inside a '/'-separated table string */
int length; /* number of characters of the form (the text at pointer is not zero terminated there) */
};
/* Writes one entry to stderr as "<form> --> <translation>" and a newline. */
static void print_entry(struct pool_entry * p)
{
    const char * form = p->pointer;
    int remaining = p->length;

    /* the form is delimited by its length, not by a zero byte, so it is
       emitted one character at a time */
    while (remaining-- > 0) fputc(*form++, stderr);
    fprintf(stderr, " --> %s\n", p->translation);
}
/* - debugging aid
static void print_pool(struct pool * p)
{ int i;
int size = p->size;
struct pool_entry * q = p->entries;
fprintf(stderr, "\nPool:\n");
for (i = 0; i < size; i++) print_entry(q+i);
}
*/
/* compare(p, q) is our comparison function, used for f above
*/
static int compare(char * char_p, char * char_q)
{ struct pool_entry * p = (struct pool_entry *) char_p;
struct pool_entry * q = (struct pool_entry *) char_q;
if (p->length == q->length) return memcmp(p->pointer, q->pointer, p->length) < 0;
return p->length < q->length;
}
/* Counts the '/' characters in every odd-indexed string of s (s[1], s[3],
   ...), stopping at the first odd slot that holds a null pointer. */
static int count_slashes(const char * s[])
{
    int total = 0;
    const char ** list;

    for (list = s + 1; *list != 0; list += 2)
    {
        const char * ch;
        for (ch = *list; *ch != 0; ch++)
        {
            if (*ch == '/') total++;
        }
    }
    return total;
}
/* Builds a searchable pool from a table of the form
       "p1", "s11/s12/.../",  "p2", "s21/.../",  ...,  0, 0
   Every '/'-terminated form sij becomes one entry whose translation is pi.
   The entries are then sorted with compare() and adjacent duplicates are
   reported on stderr as warnings.
   Returns a newly allocated pool, to be released with free_pool().
   Fatal conditions (a form list without its final '/', or an allocation
   failure) write a message to stderr and exit with status 1.
*/
static struct pool * create_pool(const char * s[])
{
    int size = count_slashes(s);
    struct pool_entry * entries = 0;
    struct pool_entry * next;
    struct pool * result;
    int i;

    /* malloc(0) may legitimately return a null pointer, so only allocate,
       and only treat null as failure, when there is something to store */
    if (size > 0)
    {
        entries = (struct pool_entry *) malloc(size * sizeof(struct pool_entry));
        if (entries == 0)
        {
            fprintf(stderr, "create_pool: out of memory\n");
            exit(1);
        }
    }

    next = entries;
    for (i = 1; s[i] != 0; i += 2)
    {
        const char * forms = s[i];
        int j0 = 0;     /* start of the form currently being scanned */
        int j;

        for (j = 0; ; j++)
        {
            if (forms[j] == 0)
            {
                if (j0 != j) { fprintf(stderr, "%s lacks final '/'\n", forms); exit(1); }
                break;
            }
            if (forms[j] == '/')
            {
                next->translation = s[i-1];
                next->pointer = forms + j0;
                next->length = j - j0;
                next++;
                j0 = j + 1;
            }
        }
    }

    /* zero or one entry is already in order */
    if (size > 1)
        sort((char *) entries, 0, (int) (size * sizeof(struct pool_entry)),
             (int) sizeof(struct pool_entry), compare);

    /* now validate the contents: after sorting, equal forms are adjacent */
    for (i = 1; i < size; i++)
    {
        struct pool_entry * cur = entries + i;
        struct pool_entry * prev = entries + i - 1;
        if (cur->length == prev->length &&
            memcmp(cur->pointer, prev->pointer, cur->length) == 0)
        {
            fprintf(stderr, "warning: "); print_entry(cur);
            fprintf(stderr, " and "); print_entry(prev);
        }
    }

    result = (struct pool *) malloc(sizeof(struct pool));
    if (result == 0)
    {
        fprintf(stderr, "create_pool: out of memory\n");
        exit(1);
    }
    result->entries = entries;
    result->size = size;
    return result;
}
/* Three-way comparison of the word (length, s) with a pool form
   (length_p, s_p): words of different length compare by length, words of
   equal length by memcmp. Negative, zero or positive as for memcmp. */
static int compare_to_pool(int length, const char * s, int length_p, const char * s_p)
{
    int by_length = length - length_p;

    return by_length != 0 ? by_length : memcmp(s, s_p, length);
}
static const char * search_pool(struct pool * p, int length, char * s)
{ int i = 0;
int j = p->size;
struct pool_entry * q = p->entries;
if (j == 0) return 0; /* empty pool */
if (compare_to_pool(length, s, q->length, q->pointer) < 0) return 0;
while(true)
{
int h = (i+j)/2;
int diff = compare_to_pool(length, s, (q+h)->length, (q+h)->pointer);
if (diff == 0) return (q+h)->translation;
if (j-i <= 1) return 0;
if (diff < 0) j = h; else i = h;
}
}
/* Releases a pool: first its entry array, then the pool record itself. */
static void free_pool(struct pool * p)
{
    struct pool_entry * entries = p->entries;

    free(entries);
    free(p);
}
/* Working state of one stemmer instance, created by
   setup_english_stemmer() and released by closedown_english_stemmer(). */
struct english_stemmer
{
char * p; /* heap buffer holding the word being stemmed; allocated in english_stem() */
int p_size; /* number of bytes allocated at p */
int k; /* index in p of the last character of the word */
int j; /* index in p of the last character of the stem; ends() sets it to the position just before a matched suffix */
struct pool * irregulars; /* pool built from irregular_forms */
};
/* The main part of the stemming algorithm starts here. z->p is a buffer
holding a word to be stemmed. The letters are in z->p[0], z->p[1] ...
ending at z->p[z->k]. z->k is readjusted downwards as the stemming
progresses. Zero termination is not in fact used in the algorithm.
Note that only lower case sequences are stemmed. Forcing to lower case
should be done before english_stem(...) is called.
We will write p, k etc in place of z->p, z->k in the comments.
*/
/* cons(z, i) is true <=> p[i] is a consonant.
*/
static int cons(struct english_stemmer * z, int i)
{
    char ch = z->p[i];

    if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u') return false;
    if (ch != 'y') return true;
    /* 'y' is a consonant at the start of the word, and otherwise exactly
       when the letter before it is not a consonant */
    if (i == 0) return true;
    return !cons(z, i - 1);
}
/* m(z) measures the number of consonant sequences between 0 and j. if c is
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
*/
static int m(struct english_stemmer * z)
{
    int last = z->j;    /* the measure covers p[0] .. p[j] */
    int count = 0;
    int i = 0;

    /* optional leading consonant sequence <c> */
    while (i <= last && cons(z, i)) i++;
    if (i > last) return count;
    i++;                /* step over the first vowel */

    for (;;)
    {
        /* remainder of the vowel sequence */
        while (i <= last && !cons(z, i)) i++;
        if (i > last) return count;
        i++;            /* step over the consonant that ended it ... */
        count++;        /* ... which completes one more vc pair */

        /* remainder of the consonant sequence */
        while (i <= last && cons(z, i)) i++;
        if (i > last) return count;
        i++;            /* step over the vowel that ended it */
    }
}
/* vowelinstem(z) is true <=> p[0], ... p[j] contains a vowel.
*/
static int vowelinstem(struct english_stemmer * z)
{
    int i = 0;

    /* scan the stem p[0] .. p[j] for the first non-consonant */
    while (i <= z->j)
    {
        if (!cons(z, i)) return true;
        i++;
    }
    return false;
}
/* doublec(z, i) is true <=> p[i], p[i - 1] contain a double consonant.
*/
static int doublec(struct english_stemmer * z, int i)
{
    /* needs two positions, holding the same letter, and that letter must
       be a consonant */
    return i >= 1 && z->p[i] == z->p[i - 1] && cons(z, i);
}
/* cvc(z, i) is true <=>
a) ( -NEW- ) i == 1, and p[0] p[1] is vowel consonant, or
b) p[i - 2], p[i - 1], p[i] has the form consonant -
vowel - consonant and also if the second c is not w, x or y. this is used
when trying to restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
*/
static int cvc(struct english_stemmer * z, int i)
{
    char last;

    switch (i)
    {
    case 0:
        return false; /* i == 0 never happens perhaps */
    case 1:
        /* -NEW-: a two letter word of the form vowel - consonant */
        return !cons(z, 0) && cons(z, 1);
    }

    /* consonant - vowel - consonant at i - 2, i - 1, i */
    if (!(cons(z, i) && !cons(z, i - 1) && cons(z, i - 2))) return false;

    /* ... where the final consonant is none of w, x, y */
    last = z->p[i];
    return !(last == 'w' || last == 'x' || last == 'y');
}
/* ends(z, s, length) is true <=> p[0], ... p[k] ends with the string s.
*/
static int ends(struct english_stemmer * z, const char * s, int length)
{
    int stem_end = z->k - length;   /* index of the character just before the suffix */

    if (stem_end < -1) return false;    /* the word is shorter than s */
    if (memcmp(z->p + stem_end + 1, s, length) != 0) return false;

    /* only a successful match moves j */
    z->j = stem_end;
    return true;
}
/* setto(z, s, length) sets p[j + 1] ... to the characters in the string s,
readjusting k.
*/
static void setto(struct english_stemmer * z, const char * s, int length)
{
    char * after_stem = z->p + z->j + 1;    /* first position following the stem */

    memmove(after_stem, s, length);
    z->k = z->j + length;                   /* the word now ends with s */
}
/* r(z, s, length) is used further down. */
static void r(struct english_stemmer * z, const char * s, int length)
{
    /* replace the suffix only when the stem has measure greater than 0 */
    if (m(z) <= 0) return;
    setto(z, s, length);
}
/* step_1ab(z) gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
sties -> sti
tie -> tie (-NEW-: see below)
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
*/
static void step_1ab(struct english_stemmer * z)
{
    /* plurals */
    if (z->p[z->k] == 's')
    {
        if (ends(z, "sses", 4))
        {
            z->k -= 2;
        }
        else if (ends(z, "ies", 3))
        {
            /* this extends the original algorithm, so that 'flies'->'fli'
               but 'dies'->'die' etc */
            z->k -= (z->j == 0) ? 1 : 2;
        }
        else if (z->p[z->k - 1] != 's')
        {
            z->k--;
        }
    }

    if (ends(z, "ied", 3))
    {
        /* this extends the original algorithm, so that 'spied'->'spi' but
           'died'->'die' etc */
        z->k -= (z->j == 0) ? 1 : 2;
        return;
    }
    if (ends(z, "eed", 3))
    {
        if (m(z) > 0) z->k--;
        return;
    }

    /* -ed / -ing are removed only when the remaining stem has a vowel */
    if (!((ends(z, "ed", 2) || ends(z, "ing", 3)) && vowelinstem(z))) return;
    z->k = z->j;

    /* tidy up the stem that is left */
    if (ends(z, "at", 2)) { setto(z, "ate", 3); return; }
    if (ends(z, "bl", 2)) { setto(z, "ble", 3); return; }
    if (ends(z, "iz", 2)) { setto(z, "ize", 3); return; }
    if (doublec(z, z->k))
    {
        int ch;

        /* drop one letter of the doubled consonant, except for l, s, z */
        z->k--;
        ch = z->p[z->k];
        if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
        return;
    }
    if (m(z) == 1 && cvc(z, z->k)) setto(z, "e", 1);
}
/* step_1c(z) turns terminal y to i when there is another vowel in the stem.
-NEW-: This has been modified from the original Porter algorithm so that y->i
is only done when y is preceded by a consonant, but not if the stem
is only a single consonant, i.e.
(*c and not c) Y -> I
So 'happy' -> 'happi', but
'enjoy' -> 'enjoy' etc
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
no longer really matters.
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
'flies' ...
*/
static void step_1c(struct english_stemmer * z)
{
    if (!ends(z, "y", 1)) return;
    if (z->j <= 0) return;              /* the stem must be longer than one letter */
    if (!cons(z, z->k - 1)) return;     /* the y must follow a consonant */
    z->p[z->k] = 'i';
}
/* step_2(z) maps double suffices to single ones. so -ization ( = -ize plus
-ation) maps to -ize etc. Note that the string before the suffix must give
m(z) > 0.
*/
/* Rewrites a matching double suffix to its single form, provided the stem
   before the suffix has m(z) > 0. The rules are selected by the
   penultimate letter p[k - 1], so a one-letter word is returned untouched
   without being examined.
*/
static void step_2(struct english_stemmer * z)
{
    /* step_1ab() can cut a word down to a single letter (e.g. "ies" and
       "ied" both become "i", k == 0). p[k - 1] would then be read from
       before the start of the buffer. No suffix handled here fits into
       such a word, so leaving early does not change any result. */
    if (z->k < 1) return;

    switch (z->p[z->k - 1])
    {
    case 'a':
        if (ends(z, "ational", 7)) { r(z, "ate", 3); break; }
        if (ends(z, "tional", 6)) { r(z, "tion", 4); break; }
        break;
    case 'c':
        if (ends(z, "enci", 4)) { r(z, "ence", 4); break; }
        if (ends(z, "anci", 4)) { r(z, "ance", 4); break; }
        break;
    case 'e':
        if (ends(z, "izer", 4)) { r(z, "ize", 3); break; }
        break;
    case 'l':
        if (ends(z, "bli", 3)) { r(z, "ble", 3); break; } /*-DEPARTURE-*/
        /* To match the published algorithm, replace this line with
           case 'l':
           if (ends(z, "abli", 4)) { r(z, "able", 4); break; }
        */
        if (ends(z, "alli", 4))
        {
            /* after -alli -> -al, run the step again on the shortened word */
            if (m(z) > 0) { setto(z, "al", 2); step_2(z); } /*-NEW-*/
            break;
        }
        if (ends(z, "fulli", 5)) { r(z, "ful", 3); break; } /*-NEW-*/
        if (ends(z, "entli", 5)) { r(z, "ent", 3); break; }
        if (ends(z, "eli", 3)) { r(z, "e", 1); break; }
        if (ends(z, "ousli", 5)) { r(z, "ous", 3); break; }
        break;
    case 'o':
        if (ends(z, "ization", 7)) { r(z, "ize", 3); break; }
        if (ends(z, "ation", 5)) { r(z, "ate", 3); break; }
        if (ends(z, "ator", 4)) { r(z, "ate", 3); break; }
        break;
    case 's':
        if (ends(z, "alism", 5)) { r(z, "al", 2); break; }
        if (ends(z, "iveness", 7)) { r(z, "ive", 3); break; }
        if (ends(z, "fulness", 7)) { r(z, "ful", 3); break; }
        if (ends(z, "ousness", 7)) { r(z, "ous", 3); break; }
        break;
    case 't':
        if (ends(z, "aliti", 5)) { r(z, "al", 2); break; }
        if (ends(z, "iviti", 5)) { r(z, "ive", 3); break; }
        if (ends(z, "biliti", 6)) { r(z, "ble", 3); break; }
        break;
    case 'g':
        if (ends(z, "logi", 4))
        {
            /* move the 'l' into the stem before measuring it */
            z->j++; /*-NEW-*/ /*(Barry Wilkins)*/
            r(z, "og", 2); break;
        } /*-DEPARTURE-*/
        /* To match the published algorithm, delete this line */
        break;
    default:
        break;
    }
}
/* step_3(z) deals with -ic-, -ful, -ness etc. Similar strategy to step_2.
*/
static void step_3(struct english_stemmer * z)
{
    /* suffix -> replacement, tried in this order; the first suffix that
       matches is the only rule applied */
    static const struct
    {
        const char * suffix;
        int suffix_length;
        const char * replacement;
        int replacement_length;
    } rules[] =
    {
        { "icate", 5, "ic", 2 },
        { "ative", 5, "",   0 },
        { "alize", 5, "al", 2 },
        { "iciti", 5, "ic", 2 },
        { "ical",  4, "ic", 2 },
        { "ful",   3, "",   0 },
        { "ness",  4, "",   0 }
    };
    const int rule_count = (int) (sizeof rules / sizeof rules[0]);
    char last = z->p[z->k];
    int i;

    for (i = 0; i < rule_count; i++)
    {
        /* cheap pre-filter on the final letter before the full comparison */
        if (rules[i].suffix[rules[i].suffix_length - 1] != last) continue;
        if (!ends(z, rules[i].suffix, rules[i].suffix_length)) continue;
        r(z, rules[i].replacement, rules[i].replacement_length);
        return;
    }
}
/* step_4() takes off -ant, -ence etc., in context <c>vcvc<v>.
*/
/* Removes one of the suffixes below when the stem in front of it has
   m(z) > 1. The candidate suffixes are selected by the penultimate letter
   p[k - 1], so a one-letter word is returned untouched without being
   examined.
*/
static void step_4(struct english_stemmer * z)
{
    /* earlier steps can leave a single letter (k == 0); p[k - 1] would then
       be read from before the start of the buffer. Every suffix below is at
       least two letters long, so nothing could match anyway. */
    if (z->k < 1) return;

    switch (z->p[z->k - 1])
    {
    case 'a':
        if (ends(z, "al", 2)) break;
        return;
    case 'c':
        if (ends(z, "ance", 4)) break;
        if (ends(z, "ence", 4)) break;
        return;
    case 'e':
        if (ends(z, "er", 2)) break;
        return;
    case 'i':
        if (ends(z, "ic", 2)) break;
        return;
    case 'l':
        if (ends(z, "able", 4)) break;
        if (ends(z, "ible", 4)) break;
        return;
    case 'n':
        if (ends(z, "ant", 3)) break;
        if (ends(z, "ement", 5)) break;
        if (ends(z, "ment", 4)) break;
        if (ends(z, "ent", 3)) break;
        return;
    case 'o':
        /* -ion is removed only after s or t. When the whole word is "ion"
           the stem is empty (j == -1) and there is no preceding letter to
           look at. */
        if (ends(z, "ion", 3) && z->j >= 0 &&
            (z->p[z->j] == 's' || z->p[z->j] == 't')) break;
        if (ends(z, "ou", 2)) break;
        return;
        /* takes care of -ous */
    case 's':
        if (ends(z, "ism", 3)) break;
        return;
    case 't':
        if (ends(z, "ate", 3)) break;
        if (ends(z, "iti", 3)) break;
        return;
    case 'u':
        if (ends(z, "ous", 3)) break;
        return;
    case 'v':
        if (ends(z, "ive", 3)) break;
        return;
    case 'z':
        if (ends(z, "ize", 3)) break;
        return;
    default:
        return;
    }
    /* a suffix matched and ends() left j just before it */
    if (m(z) > 1) z->k = z->j;
}
/* step_5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
m(z) > 1.
*/
static void step_5(struct english_stemmer * z)
{
    z->j = z->k;    /* measure the whole word from here on */

    if (z->p[z->k] == 'e')
    {
        int measure = m(z);
        int drop_e = measure > 1;

        /* with measure 1 the e goes only if what precedes it is not cvc */
        if (!drop_e && measure == 1) drop_e = !cvc(z, z->k - 1);
        if (drop_e) z->k--;
    }

    /* -ll -> -l */
    if (z->p[z->k] != 'l') return;
    if (!doublec(z, z->k)) return;
    if (m(z) > 1) z->k--;
}
/* Stems the word q[i0] .. q[i1] (inclusive), which must already be in
   lower case.
   z_ - stemmer state obtained from setup_english_stemmer()
   Returns a zero-terminated string: either an entry of the irregular forms
   table, or the stemmer's own buffer z->p (overwritten by the next call).
   On return z->k is the index of the last character of the result.
   Returns 0 if the working buffer has to grow and the allocation fails; the
   stemmer state is left unchanged in that case.
*/
static const char * english_stem(void * z_, const char * q, int i0, int i1)
{
    struct english_stemmer * z = (struct english_stemmer *) z_;
    int length = i1 - i0 + 1;

    /* treat an inverted range as an empty word rather than passing a
       negative count to memmove */
    if (length < 0) length = 0;

    if (length + 49 > z->p_size)
    {
        /* allocate the replacement before giving up the old buffer, so that
           p and p_size never disagree if the allocation fails */
        int new_size = length + 74; /* ample */
        char * new_p = (char *) malloc(new_size);

        if (new_p == 0) return 0;
        free(z->p);
        z->p = new_p;
        z->p_size = new_size;
    }
    memmove(z->p, q + i0, length);
    z->k = length - 1;

    /* irregular forms bypass the algorithm altogether */
    {
        const char * t = search_pool(z->irregulars, z->k + 1, z->p);
        if (t != 0)
        {
            z->k = strlen(t) - 1;
            return t;
        }
    }

    if (z->k > 1) /*-DEPARTURE-*/
    /* With this line, strings of length 1 or 2 don't go through the
       stemming process, although no mention is made of this in the
       published algorithm. Remove the line to match the published
       algorithm. */
    {
        step_1ab(z); step_1c(z);
        step_2(z);
        step_3(z);
        step_4(z);
        step_5(z);
    }
    z->p[z->k + 1] = 0; /* C string form for now */
    return z->p;
}
/* -NEW-
This is a table of irregular forms. It is quite short, but still
reflects the errors actually drawn to Martin Porter's attention over
a 20 year period!
Extend it as necessary.
The form of the table is:
"p1" "s11/s12/s13/ ... /"
"p2" "s21/s22/s23/ ... /"
...
"pn" "sn1/sn2/sn3/ ... /"
0, 0
String sij is mapped to paradigm form pi, and the main stemming
process is then bypassed.
*/
/* Pairs of (paradigm form, '/'-terminated list of forms mapped to it),
   closed by a pair of null pointers. setup_english_stemmer() passes this
   table to create_pool(). */
static const char * irregular_forms[] = {
"sky", "sky/skies/",
"die", "dying/",
"lie", "lying/",
"tie", "tying/",
"news", "news/",
"inning", "innings/inning/",
"outing", "outings/outing/",
"canning", "cannings/canning/",
"howe", "howe/",
/*-NEW-*/
"proceed", "proceed/",
"exceed", "exceed/",
"succeed", "succeed/", /* Hiranmay Ghosh */
0, 0 /* terminator */
};
/*
* is_stopword part
*/
/* One node of the stop-word search array engstoptree, as walked by
   find_english_stopword(). All links are forward offsets counted in nodes. */
typedef struct {
unsigned char val; /* character this node is compared with */
unsigned char flag; /* combination of the L and F flag bits */
unsigned char right; /* offset to the node tried next when the input character is greater than val; 0 = none */
unsigned char child; /* offset to the node for the following input character once val has matched; 0 = none */
} ESWNODE;
/* flag bit: a left subtree exists (its root is the next array element) */
#define L 0x01
/* flag bit: a word finishes at this node */
#define F 0x02
/* The argument is parenthesised so that the cast applies to the whole
   expression passed in, not just to its first operand. */
#define ISLEFT(x) (((ESWNODE*)(x))->flag & L)
#define ISFINISH(x) (((ESWNODE*)(x))->flag & F)
/* Stop-word search structure walked by find_english_stopword(), starting
   at element 0. Each initializer is {val, flag, right, child} in the field
   order of ESWNODE; right and child are forward offsets from the node's
   own index, and a node flagged L has its left alternative stored in the
   element immediately following it.
   NOTE(review): the table looks machine generated - presumably it should be
   regenerated rather than edited by hand; confirm with the tool that
   produced it. */
static ESWNODE engstoptree[] = {
{'m',L,9,126},
{'d',L,4,71},
{'b',L,2,40},
{'a',F,0,14},
{'c',0,0,62},
{'f',L,2,79},
{'e',0,0,75},
{'h',0,1,90},
{'i',F,0,108},
{'t',L,4,177},
{'o',L,2,135},
{'n',0,0,131},
{'s',0,0,156},
{'v',L,2,210},
{'u',0,0,201},
{'w',0,1,211},
{'y',0,0,237},
{'m',L|F,5,0},
{'f',L,2,12},
{'b',0,0,7},
{'g',0,1,13},
{'l',0,0,17},
{'r',L,2,19},
{'n',F,0,16},
{'s',F,1,0},
{'t',F,0,0},
{'o',0,0,1},
{'u',0,1,2},
{'v',F,0,0},
{'t',F,0,0},
{'t',0,0,1},
{'e',0,0,1},
{'r',F,0,0},
{'a',0,0,1},
{'i',0,0,1},
{'n',F,0,1},
{'s',0,0,1},
{'t',F,0,0},
{'l',F,0,0},
{'d',F,1,0},
{'i',F,0,0},
{'e',F,0,0},
{'o',L,2,21},
{'e',F,0,3},
{'u',0,1,21},
{'y',F,0,0},
{'f',L,3,9},
{'c',0,1,4},
{'e',0,0,6},
{'l',0,1,8},
{'t',0,0,9},
{'a',0,0,1},
{'u',0,0,1},
{'s',F,0,0},
{'n',F,0,0},
{'o',0,0,1},
{'r',F,0,0},
{'o',0,0,1},
{'w',F,0,0},
{'w',0,0,1},
{'e',0,0,1},
{'e',0,0,1},
{'n',F,0,0},
{'t',0,0,1},
{'h',F,0,0},
{'t',F,0,0},
{'a',0,1,2},
{'o',0,0,2},
{'n',F,0,0},
{'u',0,0,1},
{'l',0,0,1},
{'d',F,0,0},
{'o',L|F,2,4},
{'i',0,0,2},
{'u',0,0,5},
{'d',F,0,0},
{'e',F,1,0},
{'w',0,0,1},
{'n',F,0,0},
{'r',0,0,1},
{'e',F,0,0},
{'a',0,0,1},
{'c',0,0,1},
{'h',F,0,0},
{'o',L,2,5},
{'e',0,0,3},
{'r',0,1,4},
{'u',0,0,5},
{'w',F,0,0},
{'r',F,0,0},
{'o',0,0,1},
{'m',F,0,0},
{'r',0,0,1},
{'t',0,0,1},
{'h',0,0,1},
{'e',0,0,1},
{'r',F,0,0},
{'e',L|F,2,7},
{'a',F,0,3},
{'i',F,1,11},
{'o',0,0,15},
{'d',F,1,0},
{'v',0,0,1},
{'e',F,0,0},
{'r',F,0,1},
{'e',F,1,0},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'f',F,0,0},
{'m',F,0,1},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'f',F,0,0},
{'w',F,0,0},
{'n',L|F,2,4},
{'f',F,0,0},
{'s',F,1,0},
{'t',F,0,3},
{'t',0,0,1},
{'o',F,0,0},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'f',F,0,0},
{'o',L,3,6},
{'a',0,1,4},
{'e',F,0,0},
{'u',0,1,7},
{'y',F,0,8},
{'y',F,0,0},
{'r',0,1,2},
{'s',0,0,2},
{'e',F,0,0},
{'t',F,0,0},
{'s',0,0,1},
{'t',F,0,0},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'f',F,0,0},
{'o',F,0,1},
{'r',F,1,0},
{'t',F,0,0},
{'t',L,4,11},
{'n',L|F,2,7},
{'f',F,0,5},
{'r',F,0,0},
{'v',L,2,16},
{'u',0,0,9},
{'w',0,0,16},
{'f',F,0,0},
{'c',F,1,0},
{'l',0,0,1},
{'i',F,0,0},
{'h',0,0,1},
{'e',0,0,1},
{'r',F,0,0},
{'r',F,1,2},
{'t',F,0,0},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'v',F,0,0},
{'e',0,0,1},
{'r',F,0,0},
{'n',F,0,0},
{'h',L,2,6},
{'a',0,0,3},
{'o',F,1,12},
{'u',0,0,13},
{'m',0,0,1},
{'e',F,0,0},
{'e',L|F,2,0},
{'a',0,0,2},
{'o',0,0,3},
{'l',0,0,1},
{'l',F,0,0},
{'u',0,0,1},
{'l',0,0,1},
{'d',F,0,0},
{'m',0,0,1},
{'e',F,0,0},
{'c',0,0,1},
{'h',F,0,0},
{'h',0,1,2},
{'o',F,0,27},
{'i',L|F,3,0},
{'a',0,1,4},
{'e',F,0,5},
{'o',0,1,17},
{'r',0,0,18},
{'n',F,1,0},
{'t',F,0,0},
{'n',L|F,3,0},
{'i',0,1,5},
{'m',F,0,5},
{'s',L,2,9},
{'r',0,0,7},
{'y',F,0,0},
{'r',F,0,0},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'v',F,0,0},
{'e',F,0,0},
{'e',F,0,0},
{'s',0,0,1},
{'e',F,0,0},
{'o',0,0,1},
{'u',0,0,1},
{'g',0,0,1},
{'h',F,0,0},
{'o',F,0,0},
{'n',0,1,2},
{'p',F,0,0},
{'d',0,1,2},
{'t',0,0,3},
{'e',0,0,1},
{'r',F,0,0},
{'i',0,0,1},
{'l',F,0,0},
{'e',0,0,1},
{'r',0,0,1},
{'i',F,0,0},
{'h',L,3,7},
{'a',F,1,0},
{'e',F,0,3},
{'i',0,1,17},
{'o',0,0,20},
{'r',0,0,1},
{'e',F,0,0},
{'e',L,2,5},
{'a',0,0,3},
{'i',F,1,6},
{'o',F,0,9},
{'t',F,0,0},
{'n',F,1,0},
{'r',0,0,1},
{'e',F,0,0},
{'c',0,1,2},
{'l',0,0,2},
{'h',F,0,0},
{'e',F,0,0},
{'m',F,0,0},
{'l',0,1,2},
{'t',0,0,2},
{'l',F,0,0},
{'h',F,0,0},
{'u',0,0,1},
{'l',0,0,1},
{'d',F,0,0},
{'o',0,0,1},
{'u',F,0,1},
{'r',F,0,1},
{'s',0,0,1},
{'e',0,0,1},
{'l',0,0,1},
{'f',F,1,0},
{'v',F,0,0}
};
/* Walks engstoptree over buf[0 .. len-1] and returns the length of the
   longest prefix of buf at which a node carrying the F flag was matched,
   or 0 when there is none. */
static unsigned int
find_english_stopword( unsigned char *buf, int len ) {
    ESWNODE *node = engstoptree;
    int matched = 0;
    int pos = 0;

    while ( pos < len ) {
        unsigned char ch = buf[pos];

        if ( node->val == ch ) {
            /* character consumed: remember a finished word, then descend */
            pos++;
            if ( ISFINISH(node) ) matched = pos;
            if ( node->child == 0 ) break;
            node += node->child;
            continue;
        }
        if ( node->val > ch ) {
            /* smaller characters live in the node that follows directly */
            if ( !ISLEFT(node) ) break;
            node++;
        } else {
            /* larger characters are reached through the right offset */
            if ( node->right == 0 ) break;
            node += node->right;
        }
    }
    return matched;
}
#undef L
#undef F
#undef ISLEFT
#undef ISFINISH
/* Returns 1 when the whole of word[0 .. len-1] is matched by
   find_english_stopword(), 0 otherwise. obj is not used. */
static int
is_stopengword(void* obj,char* word,int len) {
    unsigned int matched = find_english_stopword((unsigned char*)word, len);

    /* len is compared as unsigned, as in a direct int/unsigned comparison */
    if ( (unsigned int) len == matched ) return 1;
    return 0;
}
/* Allocates a stemmer instance with an empty word buffer and builds its
   pool of irregular forms.
   Returns the instance, to be passed to english_stem() and finally to
   closedown_english_stemmer(), or 0 if the instance cannot be allocated.
*/
static void * setup_english_stemmer()
{
    struct english_stemmer * z = (struct english_stemmer *) malloc(sizeof(struct english_stemmer));

    if (z == 0) return 0;   /* do not write through a failed allocation */

    /* the buffer is created on the first call of english_stem() */
    z->p = 0; z->p_size = 0;
    /* no word has been stemmed yet; give the indices a defined value */
    z->k = 0; z->j = 0;
    z->irregulars = create_pool(irregular_forms);
    return (void *) z;
}
/* Releases everything owned by a stemmer instance: the word buffer, the
   pool of irregular forms and the instance itself. */
static void closedown_english_stemmer(void * z_)
{
    struct english_stemmer * stemmer = (struct english_stemmer *) z_;

    free(stemmer->p);
    free_pool(stemmer->irregulars);
    free(stemmer);
}
/* Lower-cases word[0 .. *len-1] in place, stems it with english_stem() and
   returns the stem in a block obtained from palloc(). *len is updated to
   the length of the stem; the returned bytes are not zero terminated. */
static char*
engstemming(void* obj, char *word, int *len) {
    struct english_stemmer * z = (struct english_stemmer *) obj;
    const char* stem;
    char *copy;
    int i;

    /* only lower case sequences are stemmed */
    for (i = 0; i < *len; i++)
        word[i] = tolower((unsigned char) word[i]);

    stem = english_stem(obj, word, 0, *len-1);
    *len = z->k + 1;    /* k is the index of the last character of the stem */
    copy = (char*)palloc( *len );
    memcpy(copy, stem, *len);
    return copy;
}
#endif /* DICT_BODY */
#ifdef DICT_TABLE
/* Dictionary table entry naming the functions defined in the DICT_BODY
   section above. NOTE(review): the meaning of each slot (including the "C"
   string and the NULL slot) is fixed by TABLE_DICT_START / TABLE_DICT_END,
   which are defined by the including file and are not visible here -
   confirm there. */
TABLE_DICT_START
"C",
setup_english_stemmer,
closedown_english_stemmer,
engstemming,
NULL,
is_stopengword
TABLE_DICT_END
#endif