mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-21 08:29:39 +08:00
8fdc7814d0
CVS. It fix english stemmer's problem with ending words like 'technology'. We have found one more bug in english stemmer. The bug is with 'irregular' english words like 'skies' -> 'sky'. Please, apply attached cumulative patch to 7.2.1 and current CVS instead previous one. Thank to Thomas T. Thai <tom@minnesota.com> for hard testing. This kind of bug has significance only for dump/reload database and viewing, but searching/indexing works right. Teodor Sigaev
1286 lines
27 KiB
Plaintext
1286 lines
27 KiB
Plaintext
/*
|
|
* ----START-LICENCE----
|
|
* Copyright 1999,2000 BrightStation PLC
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation; either version 2 of the
|
|
* License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
|
* USA
|
|
* -----END-LICENCE-----
|
|
*/
|
|
/* Version 1: see http://open.muscat.com/ for further information */
|
|
|
|
|
|
#ifdef DICT_BODY
|
|
#include <ctype.h> /* tolower */
|
|
|
|
static void * setup_english_stemmer();
|
|
|
|
static const char * english_stem(void * z, const char * q, int i0, int i1);
|
|
|
|
static void closedown_english_stemmer(void * z);
|
|
|
|
|
|
/* To set up the english stemming process:
|
|
|
|
void * z = setup_stemmer();
|
|
|
|
to use it:
|
|
|
|
char * p = stem(z, q, i0, i1);
|
|
|
|
The word to be stemmed is in byte address q offsets i0 to i1
|
|
inclusive (i.e. from q[i0] to q[i1]). The stemmed result is the
|
|
C string at address p.
|
|
|
|
To close down the stemming process:
|
|
|
|
closedown_stemmer(z);
|
|
|
|
*/
|
|
|
|
/* The English stemming algorithm is essentially the Porter stemming
|
|
* algorithm, and has been coded up by its author. It follows the algorithm
|
|
* presented in
|
|
*
|
|
* Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
* no. 3, pp 130-137,
|
|
*
|
|
* only differing from it at the points marked -DEPARTURE- and -NEW-
|
|
* below.
|
|
*
|
|
* For a more faithful version of the Porter algorithm, see
|
|
*
|
|
* http://www.muscat.com/~martin/stem.html
|
|
*
|
|
*/
|
|
|
|
/* Later additions:
|
|
|
|
June 2000
|
|
|
|
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
|
|
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
|
|
|
|
This follows a suggestion of Barry Wilkins, research student at
|
|
Birmingham.
|
|
|
|
|
|
February 2000
|
|
|
|
the cvc test for not dropping final -e now looks after vc at the
|
|
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
|
|
test c is any consonant, including w, x and y. This extension was
|
|
suggested by Chris Emerson.
|
|
|
|
-fully -> -ful treated like -fulness -> -ful, and
|
|
-tionally -> -tion treated like -tional -> -tion
|
|
|
|
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
|
|
|
|
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
|
|
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
/* A pool is a lookup table of exception entries: 'entries' points at an
   array of 'size' pool_entry records (built and sorted by create_pool()). */
struct pool {
    int size;                      /* number of records in entries[] */
    struct pool_entry * entries;   /* the records themselves */
};
|
|
|
|
/* This is used as a library to resolve exceptions in the various
|
|
stemming algorithms. Typical use is,
|
|
|
|
struct pool * p = create_pool(t);
|
|
char * s_translated = search_pool(p, strlen(s), s);
|
|
...
|
|
free_pool(p);
|
|
|
|
t is an array of strings, e.g.
|
|
|
|
static char * t[] = {
|
|
|
|
"sky", "sky/skies/",
|
|
"die", "dying/",
|
|
"lie", "lying/",
|
|
"tie", "tying/",
|
|
....
|
|
0, 0
|
|
|
|
};
|
|
|
|
if s is "sky", "skies", "dying" etc., s_translated becomes "sky",
|
|
"sky", "die" etc.
|
|
|
|
The code includes a sort/merge capability which may be turned into
|
|
(or replaced by) something more general later on.
|
|
|
|
*/
|
|
|
|
/* merge(n, p, q, r, l, k, f) performs one pass of a run-merging sort.

   The input is two adjacent regions: a left region that starts at p and
   ends where the right region begins (the initial value of q), and a right
   region running from q up to the limit address l. Each region is read as
   a sequence of runs of n bytes (the last run on the right may be shorter),
   each run holding items of k bytes. The i-th run on the left is merged
   with the i-th run on the right and the result is appended at r. Whatever
   is left of the right region afterwards is copied through unchanged.

   f(a, b) is the comparison routine: the item at a is emitted first only
   when f(a, b) is non-zero, otherwise the item at b goes first.
*/

static void merge(int n, char * p, char * q, char * r, char * l, int k,
                  int (*f)(char *, char *))
{
    char * const left_end = q;

    if (left_end > l)
    {
        /* there is no right region: the output is a plain copy of p..l */
        memmove(r, p, l - p);
        return;
    }

    while (p < left_end)
    {
        char * const p_stop = p + n;
        char * q_stop = q + n;

        if (q_stop > l)
            q_stop = l;

        /* interleave items until one of the two runs is used up */
        while (p < p_stop && q < q_stop)
        {
            char * pick;

            if (f(p, q)) { pick = p; p += k; }
            else         { pick = q; q += k; }
            memmove(r, pick, k);
            r += k;
        }

        /* flush the tail of whichever run still has items */
        if (p >= p_stop)
        {
            memmove(r, q, q_stop - q);
            r += q_stop - q;
            q = q_stop;
        }
        else
        {
            memmove(r, p, p_stop - p);
            r += p_stop - p;
            p = p_stop;
        }
    }

    memmove(r, q, l - q);
}
|
|
|
|
/* sort(p, c, l, k, f) sorts a sequence of fixed-size items in place.

   p+c is the byte address of the first item and p+l is the address of the
   byte after the last one, so l - c is divisible by the item size k. f is
   a comparison function for a pair of items: f(a, b) is true if the item
   at a is before the item at b, false if it is equal to or after it.

   The work is done by repeated calls of merge(), alternating between the
   caller's buffer and a temporary buffer of the same size. Passes are
   always done in pairs so that the result ends up in the caller's buffer.

   If the temporary buffer cannot be allocated, a message is written to
   stderr and the process exits with status 1 (the same way create_pool()
   reports a malformed table).
*/

static void sort(char * p, int c, int l, int k,
                 int (*f)(char *, char *))
{
    int w = l - c;   /* total number of bytes to be sorted */
    int j = k;       /* current run length in bytes */
    char * q;        /* temporary work space */

    if (j >= w) return;   /* zero or one item: nothing to do, no allocation */

    q = malloc(w);
    if (q == NULL)
    {
        fprintf(stderr, "sort: out of memory\n");
        exit(1);
    }

    while (j < w)
    {
        int cycle;

        for (cycle = 1; cycle <= 2; cycle++)
        {
            int h = (w + j - 1) / j / 2 * j;   /* half way, in whole runs */

            if (cycle == 1) merge(j, p + c, p + c + h, q, p + l, k, f);
            else            merge(j, q, q + h, p + c, q + w, k, f);
            j *= 2;
        }
    }

    free(q);
}
|
|
|
|
/* One exception record. The 'length' bytes starting at 'pointer' are one
   word form taken from an "a/b/c/" list (not zero-terminated on its own);
   'translation' is the string that form is mapped to. */
struct pool_entry {
    const char * translation;   /* the paradigm string paired with the list */
    const char * pointer;       /* start of the form inside the list */
    int length;                 /* number of bytes in the form */
};
|
|
|
|
static void print_entry(struct pool_entry * p)
|
|
{
|
|
{ int j; for (j=0;j<p->length;j++) fprintf(stderr, "%c", (p->pointer)[j]); }
|
|
fprintf(stderr, " --> %s\n", p->translation);
|
|
}
|
|
|
|
/* - debugging aid
|
|
static void print_pool(struct pool * p)
|
|
{ int i;
|
|
int size = p->size;
|
|
struct pool_entry * q = p->entries;
|
|
fprintf(stderr, "\nPool:\n");
|
|
for (i = 0; i < size; i++) print_entry(q+i);
|
|
}
|
|
*/
|
|
|
|
/* compare(p, q) is our comparison function, used for f above
|
|
*/
|
|
|
|
static int compare(char * char_p, char * char_q)
|
|
{ struct pool_entry * p = (struct pool_entry *) char_p;
|
|
struct pool_entry * q = (struct pool_entry *) char_q;
|
|
if (p->length == q->length) return memcmp(p->pointer, q->pointer, p->length) < 0;
|
|
return p->length < q->length;
|
|
}
|
|
|
|
/* Counts the '/' characters in the form lists of a table s. The lists are
   the strings at the odd indexes 1, 3, 5, ...; scanning stops at the first
   odd index holding a null pointer. Since every form ends with '/', the
   count is the number of forms in the table. */
static int count_slashes(const char * s[])
{
    int total = 0;
    const char ** list;

    for (list = s + 1; *list != 0; list += 2)
    {
        const char * c;

        for (c = *list; *c != 0; c++)
        {
            if (*c == '/') total++;
        }
    }
    return total;
}
|
|
|
|
static struct pool * create_pool(const char * s[])
|
|
{ int size = count_slashes(s);
|
|
struct pool_entry * z = (struct pool_entry *) malloc(size * sizeof(struct pool_entry));
|
|
struct pool_entry * q = z;
|
|
int i;
|
|
for (i = 1; s[i] != 0; i += 2)
|
|
{ const char * p = s[i];
|
|
int j = 0;
|
|
int j0 = 0;
|
|
while(true)
|
|
{ if (p[j] == 0)
|
|
{ if (j0 != j) { fprintf(stderr, "%s lacks final '/'\n", p); exit(1); }
|
|
break;
|
|
}
|
|
if (p[j] == '/')
|
|
{
|
|
q->translation = s[i-1];
|
|
q->pointer = p+j0; q->length = j-j0;
|
|
q++;
|
|
j0 = j+1;
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
sort((char *) z, 0, size * sizeof(struct pool_entry), sizeof(struct pool_entry), compare);
|
|
|
|
/* now validate the contents */
|
|
|
|
for (i = 1; i < size; i++)
|
|
{ struct pool_entry * p = z+i;
|
|
struct pool_entry * q = z+i-1;
|
|
if (p->length == q->length && memcmp(p->pointer, q->pointer, p->length) == 0)
|
|
{ fprintf(stderr, "warning: "); print_entry(p);
|
|
fprintf(stderr, " and "); print_entry(q);
|
|
}
|
|
}
|
|
|
|
{ struct pool * p = (struct pool *) malloc(sizeof(struct pool));
|
|
p->entries = z;
|
|
p->size = size;
|
|
return p;
|
|
}
|
|
}
|
|
|
|
/* Three-way comparison of the string (length, s) with the pool form
   (length_p, s_p), using the same order as the sorted pool: by length
   first, then by memcmp. The result is negative, zero or positive. */
static int compare_to_pool(int length, const char * s, int length_p, const char * s_p)
{
    int by_length = length - length_p;

    return by_length != 0 ? by_length : memcmp(s, s_p, length);
}
|
|
|
|
static const char * search_pool(struct pool * p, int length, char * s)
|
|
{ int i = 0;
|
|
int j = p->size;
|
|
struct pool_entry * q = p->entries;
|
|
if (j == 0) return 0; /* empty pool */
|
|
if (compare_to_pool(length, s, q->length, q->pointer) < 0) return 0;
|
|
while(true)
|
|
{
|
|
int h = (i+j)/2;
|
|
int diff = compare_to_pool(length, s, (q+h)->length, (q+h)->pointer);
|
|
if (diff == 0) return (q+h)->translation;
|
|
if (j-i <= 1) return 0;
|
|
if (diff < 0) j = h; else i = h;
|
|
}
|
|
}
|
|
|
|
static void free_pool(struct pool * p)
|
|
{ free(p->entries);
|
|
free(p);
|
|
}
|
|
|
|
/* Working state of one English stemmer instance. */
struct english_stemmer
{
    char * p;                   /* buffer holding the word being stemmed */
    int p_size;                 /* allocated size of p in bytes */
    int k;                      /* index of the last letter of the word in p */
    int j;                      /* index of the last letter of the stem, as
                                   set by ends() when a suffix matches */
    struct pool * irregulars;   /* lookup table of irregular forms */
};
|
|
|
|
/* The main part of the stemming algorithm starts here. z->p is a buffer
|
|
holding a word to be stemmed. The letters are in z->p[0], z->p[1] ...
|
|
ending at z->p[z->k]. z->k is readjusted downwards as the stemming
|
|
progresses. Zero termination is not in fact used in the algorithm.
|
|
|
|
Note that only lower case sequences are stemmed. Forcing to lower case
|
|
should be done before english_stem(...) is called.
|
|
|
|
We will write p, k etc in place of z->p, z->k in the comments.
|
|
*/
|
|
|
|
/* cons(z, i) is true <=> p[i] is a consonant.
|
|
*/
|
|
|
|
static int cons(struct english_stemmer * z, int i)
|
|
{ switch (z->p[i])
|
|
{ case 'a': case 'e': case 'i': case 'o': case 'u':
|
|
return false;
|
|
case 'y':
|
|
return (i==0) ? true : !cons(z, i - 1);
|
|
default: return true;
|
|
}
|
|
}
|
|
|
|
/* m(z) measures the number of consonant sequences between 0 and j. if c is
|
|
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
|
|
presence,
|
|
|
|
<c><v> gives 0
|
|
<c>vc<v> gives 1
|
|
<c>vcvc<v> gives 2
|
|
<c>vcvcvc<v> gives 3
|
|
....
|
|
*/
|
|
|
|
static int m(struct english_stemmer * z)
|
|
{ int n = 0;
|
|
int i = 0;
|
|
while(true)
|
|
{ if (i > z->j) return n;
|
|
if (! cons(z, i)) break; i++;
|
|
}
|
|
i++;
|
|
while(true)
|
|
{ while(true)
|
|
{ if (i > z->j) return n;
|
|
if (cons(z, i)) break;
|
|
i++;
|
|
}
|
|
i++;
|
|
n++;
|
|
while(true)
|
|
{ if (i > z->j) return n;
|
|
if (! cons(z, i)) break;
|
|
i++;
|
|
}
|
|
i++;
|
|
}
|
|
}
|
|
|
|
/* vowelinstem(z) is true p[0], ... p[j] contains a vowel
|
|
*/
|
|
|
|
static int vowelinstem(struct english_stemmer * z)
|
|
{ int i;
|
|
for (i = 0; i <= z->j; i++) if (! cons(z, i)) return true;
|
|
return false;
|
|
}
|
|
|
|
/* doublec(z, i) is true <=> p[i], p[i - 1] contain a double consonant.
|
|
*/
|
|
|
|
static int doublec(struct english_stemmer * z, int i)
|
|
{ if (i < 1) return false;
|
|
if (z->p[i] != z->p[i - 1]) return false;
|
|
return cons(z, i);
|
|
}
|
|
|
|
/* cvc(z, i) is true <=>
|
|
|
|
a) ( -NEW- ) i == 1, and p[0] p[1] is vowel consonant, or
|
|
|
|
b) p[i - 2], p[i - 1], p[i] has the form consonant -
|
|
vowel - consonant and also if the second c is not w, x or y. this is used
|
|
when trying to restore an e at the end of a short word. e.g.
|
|
|
|
cav(e), lov(e), hop(e), crim(e), but
|
|
snow, box, tray.
|
|
|
|
*/
|
|
|
|
static int cvc(struct english_stemmer * z, int i)
|
|
{
|
|
if (i == 0) return false; /* i == 0 never happens perhaps */
|
|
|
|
if (i == 1) return !cons(z, 0) && cons(z, 1);
|
|
|
|
if (!cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return false;
|
|
{ int ch = z->p[i];
|
|
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* ends(z, s, length) is true <=> p[0], ... p[k] ends with the string s.
|
|
*/
|
|
|
|
static int ends(struct english_stemmer * z, const char * s, int length)
|
|
{
|
|
if (length > z->k + 1) return false;
|
|
if (memcmp(z->p + z->k - length + 1, s, length) != 0) return false;
|
|
z->j = z->k - length;
|
|
return true;
|
|
}
|
|
|
|
/* setto(z, s, length) sets p[j + 1] ... to the characters in the string s,
|
|
readjusting k.
|
|
*/
|
|
|
|
static void setto(struct english_stemmer * z, const char * s, int length)
|
|
{
|
|
memmove(z->p + z->j + 1, s, length);
|
|
z->k = z->j + length;
|
|
}
|
|
|
|
/* r(z, s, length) is used further down: it replaces the current suffix by
   s via setto(), but only when the stem before it has m(z) > 0. */

static void r(struct english_stemmer * z, const char * s, int length)
{
    if (m(z) <= 0) return;

    setto(z, s, length);
}
|
|
|
|
/* step_1ab(z) gets rid of plurals and -ed or -ing. e.g.
|
|
|
|
caresses -> caress
|
|
ponies -> poni
|
|
sties -> sti
|
|
tie -> tie (-NEW-: see below)
|
|
caress -> caress
|
|
cats -> cat
|
|
|
|
feed -> feed
|
|
agreed -> agree
|
|
disabled -> disable
|
|
|
|
matting -> mat
|
|
mating -> mate
|
|
meeting -> meet
|
|
milling -> mill
|
|
messing -> mess
|
|
|
|
meetings -> meet
|
|
|
|
*/
|
|
|
|
static void step_1ab(struct english_stemmer * z)
|
|
{ if (z->p[z->k] == 's')
|
|
{ if (ends(z, "sses", 4)) z->k -= 2; else
|
|
if (ends(z, "ies", 3))
|
|
if (z->j == 0) z->k--; else z->k -= 2;
|
|
|
|
/* this line extends the original algorithm, so that 'flies'->'fli' but
|
|
'dies'->'die' etc */
|
|
|
|
else
|
|
if (z->p[z->k - 1] != 's') z->k--;
|
|
}
|
|
|
|
if (ends(z, "ied", 3)) { if (z->j == 0) z->k--; else z->k -= 2; } else
|
|
|
|
/* this line extends the original algorithm, so that 'spied'->'spi' but
|
|
'died'->'die' etc */
|
|
|
|
if (ends(z, "eed", 3)) { if (m(z) > 0) z->k--; } else
|
|
if ((ends(z, "ed", 2) || ends(z, "ing", 3)) && vowelinstem(z))
|
|
{ z->k = z->j;
|
|
if (ends(z, "at", 2)) setto(z, "ate", 3); else
|
|
if (ends(z, "bl", 2)) setto(z, "ble", 3); else
|
|
if (ends(z, "iz", 2)) setto(z, "ize", 3); else
|
|
if (doublec(z, z->k))
|
|
{ z->k--;
|
|
{ int ch = z->p[z->k];
|
|
if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
|
|
}
|
|
}
|
|
else if (m(z) == 1 && cvc(z, z->k)) setto(z, "e", 1);
|
|
}
|
|
}
|
|
|
|
/* step_1c(z) turns terminal y to i when there is another vowel in the stem.
|
|
|
|
-NEW-: This has been modified from the original Porter algorithm so that y->i
|
|
is only done when y is preceded by a consonant, but not if the stem
|
|
is only a single consonant, i.e.
|
|
|
|
(*c and not c) Y -> I
|
|
|
|
So 'happy' -> 'happi', but
|
|
'enjoy' -> 'enjoy' etc
|
|
|
|
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
|
|
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
|
|
no longer really matters.
|
|
|
|
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
|
|
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
|
|
'flies' ...
|
|
|
|
*/
|
|
|
|
static void step_1c(struct english_stemmer * z)
|
|
{
|
|
if (ends(z, "y", 1) && z->j > 0 && cons(z, z->k - 1)) z->p[z->k] = 'i';
|
|
}
|
|
|
|
|
|
/* step_2(z) maps double suffices to single ones. so -ization ( = -ize plus
|
|
-ation) maps to -ize etc. Note that the string before the suffix must give
|
|
m(z) > 0.
|
|
*/
|
|
|
|
static void step_2(struct english_stemmer * z)
|
|
{ switch (z->p[z->k - 1])
|
|
{
|
|
case 'a':
|
|
if (ends(z, "ational", 7)) { r(z, "ate", 3); break; }
|
|
if (ends(z, "tional", 6)) { r(z, "tion", 4); break; }
|
|
break;
|
|
case 'c':
|
|
if (ends(z, "enci", 4)) { r(z, "ence", 4); break; }
|
|
if (ends(z, "anci", 4)) { r(z, "ance", 4); break; }
|
|
break;
|
|
case 'e':
|
|
if (ends(z, "izer", 4)) { r(z, "ize", 3); break; }
|
|
break;
|
|
case 'l':
|
|
if (ends(z, "bli", 3)) { r(z, "ble", 3); break; } /*-DEPARTURE-*/
|
|
|
|
/* To match the published algorithm, replace this line with
|
|
case 'l':
|
|
if (ends(z, "abli", 4)) { r(z, "able", 4); break; }
|
|
*/
|
|
if (ends(z, "alli", 4))
|
|
{
|
|
if (m(z) > 0) { setto(z, "al", 2); step_2(z); } /*-NEW-*/
|
|
break;
|
|
}
|
|
|
|
if (ends(z, "fulli", 5)) { r(z, "ful", 3); break; } /*-NEW-*/
|
|
if (ends(z, "entli", 5)) { r(z, "ent", 3); break; }
|
|
if (ends(z, "eli", 3)) { r(z, "e", 1); break; }
|
|
if (ends(z, "ousli", 5)) { r(z, "ous", 3); break; }
|
|
break;
|
|
case 'o':
|
|
if (ends(z, "ization", 7)) { r(z, "ize", 3); break; }
|
|
if (ends(z, "ation", 5)) { r(z, "ate", 3); break; }
|
|
if (ends(z, "ator", 4)) { r(z, "ate", 3); break; }
|
|
break;
|
|
case 's':
|
|
if (ends(z, "alism", 5)) { r(z, "al", 2); break; }
|
|
if (ends(z, "iveness", 7)) { r(z, "ive", 3); break; }
|
|
if (ends(z, "fulness", 7)) { r(z, "ful", 3); break; }
|
|
if (ends(z, "ousness", 7)) { r(z, "ous", 3); break; }
|
|
break;
|
|
case 't':
|
|
if (ends(z, "aliti", 5)) { r(z, "al", 2); break; }
|
|
if (ends(z, "iviti", 5)) { r(z, "ive", 3); break; }
|
|
if (ends(z, "biliti", 6)) { r(z, "ble", 3); break; }
|
|
break;
|
|
case 'g':
|
|
if (ends(z, "logi", 4))
|
|
{ z->j++; /*-NEW-*/ /*(Barry Wilkins)*/
|
|
r(z, "og", 2); break;
|
|
} /*-DEPARTURE-*/
|
|
|
|
/* To match the published algorithm, delete this line */
|
|
|
|
}
|
|
}
|
|
|
|
/* step_3(z) deals with -ic-, -full, -ness etc. Similar strategy to step_2.
|
|
*/
|
|
|
|
static void step_3(struct english_stemmer * z)
|
|
{ switch (z->p[z->k])
|
|
{
|
|
case 'e':
|
|
if (ends(z, "icate", 5)) { r(z, "ic", 2); break; }
|
|
if (ends(z, "ative", 5)) { r(z, "", 0); break; }
|
|
if (ends(z, "alize", 5)) { r(z, "al", 2); break; }
|
|
break;
|
|
case 'i':
|
|
if (ends(z, "iciti", 5)) { r(z, "ic", 2); break; }
|
|
break;
|
|
case 'l':
|
|
if (ends(z, "ical", 4)) { r(z, "ic", 2); break; }
|
|
if (ends(z, "ful", 3)) { r(z, "", 0); break; }
|
|
break;
|
|
case 's':
|
|
if (ends(z, "ness", 4)) { r(z, "", 0); break; }
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* step_4() takes off -ant, -ence etc., in context <c>vcvc<v>.
|
|
*/
|
|
|
|
static void step_4(struct english_stemmer * z)
|
|
{ switch (z->p[z->k - 1])
|
|
{ case 'a':
|
|
if (ends(z, "al", 2)) break; return;
|
|
case 'c':
|
|
if (ends(z, "ance", 4)) break;
|
|
if (ends(z, "ence", 4)) break; return;
|
|
case 'e':
|
|
if (ends(z, "er", 2)) break; return;
|
|
case 'i':
|
|
if (ends(z, "ic", 2)) break; return;
|
|
case 'l':
|
|
if (ends(z, "able", 4)) break;
|
|
if (ends(z, "ible", 4)) break; return;
|
|
case 'n':
|
|
if (ends(z, "ant", 3)) break;
|
|
if (ends(z, "ement", 5)) break;
|
|
if (ends(z, "ment", 4)) break;
|
|
if (ends(z, "ent", 3)) break; return;
|
|
case 'o':
|
|
if (ends(z, "ion", 3) && (z->p[z->j] == 's' ||
|
|
z->p[z->j] == 't')) break;
|
|
if (ends(z, "ou", 2)) break; return;
|
|
/* takes care of -ous */
|
|
case 's':
|
|
if (ends(z, "ism", 3)) break; return;
|
|
case 't':
|
|
if (ends(z, "ate", 3)) break;
|
|
if (ends(z, "iti", 3)) break; return;
|
|
case 'u':
|
|
if (ends(z, "ous", 3)) break; return;
|
|
case 'v':
|
|
if (ends(z, "ive", 3)) break; return;
|
|
case 'z':
|
|
if (ends(z, "ize", 3)) break; return;
|
|
default:
|
|
return;
|
|
}
|
|
if (m(z) > 1) z->k = z->j;
|
|
}
|
|
|
|
/* step_5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
|
|
m(z) > 1.
|
|
*/
|
|
|
|
static void step_5(struct english_stemmer * z)
|
|
{ z->j = z->k;
|
|
if (z->p[z->k] == 'e')
|
|
{ int a = m(z);
|
|
if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--;
|
|
}
|
|
if (z->p[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
|
|
}
|
|
|
|
static const char * english_stem(void * z_, const char * q, int i0, int i1)
|
|
{
|
|
struct english_stemmer * z = (struct english_stemmer *) z_;
|
|
int p_size = z->p_size;
|
|
|
|
if (i1 - i0 + 50 > p_size)
|
|
{ free(z->p);
|
|
p_size = i1 - i0 + 75; /* ample */ z->p_size = p_size;
|
|
z->p = (char *) malloc(p_size);
|
|
}
|
|
|
|
memmove(z->p, q + i0, i1 - i0 + 1);
|
|
|
|
z->k = i1 - i0;
|
|
|
|
|
|
{ const char * t = search_pool(z->irregulars, z->k + 1, z->p);
|
|
if (t != 0) {
|
|
z->k = strlen(t) - 1;
|
|
return t;
|
|
}
|
|
}
|
|
|
|
if (z->k > 1) /*-DEPARTURE-*/
|
|
|
|
/* With this line, strings of length 1 or 2 don't go through the
|
|
stemming process, although no mention is made of this in the
|
|
published algorithm. Remove the line to match the published
|
|
algorithm. */
|
|
|
|
{ step_1ab(z); step_1c(z);
|
|
step_2(z);
|
|
step_3(z);
|
|
step_4(z);
|
|
step_5(z);
|
|
}
|
|
|
|
z->p[z->k + 1] = 0; /* C string form for now */
|
|
return z->p;
|
|
}
|
|
|
|
/* -NEW-
   This is a table of irregular forms. It is quite short, but still
   reflects the errors actually drawn to Martin Porter's attention over
   a 20 year period!

   Extend it as necessary.

   The form of the table is:

      "p1", "s11/s12/s13/ ... /",
      "p2", "s21/s22/s23/ ... /",
      ...
      "pn", "sn1/sn2/sn3/ ... /",
      0, 0

   String sij is mapped to paradigm form pi, and the main stemming
   process is then bypassed. Every list must end with '/'.
*/

static const char * irregular_forms[] = {

    "sky",     "sky/skies/",
    "die",     "dying/",
    "lie",     "lying/",
    "tie",     "tying/",
    "news",    "news/",
    "inning",  "innings/inning/",
    "outing",  "outings/outing/",
    "canning", "cannings/canning/",
    "howe",    "howe/",

    /*-NEW-*/
    "proceed", "proceed/",
    "exceed",  "exceed/",
    "succeed", "succeed/",   /* Hiranmay Ghosh */

    0, 0   /* terminator */

};
|
|
|
|
|
|
/*
 * is_stopword part
 *
 * One node of the stopword search tree. Nodes live in one array and refer
 * to each other by forward offsets counted in array elements.
 */
typedef struct {
    unsigned char val;     /* the character this node stands for */
    unsigned char flag;    /* bit set, see the flag macros below */
    unsigned char right;   /* offset to the node to try when the wanted
                              character is greater than val; 0 = none */
    unsigned char child;   /* offset to the node for the next character
                              after a match; 0 = none */
} ESWNODE;
|
|
|
|
/* flag bit: a left subtree exists (it starts at the next array element) */
#define L 0x01
/* flag bit: a stopword finishes at this node */
#define F 0x02
/* The argument is parenthesized so that any pointer expression can be
   passed safely, not just a plain identifier. */
#define ISLEFT(x)   (((ESWNODE*)(x))->flag & L)
#define ISFINISH(x) (((ESWNODE*)(x))->flag & F)
|
|
|
|
|
|
static ESWNODE engstoptree[] = {
|
|
{'m',L,9,126},
|
|
{'d',L,4,71},
|
|
{'b',L,2,40},
|
|
{'a',F,0,14},
|
|
{'c',0,0,62},
|
|
{'f',L,2,79},
|
|
{'e',0,0,75},
|
|
{'h',0,1,90},
|
|
{'i',F,0,108},
|
|
{'t',L,4,177},
|
|
{'o',L,2,135},
|
|
{'n',0,0,131},
|
|
{'s',0,0,156},
|
|
{'v',L,2,210},
|
|
{'u',0,0,201},
|
|
{'w',0,1,211},
|
|
{'y',0,0,237},
|
|
|
|
{'m',L|F,5,0},
|
|
{'f',L,2,12},
|
|
{'b',0,0,7},
|
|
{'g',0,1,13},
|
|
{'l',0,0,17},
|
|
{'r',L,2,19},
|
|
{'n',F,0,16},
|
|
{'s',F,1,0},
|
|
{'t',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'u',0,1,2},
|
|
{'v',F,0,0},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'t',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'a',0,0,1},
|
|
|
|
{'i',0,0,1},
|
|
|
|
{'n',F,0,1},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'l',F,0,0},
|
|
|
|
{'d',F,1,0},
|
|
{'i',F,0,0},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'o',L,2,21},
|
|
{'e',F,0,3},
|
|
{'u',0,1,21},
|
|
{'y',F,0,0},
|
|
|
|
{'f',L,3,9},
|
|
{'c',0,1,4},
|
|
{'e',0,0,6},
|
|
{'l',0,1,8},
|
|
{'t',0,0,9},
|
|
|
|
{'a',0,0,1},
|
|
|
|
{'u',0,0,1},
|
|
|
|
{'s',F,0,0},
|
|
|
|
{'n',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'w',F,0,0},
|
|
|
|
{'w',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'n',F,0,0},
|
|
|
|
{'t',0,0,1},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'a',0,1,2},
|
|
{'o',0,0,2},
|
|
|
|
{'n',F,0,0},
|
|
|
|
{'u',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'d',F,0,0},
|
|
|
|
{'o',L|F,2,4},
|
|
{'i',0,0,2},
|
|
{'u',0,0,5},
|
|
|
|
{'d',F,0,0},
|
|
|
|
{'e',F,1,0},
|
|
{'w',0,0,1},
|
|
|
|
{'n',F,0,0},
|
|
|
|
{'r',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'a',0,0,1},
|
|
|
|
{'c',0,0,1},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'o',L,2,5},
|
|
{'e',0,0,3},
|
|
{'r',0,1,4},
|
|
{'u',0,0,5},
|
|
|
|
{'w',F,0,0},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'m',F,0,0},
|
|
|
|
{'r',0,0,1},
|
|
|
|
{'t',0,0,1},
|
|
|
|
{'h',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'e',L|F,2,7},
|
|
{'a',F,0,3},
|
|
{'i',F,1,11},
|
|
{'o',0,0,15},
|
|
|
|
{'d',F,1,0},
|
|
{'v',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'r',F,0,1},
|
|
|
|
{'e',F,1,0},
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'f',F,0,0},
|
|
|
|
{'m',F,0,1},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'f',F,0,0},
|
|
|
|
{'w',F,0,0},
|
|
|
|
{'n',L|F,2,4},
|
|
{'f',F,0,0},
|
|
{'s',F,1,0},
|
|
{'t',F,0,3},
|
|
|
|
{'t',0,0,1},
|
|
|
|
{'o',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'f',F,0,0},
|
|
|
|
{'o',L,3,6},
|
|
{'a',0,1,4},
|
|
{'e',F,0,0},
|
|
{'u',0,1,7},
|
|
{'y',F,0,8},
|
|
|
|
{'y',F,0,0},
|
|
|
|
{'r',0,1,2},
|
|
{'s',0,0,2},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'f',F,0,0},
|
|
|
|
{'o',F,0,1},
|
|
|
|
{'r',F,1,0},
|
|
{'t',F,0,0},
|
|
|
|
{'t',L,4,11},
|
|
{'n',L|F,2,7},
|
|
{'f',F,0,5},
|
|
{'r',F,0,0},
|
|
{'v',L,2,16},
|
|
{'u',0,0,9},
|
|
{'w',0,0,16},
|
|
|
|
{'f',F,0,0},
|
|
|
|
{'c',F,1,0},
|
|
{'l',0,0,1},
|
|
|
|
{'i',F,0,0},
|
|
|
|
{'h',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'r',F,1,2},
|
|
{'t',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'v',F,0,0},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'n',F,0,0},
|
|
|
|
{'h',L,2,6},
|
|
{'a',0,0,3},
|
|
{'o',F,1,12},
|
|
{'u',0,0,13},
|
|
|
|
{'m',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'e',L|F,2,0},
|
|
{'a',0,0,2},
|
|
{'o',0,0,3},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'l',F,0,0},
|
|
|
|
{'u',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'d',F,0,0},
|
|
|
|
{'m',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'c',0,0,1},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'h',0,1,2},
|
|
{'o',F,0,27},
|
|
|
|
{'i',L|F,3,0},
|
|
{'a',0,1,4},
|
|
{'e',F,0,5},
|
|
{'o',0,1,17},
|
|
{'r',0,0,18},
|
|
|
|
{'n',F,1,0},
|
|
{'t',F,0,0},
|
|
|
|
{'n',L|F,3,0},
|
|
{'i',0,1,5},
|
|
{'m',F,0,5},
|
|
{'s',L,2,9},
|
|
{'r',0,0,7},
|
|
{'y',F,0,0},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'v',F,0,0},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'u',0,0,1},
|
|
|
|
{'g',0,0,1},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'o',F,0,0},
|
|
|
|
{'n',0,1,2},
|
|
{'p',F,0,0},
|
|
|
|
{'d',0,1,2},
|
|
{'t',0,0,3},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',F,0,0},
|
|
|
|
{'i',0,0,1},
|
|
|
|
{'l',F,0,0},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'r',0,0,1},
|
|
|
|
{'i',F,0,0},
|
|
|
|
{'h',L,3,7},
|
|
{'a',F,1,0},
|
|
{'e',F,0,3},
|
|
{'i',0,1,17},
|
|
{'o',0,0,20},
|
|
|
|
{'r',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'e',L,2,5},
|
|
{'a',0,0,3},
|
|
{'i',F,1,6},
|
|
{'o',F,0,9},
|
|
|
|
{'t',F,0,0},
|
|
|
|
{'n',F,1,0},
|
|
{'r',0,0,1},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'c',0,1,2},
|
|
{'l',0,0,2},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'e',F,0,0},
|
|
|
|
{'m',F,0,0},
|
|
|
|
{'l',0,1,2},
|
|
{'t',0,0,2},
|
|
|
|
{'l',F,0,0},
|
|
|
|
{'h',F,0,0},
|
|
|
|
{'u',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'d',F,0,0},
|
|
|
|
{'o',0,0,1},
|
|
|
|
{'u',F,0,1},
|
|
|
|
{'r',F,0,1},
|
|
|
|
{'s',0,0,1},
|
|
|
|
{'e',0,0,1},
|
|
|
|
{'l',0,0,1},
|
|
|
|
{'f',F,1,0},
|
|
{'v',F,0,0}
|
|
};
|
|
|
|
static unsigned int
|
|
find_english_stopword( unsigned char *buf, int len ) {
|
|
ESWNODE *ptr = engstoptree;
|
|
int result = 0;
|
|
unsigned char *cur = buf;
|
|
|
|
while( cur - buf < len ) {
|
|
if ( ptr->val == *cur ) {
|
|
cur++;
|
|
if ( ISFINISH(ptr) ) result = cur - buf;
|
|
if ( ! ptr->child ) break;
|
|
ptr += ptr->child;
|
|
} else if ( ptr->val > *cur ) {
|
|
if ( ISLEFT(ptr) )
|
|
ptr++;
|
|
else
|
|
break;
|
|
} else {
|
|
if ( ptr->right )
|
|
ptr += ptr->right;
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
#undef L
|
|
#undef F
|
|
#undef ISLEFT
|
|
#undef ISFINISH
|
|
|
|
/* Returns 1 if the len bytes at word are exactly an English stopword, that
   is, if the longest stopword prefix found covers the whole word; returns
   0 otherwise. obj is not used. */
static int
is_stopengword(void* obj,char* word,int len) {
    unsigned int matched = find_english_stopword((unsigned char*)word, len);

    if ( (unsigned int) len == matched )
        return 1;
    return 0;
}
|
|
|
|
static void * setup_english_stemmer()
|
|
{
|
|
struct english_stemmer * z = (struct english_stemmer *) malloc(sizeof(struct english_stemmer));
|
|
z->p = 0; z->p_size = 0;
|
|
z->irregulars = create_pool(irregular_forms);
|
|
return (void *) z;
|
|
}
|
|
|
|
static void closedown_english_stemmer(void * z_)
|
|
{
|
|
struct english_stemmer * z = (struct english_stemmer *) z_;
|
|
free_pool(z->irregulars);
|
|
free(z->p);
|
|
free(z);
|
|
}
|
|
|
|
static char*
|
|
engstemming(void* obj, char *word, int *len) {
|
|
struct english_stemmer * z = (struct english_stemmer *) obj;
|
|
const char* stemmed_word;
|
|
char *result = word;
|
|
|
|
while(result-word < *len) {
|
|
*result = tolower((unsigned char) *result);
|
|
result++;
|
|
}
|
|
stemmed_word = english_stem(obj, word, 0, *len-1);
|
|
*len = z->k + 1;
|
|
|
|
result = (char*)palloc( *len );
|
|
memcpy((void*)result, (void*)stemmed_word, *len);
|
|
return result;
|
|
}
|
|
#endif /* DICT_BODY */
|
|
|
|
#ifdef DICT_TABLE
/* Table row describing this dictionary. TABLE_DICT_START / TABLE_DICT_END
   are not defined in this file; the slot descriptions below are inferred
   from the values placed in them -- TODO confirm against their definition. */
TABLE_DICT_START
    "C",                         /* presumably the locale this row is for */
    setup_english_stemmer,       /* creates the stemmer object */
    closedown_english_stemmer,   /* destroys it */
    engstemming,                 /* lower-cases and stems one word */
    NULL,                        /* slot not used here; purpose not visible */
    is_stopengword               /* stopword test */
TABLE_DICT_END
#endif
|
|
|