2001-10-13 07:19:09 +08:00
|
|
|
#!/usr/bin/perl
|
|
|
|
use strict;
|
|
|
|
use Getopt::Std;
|
|
|
|
use locale;
|
|
|
|
|
|
|
|
my %opt;
|
2002-12-13 13:50:18 +08:00
|
|
|
getopts('l:he:s:ap:o:m:f', \%opt);
|
2001-10-13 07:19:09 +08:00
|
|
|
|
|
|
|
if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) {
|
|
|
|
print<<EOT;
|
|
|
|
Generator of variant of the Lovin's stemmer which
|
|
|
|
uses a longest match algorithm.
|
|
|
|
|
|
|
|
Author Teodor Sigaev <teodor\@stack.net>
|
|
|
|
Usage:
|
|
|
|
$0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ]
|
|
|
|
-e FILENAME - file with endings of word
|
|
|
|
-s FILENAME - file with list of stop-word
|
|
|
|
-o FILENAME - out file, default STDOUT
|
|
|
|
-a - stop-word are strimmed
|
|
|
|
-p PREFIX - prefix of function and etc, default strimmed locale
|
|
|
|
-m NUMBER - minimal length of rest after semming, default 3
|
|
|
|
-l LOCALENAME - name of locale
|
|
|
|
-f - do not call tolower for each char
|
|
|
|
At least one of -e or -s must be defined
|
|
|
|
EOT
|
|
|
|
exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( ! defined $opt{p} ) {
|
|
|
|
$opt{p} = $opt{l};
|
|
|
|
$opt{p}=~s/[^a-zA-Z0-9_]+//g;
|
|
|
|
}
|
|
|
|
$opt{m}=3 if ! defined $opt{m};
|
|
|
|
|
|
|
|
my ($enddata,$stopdata) = ('','');
|
|
|
|
my $maxchild = 0;
|
|
|
|
|
|
|
|
if ( $opt{e} ) {
|
|
|
|
my @tree;
|
|
|
|
buildtree(\@tree, $opt{e}, 1);
|
|
|
|
printstruct( \@tree, 0, \$enddata);
|
|
|
|
undef @tree;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( $opt{s} ) {
|
|
|
|
my @tree;
|
|
|
|
buildtree(\@tree, $opt{s}, 0);
|
|
|
|
printstruct( \@tree, 0, \$stopdata);
|
|
|
|
undef @tree;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
die "No data\n" if ( ! (length $enddata || length $stopdata) );
|
|
|
|
|
|
|
|
$enddata = "\t{0,0,0,0}" if ( ! length $enddata );
|
|
|
|
$stopdata = "\t{0,0,0,0}" if ( ! length $stopdata );
|
|
|
|
|
|
|
|
my $fh=\*STDOUT;
|
|
|
|
if ( $opt{o} ) {
|
|
|
|
open(OUT,">$opt{o}") || die "Can;t open file '$opt{o}' for writing\n";
|
|
|
|
$fh = \*OUT;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $linktype = 'uint32';
|
|
|
|
if ( $maxchild <= 0xff ) {
|
|
|
|
$linktype='uint8';
|
|
|
|
} elsif ( $maxchild <= 0xffff ) {
|
|
|
|
$linktype='uint16';
|
|
|
|
}
|
|
|
|
|
|
|
|
my $wherecheck = ( $opt{a} ) ?
|
|
|
|
"NULL,\n\t$opt{p}_is_stopword"
|
|
|
|
:
|
|
|
|
"$opt{p}_is_stopword,\n\tNULL";
|
|
|
|
|
|
|
|
my ($tolower, $resttolower) = ('','');
|
|
|
|
if ( ! $opt{f} ) {
|
|
|
|
$tolower = '*cur = tolower( *cur );';
|
|
|
|
$resttolower=<<EOT;
|
|
|
|
while( cur - buf >= 0 ) {
|
|
|
|
*cur = tolower(*cur);
|
|
|
|
cur--;
|
|
|
|
}
|
|
|
|
EOT
|
|
|
|
}
|
|
|
|
|
|
|
|
print {$fh} <<EOT;
|
|
|
|
/*
|
|
|
|
* Autogenerated file
|
|
|
|
*
|
|
|
|
* Variant of the Lovin's stemmer which uses a longest match algorithm.
|
|
|
|
* Endings are stored in a suffix tree.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef DICT_BODY
|
2002-12-13 13:50:18 +08:00
|
|
|
#include <ctype.h>
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
typedef struct {
|
|
|
|
uint8 val;
|
|
|
|
uint8 flag;
|
|
|
|
uint8 right;
|
|
|
|
$linktype child;
|
|
|
|
} $opt{p}_NODE;
|
|
|
|
|
|
|
|
/* is exists left tree ? */
|
|
|
|
#define L 0x01
|
|
|
|
/* finish word flag */
|
|
|
|
#define F 0x02
|
|
|
|
#define ISLEFT(x) ((($opt{p}_NODE*)x)->flag & L)
|
|
|
|
#define ISFINISH(x) ((($opt{p}_NODE*)x)->flag & F)
|
|
|
|
|
|
|
|
#define MINLENREST $opt{m}
|
|
|
|
|
|
|
|
static $opt{p}_NODE $opt{p}_endstree[]={
|
|
|
|
$enddata
|
|
|
|
};
|
|
|
|
|
|
|
|
static $opt{p}_NODE $opt{p}_stoptree[]={
|
|
|
|
$stopdata
|
|
|
|
};
|
|
|
|
|
|
|
|
static char*
|
|
|
|
$opt{p}_stem( void* obj, char *in, int *len ) {
|
|
|
|
$opt{p}_NODE *ptr = $opt{p}_endstree;
|
|
|
|
int result = 0;
|
|
|
|
uint8 *buf = (uint8 *)in;
|
|
|
|
uint8 *cur = buf + (*len) - 1;
|
|
|
|
|
|
|
|
while( cur - buf >= MINLENREST ) {
|
|
|
|
$tolower
|
|
|
|
if ( ptr->val == *cur ) {
|
|
|
|
if ( ISFINISH(ptr) ) result = buf + (*len) - cur;
|
|
|
|
cur--;
|
|
|
|
if ( ! ptr->child ) break;
|
|
|
|
ptr += ptr->child;
|
|
|
|
} else if ( ptr->val > *cur ) {
|
|
|
|
if ( ISLEFT(ptr) )
|
|
|
|
ptr++;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
if ( ptr->right )
|
|
|
|
ptr += ptr->right;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
$resttolower
|
|
|
|
*len -= result;
|
|
|
|
return in;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
$opt{p}_is_stopword( void *obj, char *in, int len ) {
|
|
|
|
$opt{p}_NODE *ptr = $opt{p}_stoptree;
|
|
|
|
int result = 0;
|
|
|
|
uint8 *buf = (uint8 *)in;
|
|
|
|
uint8 *cur = buf;
|
|
|
|
|
|
|
|
while( cur - buf < len ) {
|
|
|
|
$tolower
|
|
|
|
if ( ptr->val == *cur ) {
|
|
|
|
cur++;
|
|
|
|
if ( ISFINISH(ptr) ) result = cur - buf;
|
|
|
|
if ( ! ptr->child ) break;
|
|
|
|
ptr += ptr->child;
|
|
|
|
} else if ( ptr->val > *cur ) {
|
|
|
|
if ( ISLEFT(ptr) )
|
|
|
|
ptr++;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
if ( ptr->right )
|
|
|
|
ptr += ptr->right;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (result==len) ? 1 : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef L
|
|
|
|
#undef F
|
|
|
|
#undef ISLEFT
|
|
|
|
#undef ISFINISH
|
|
|
|
#undef MINLENREST
|
|
|
|
#endif /* DICT_BODY */
|
|
|
|
|
|
|
|
#ifdef DICT_TABLE
|
|
|
|
TABLE_DICT_START
|
|
|
|
\"$opt{l}\",
|
|
|
|
NULL,
|
|
|
|
NULL,
|
|
|
|
$opt{p}_stem,
|
|
|
|
$wherecheck
|
|
|
|
TABLE_DICT_END
|
|
|
|
#endif
|
|
|
|
|
|
|
|
EOT
|
|
|
|
|
|
|
|
close($fh) if ( $fh != \*STDOUT );
|
|
|
|
|
|
|
|
|
|
|
|
sub buildtree {
|
|
|
|
my ($reftree,$file, $needreverse) = @_;
|
|
|
|
open(DATA,$file) || die "Can't open file '$file'\n";
|
|
|
|
while(<DATA>) {
|
|
|
|
chomp;
|
|
|
|
next if ! length $_;
|
|
|
|
$_ = lc($_) if ! $opt{f};
|
|
|
|
addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ );
|
|
|
|
}
|
|
|
|
close DATA;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub mkbintree {
|
|
|
|
my ( $start, $stop, $rprop, $rres) = @_;
|
|
|
|
|
|
|
|
my $middle = $start + int( ($stop-$start)/2 );
|
|
|
|
|
|
|
|
push( @$rres, $rprop->[$middle] );
|
|
|
|
my $idx = $#$rres;
|
|
|
|
$rres->[$idx]{right}=0;
|
|
|
|
$rres->[$idx]{left}=0;
|
|
|
|
return 1 if ( $start == $stop );
|
|
|
|
|
|
|
|
my $leftsize = 0;
|
|
|
|
if ( $middle!=$start ) {
|
|
|
|
$rres->[$idx]{left}=1;
|
|
|
|
$leftsize = mkbintree( $start, $middle-1, $rprop, $rres );
|
|
|
|
$rres->[$idx]{right}=$leftsize+1;
|
|
|
|
} else {
|
|
|
|
$rres->[$idx]{right} = 1;
|
|
|
|
}
|
|
|
|
return 1 + $leftsize + mkbintree( $middle+1, $stop, $rprop, $rres );
|
|
|
|
}
|
|
|
|
|
|
|
|
sub addtostruct {
|
|
|
|
my $node = shift;
|
|
|
|
my ($char, $subval) = split('', shift, 2);
|
|
|
|
$char = ord( $char );
|
|
|
|
if ( ! defined $node->[$char] ) {
|
|
|
|
$node->[$char] = {};
|
|
|
|
$node->[$char]{finish} = length $subval;
|
|
|
|
$node->[$char]{child} = [];
|
|
|
|
} elsif ( ! length $subval ) {
|
|
|
|
$node->[$char]{finish} = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
addtostruct( $node->[$char]{child}, $subval ) if ( length $subval );
|
|
|
|
}
|
|
|
|
|
|
|
|
sub printstruct {
|
|
|
|
my ($node, $pre, $refout) = @_;
|
|
|
|
my $add = 0;
|
|
|
|
my @prop;
|
|
|
|
my $outchild;
|
|
|
|
my $current = 0;
|
|
|
|
my $poschild=0;
|
|
|
|
my @tmp;
|
|
|
|
|
|
|
|
foreach my $i ( 0..255 ) {
|
|
|
|
next if ( !defined $node->[ $i ] );
|
|
|
|
push @prop , { val=>$i,
|
|
|
|
nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ),
|
|
|
|
poschild=>$poschild };
|
|
|
|
$poschild += $prop[$#prop]{nchild};
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0 if $#prop < 0;
|
|
|
|
if ($pre) {
|
|
|
|
$$refout .= ",\n\n";
|
|
|
|
}
|
|
|
|
mkbintree(0,$#prop,\@prop,\@tmp);
|
|
|
|
@prop = @tmp;
|
|
|
|
|
|
|
|
$current=$#prop+1;
|
|
|
|
foreach my $i ( 0..$#prop ) {
|
|
|
|
my $flag = ($prop[$i]{left}) ? 'L' : undef;
|
|
|
|
if ( $node->[ $prop[$i]{val} ]{finish}==0 ) {
|
|
|
|
$flag .= '|' if defined $flag;
|
|
|
|
$flag .= 'F';
|
|
|
|
} elsif ( ! defined $flag ) {
|
|
|
|
$flag='0';
|
|
|
|
}
|
|
|
|
$$refout .= "\t{'".chr( $prop[$i]{val} )."',".
|
|
|
|
$flag.','.
|
|
|
|
$prop[$i]{right}.','.
|
|
|
|
(($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'.
|
|
|
|
(($i==$#prop)? '' : ",\n");
|
|
|
|
|
|
|
|
$maxchild = $prop[$i]{poschild}+$current if
|
|
|
|
( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild );
|
|
|
|
$current--;
|
|
|
|
$add += $prop[$i]{nchild};
|
|
|
|
}
|
|
|
|
$$refout .= $outchild;
|
|
|
|
return $#prop+1 + $add;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|