7 getopts('l:he:s:ap:o:m:f', \%opt);
9 if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) {
11 Generator of variant of the Lovin's stemmer which
12 uses a longest match algorithm.
14 Author Teodor Sigaev <teodor\@stack.net>
16 $0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ]
17 -e FILENAME - file with endings of word
18 -s FILENAME - file with list of stop-word
19 -o FILENAME - out file, default STDOUT
20 -a - stop-word are strimmed
21 -p PREFIX - prefix of function and etc, default strimmed locale
22 -m NUMBER - minimal length of rest after semming, default 3
23 -l LOCALENAME - name of locale
24 -f - do not call tolower for each char
25 At least one of -e or -s must be defined
30 if ( ! defined $opt{p} ) {
32 $opt{p}=~s/[^a-zA-Z0-9_]+//g;
34 $opt{m}=3 if ! defined $opt{m};
36 my ($enddata,$stopdata) = ('','');
41 buildtree(\@tree, $opt{e}, 1);
42 printstruct( \@tree, 0, \$enddata);
48 buildtree(\@tree, $opt{s}, 0);
49 printstruct( \@tree, 0, \$stopdata);
54 die "No data\n" if ( ! (length $enddata || length $stopdata) );
56 $enddata = "\t{0,0,0,0}" if ( ! length $enddata );
57 $stopdata = "\t{0,0,0,0}" if ( ! length $stopdata );
61 open(OUT,">$opt{o}") || die "Can;t open file '$opt{o}' for writing\n";
65 my $linktype = 'uint32';
66 if ( $maxchild <= 0xff ) {
68 } elsif ( $maxchild <= 0xffff ) {
72 my $wherecheck = ( $opt{a} ) ?
73 "NULL,\n\t$opt{p}_is_stopword"
75 "$opt{p}_is_stopword,\n\tNULL";
77 my ($tolower, $resttolower) = ('','');
79 $tolower = '*cur = tolower( *cur );';
81 while( cur - buf >= 0 ) {
92 * Variant of the Lovin's stemmer which uses a longest match algorithm.
93 * Endings are stored in a suffix tree.
106 /* is exists left tree ? */
108 /* finish word flag */
110 #define ISLEFT(x) ((($opt{p}_NODE*)x)->flag & L)
111 #define ISFINISH(x) ((($opt{p}_NODE*)x)->flag & F)
113 #define MINLENREST $opt{m}
115 static $opt{p}_NODE $opt{p}_endstree[]={
119 static $opt{p}_NODE $opt{p}_stoptree[]={
124 $opt{p}_stem( void* obj, char *in, int *len ) {
125 $opt{p}_NODE *ptr = $opt{p}_endstree;
127 uint8 *buf = (uint8 *)in;
128 uint8 *cur = buf + (*len) - 1;
130 while( cur - buf >= MINLENREST ) {
132 if ( ptr->val == *cur ) {
133 if ( ISFINISH(ptr) ) result = buf + (*len) - cur;
135 if ( ! ptr->child ) break;
137 } else if ( ptr->val > *cur ) {
155 $opt{p}_is_stopword( void *obj, char *in, int len ) {
156 $opt{p}_NODE *ptr = $opt{p}_stoptree;
158 uint8 *buf = (uint8 *)in;
161 while( cur - buf < len ) {
163 if ( ptr->val == *cur ) {
165 if ( ISFINISH(ptr) ) result = cur - buf;
166 if ( ! ptr->child ) break;
168 } else if ( ptr->val > *cur ) {
180 return (result==len) ? 1 : 0;
188 #endif /* DICT_BODY */
202 close($fh) if ( $fh != \*STDOUT );
206 my ($reftree,$file, $needreverse) = @_;
207 open(DATA,$file) || die "Can't open file '$file'\n";
211 $_ = lc($_) if ! $opt{f};
212 addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ );
218 my ( $start, $stop, $rprop, $rres) = @_;
220 my $middle = $start + int( ($stop-$start)/2 );
222 push( @$rres, $rprop->[$middle] );
224 $rres->[$idx]{right}=0;
225 $rres->[$idx]{left}=0;
226 return 1 if ( $start == $stop );
229 if ( $middle!=$start ) {
230 $rres->[$idx]{left}=1;
231 $leftsize = mkbintree( $start, $middle-1, $rprop, $rres );
232 $rres->[$idx]{right}=$leftsize+1;
234 $rres->[$idx]{right} = 1;
236 return 1 + $leftsize + mkbintree( $middle+1, $stop, $rprop, $rres );
241 my ($char, $subval) = split('', shift, 2);
242 $char = ord( $char );
243 if ( ! defined $node->[$char] ) {
245 $node->[$char]{finish} = length $subval;
246 $node->[$char]{child} = [];
247 } elsif ( ! length $subval ) {
248 $node->[$char]{finish} = 0;
251 addtostruct( $node->[$char]{child}, $subval ) if ( length $subval );
255 my ($node, $pre, $refout) = @_;
263 foreach my $i ( 0..255 ) {
264 next if ( !defined $node->[ $i ] );
265 push @prop , { val=>$i,
266 nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ),
267 poschild=>$poschild };
268 $poschild += $prop[$#prop]{nchild};
271 return 0 if $#prop < 0;
275 mkbintree(0,$#prop,\@prop,\@tmp);
279 foreach my $i ( 0..$#prop ) {
280 my $flag = ($prop[$i]{left}) ? 'L' : undef;
281 if ( $node->[ $prop[$i]{val} ]{finish}==0 ) {
282 $flag .= '|' if defined $flag;
284 } elsif ( ! defined $flag ) {
287 $$refout .= "\t{'".chr( $prop[$i]{val} )."',".
289 $prop[$i]{right}.','.
290 (($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'.
291 (($i==$#prop)? '' : ",\n");
293 $maxchild = $prop[$i]{poschild}+$current if
294 ( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild );
296 $add += $prop[$i]{nchild};
298 $$refout .= $outchild;
299 return $#prop+1 + $add;