From 938236a29716c754a9a9238e377c3cd15db11dde Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 1 Aug 2001 18:40:12 +0000 Subject: [PATCH] The fti.pl supplied with the fulltextindex module generates ALL possible substrings of two characters or greater, and is case-sensitive. This patch makes it work correctly. It generates only the suffixes of each word, plus lowercases them - as specified by the README file. This brings it into line with the fti.c function, makes it case-insensitive properly, removes the problem with duplicate rows being returned from an fti search and greatly reduces the size of the generated index table. It was written by my co-worker, Brett Toolin. Christopher Kings-Lynne --- contrib/fulltextindex/fti.pl | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/contrib/fulltextindex/fti.pl b/contrib/fulltextindex/fti.pl index 02bf057e94..230ba92703 100644 --- a/contrib/fulltextindex/fti.pl +++ b/contrib/fulltextindex/fti.pl @@ -1,6 +1,6 @@ #!/usr/bin/perl # -# This script substracts all substrings out of a specific column in a table +# This script extracts all suffixes of all words in a specific column in a table # and generates output that can be loaded into a new table with the # psql '\copy' command. 
The new table should have the following structure: # @@ -52,27 +52,28 @@ $PGRES_BAD_RESPONSE = 5 ; $PGRES_NONFATAL_ERROR = 6 ; $PGRES_FATAL_ERROR = 7 ; +# the minimum length of word to include in the full text index +$MIN_WORD_LENGTH = 2; + +# the minimum length of the substrings in the full text index +$MIN_SUBSTRING_LENGTH = 2; + $[ = 0; # make sure string offsets start at 0 sub break_up { my $string = pop @_; + # convert strings to lower case + $string = lc($string); @strings = split(/\W+/, $string); @subs = (); foreach $s (@strings) { $len = length($s); - next if ($len < 4); - - $lpos = $len-1; - while ($lpos >= 3) { - $fpos = $lpos - 3; - while ($fpos >= 0) { - $sub = substr($s, $fpos, $lpos - $fpos + 1); - push(@subs, $sub); - $fpos = $fpos - 1; - } - $lpos = $lpos - 1; + next if ($len <= $MIN_WORD_LENGTH); + for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) { + $tmp = substr($s, $i); + push(@subs, $tmp); } } -- 2.40.0