granicus.if.org Git - postgresql/commitdiff
Change text search parsing rules for hyphenated words so that digit strings
author: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 27 Oct 2007 19:03:45 +0000 (19:03 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 27 Oct 2007 19:03:45 +0000 (19:03 +0000)
containing decimal points aren't considered part of a hyphenated word.
Sync the hyphenated-word lookahead states with the subsequent part-by-part
reparsing states so that we don't get different answers about how much text
is part of the hyphenated word.  Per my gripe of a few days ago.

src/backend/tsearch/wparser_def.c
src/test/regress/expected/tsearch.out

index 7fa0f435b20c64ecb87cc86cd34d8d7c0e747847..086ac95155801a9c7cb54e5075787b777e80d7f8 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -181,19 +181,13 @@ typedef enum
        TPS_InHyphenWord,
        TPS_InHyphenNumWordFirst,
        TPS_InHyphenNumWord,
-       TPS_InHyphenValueFirst,
-       TPS_InHyphenValue,
-       TPS_InHyphenValueExact,
+       TPS_InHyphenDigitLookahead,
        TPS_InParseHyphen,
        TPS_InParseHyphenHyphen,
        TPS_InHyphenWordPart,
        TPS_InHyphenAsciiWordPart,
        TPS_InHyphenNumWordPart,
        TPS_InHyphenUnsignedInt,
-       TPS_InHDecimalPartFirst,
-       TPS_InHDecimalPart,
-       TPS_InHVersionPartFirst,
-       TPS_InHVersionPart,
        TPS_Null                                        /* last state (fake value) */
 } TParserState;
 
@@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = {
 
 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
        {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
 };
 
-static const TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
+static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
-       {NULL, 0, A_POP, TPS_Null, 0, NULL}
-};
-
-static const TParserStateActionItem actionTPS_InHyphenValue[] = {
-       {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
-       {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
-       {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
-       {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
-};
-
-static const TParserStateActionItem actionTPS_InHyphenValueExact[] = {
-       {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
-       {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
-       {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
-       {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
+       {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
        {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
        {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
+       {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
        {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
 };
@@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
 };
 
 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
-       {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
-       {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
-       {p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
-       {NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
-};
-
-static const TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
-       {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
-       {NULL, 0, A_POP, TPS_Null, 0, NULL}
-};
-
-static const TParserStateActionItem actionTPS_InHDecimalPart[] = {
-       {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
-       {p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
-       {NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
-};
-
-static const TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+       {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHVersionPart[] = {
-       {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
-       {p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
-       {NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
-};
-
 
 /*
  * main table of per-state parser actions
@@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = {
        TPARSERSTATEACTION(TPS_InHyphenWord),
        TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
        TPARSERSTATEACTION(TPS_InHyphenNumWord),
-       TPARSERSTATEACTION(TPS_InHyphenValueFirst),
-       TPARSERSTATEACTION(TPS_InHyphenValue),
-       TPARSERSTATEACTION(TPS_InHyphenValueExact),
+       TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
        TPARSERSTATEACTION(TPS_InParseHyphen),
        TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
        TPARSERSTATEACTION(TPS_InHyphenWordPart),
        TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
        TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
-       TPARSERSTATEACTION(TPS_InHyphenUnsignedInt),
-       TPARSERSTATEACTION(TPS_InHDecimalPartFirst),
-       TPARSERSTATEACTION(TPS_InHDecimalPart),
-       TPARSERSTATEACTION(TPS_InHVersionPartFirst),
-       TPARSERSTATEACTION(TPS_InHVersionPart)
+       TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
 };
 
 
index 9de795913414952d4445e75072ad3f104bd1b1e2..b6f8f05d228506add6dbe6fcaa27b12b59fa84bb 100644 (file)
@@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
     12 | . 
     20 | 4.2
     12 | , 
-    15 | readline-4.2
-    11 | readline
-    12 | -
-    20 | 4.2
+     1 | readline
+    20 | -4.2
     12 |  
-    15 | readline-4.2
-    11 | readline
-    12 | -
-    20 | 4.2
+     1 | readline
+    20 | -4.2
     12 | . 
     22 | 234
     12 | 
@@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
     12 |  
     12 | <> 
      1 | qwerty
-(135 rows)
+(131 rows)
 
 SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                                                                                                             to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                              
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
+                                                                                                                                                                                                                                                                                                                                                                                                                      to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                       
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
 (1 row)
 
 SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">