granicus.if.org Git - postgresql/commitdiff
Modify the built-in text search parser to handle URLs more nearly according
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 28 Apr 2010 02:04:16 +0000 (02:04 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 28 Apr 2010 02:04:16 +0000 (02:04 +0000)
to RFC 3986.  In particular, these characters now terminate the path part
of a URL: '"', '<', '>', '\', '^', '`', '{', '|', '}'.  The previous behavior
was inconsistent and depended on whether a "?" was present in the path.
Per gripe from Donald Fraser and spec research by Kevin Grittner.

This is a pre-existing bug, but not back-patching since the risks of
breaking existing applications seem to outweigh the benefits.

src/backend/tsearch/wparser_def.c
src/test/regress/expected/tsearch.out
src/test/regress/sql/tsearch.sql

index ab632354c288e420eff8e6a39b61b9765ccc7a5c..a2da9210c4c7128cb4390dd11a628f63dc5947d6 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.29 2010/04/26 17:10:18 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.30 2010/04/28 02:04:16 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -583,6 +583,35 @@ p_isasclet(TParser *prs)
        return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
 }
 
+/*
+ * p_isurlchar
+ *             Test whether the character at the parser's current position is
+ *             acceptable inside the path part of a URL.
+ *
+ * Returns 1 if the current character is a single-byte printable character
+ * (strictly between 0x20 and 0x7F) that is not one of the characters
+ * rejected below; returns 0 otherwise.  The parser position is only read,
+ * never advanced.
+ */
+static int
+p_isurlchar(TParser *prs)
+{
+       char            ch;
+
+       /* no non-ASCII need apply */
+       if (prs->state->charlen != 1)
+               return 0;
+       ch = *(prs->str + prs->state->posbyte);
+       /*
+        * no spaces or control characters
+        *
+        * Single-byte characters with the high bit set are rejected here too:
+        * where plain char is signed they compare as negative (<= 0x20), and
+        * where it is unsigned they are >= 0x7F.
+        */
+       if (ch <= 0x20 || ch >= 0x7F)
+               return 0;
+       /* reject characters disallowed by RFC 3986 */
+       switch (ch)
+       {
+               case '"':
+               case '<':
+               case '>':
+               case '\\':
+               case '^':
+               case '`':
+               case '{':
+               case '|':
+               case '}':
+                       return 0;
+       }
+       /* any other printable ASCII character is accepted */
+       return 1;
+}
+
 
 /* deliberately suppress unused-function complaints for the above */
 void           _make_compiler_happy(void);
@@ -707,9 +736,9 @@ p_isURLPath(TParser *prs)
        int                     res = 0;
 
        tmpprs->state = newTParserPosition(tmpprs->state);
-       tmpprs->state->state = TPS_InFileFirst;
+       tmpprs->state->state = TPS_InURLPathFirst;
 
-       if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
+       if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
        {
                prs->state->posbyte += tmpprs->lenbytetoken;
                prs->state->poschar += tmpprs->lenchartoken;
@@ -1441,7 +1470,6 @@ static const TParserStateActionItem actionTPS_InFileFirst[] = {
        {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
        {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
        {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
-       {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
        {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
@@ -1488,7 +1516,6 @@ static const TParserStateActionItem actionTPS_InFile[] = {
        {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
        {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
-       {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
 };
 
@@ -1502,9 +1529,7 @@ static const TParserStateActionItem actionTPS_InFileNext[] = {
 
 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
-       {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
-       {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
+       {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL},
 };
 
@@ -1514,9 +1539,7 @@ static const TParserStateActionItem actionTPS_InURLPathStart[] = {
 
 static const TParserStateActionItem actionTPS_InURLPath[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
-       {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
-       {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
-       {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
+       {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
 };
 
index 1cd9186d6929e1dbdffd18c4cae106b4010e5c65..86ea5efc7ba1fec944b2d2a225a28c20c5e72c46 100644 (file)
@@ -287,8 +287,10 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
      6 | 4aew.werc.ewr
     12 |  
     14 | http://
+     5 | 5aew.werc.ewr:8100/?
      6 | 5aew.werc.ewr:8100
-    12 | /?  
+    18 | /?
+    12 |   
      1 | ad
     12 | =
      1 | qwe
@@ -391,14 +393,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
     12 |  
     12 | <> 
      1 | qwerty
-(131 rows)
+(133 rows)
 
 SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                                                                                                      to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                       
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- '+4.0e-10':26 '-4.2':58,60 '/?ad=qwe&dw':7,10,14,22 '/?ad=qwe&dw=%20%32':25 '/awdf/dwqe/4325':46 '/usr/local/fff':45 '/wqe-324/ewr':49 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':61 '234.435':30 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':54,55,56 '455':31 '4aew.werc.ewr':15 '5.005':32 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100':24 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 'ad':17 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':37 'dw':19 'efd.r':3 'ewr1':43 'ewri2':44 'gist.c':52 'gist.h':50 'gist.h.c':51 'hjwer':42 'jf':39 'jqw':64 'qwe':2,18,27,28,35 'qwe-wer':34 'qwer':38 'qwerti':65 'qwqwe':29 'readlin':53,57,59 'rewt/ewr':47 'sdjk':40 'teodor@stack.net':33 'wefjn':48 'wer':36 'wow':63 'www.com':4
+                                                                                                                                                                                                                                                                                                                                                                                                                                       to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                        
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teodor@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4
 (1 row)
 
 SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
@@ -406,7 +408,7 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae
 <i <b> wow  < jqw <> qwerty'));
  length 
 --------
-     51
+     53
 (1 row)
 
 -- ts_debug
@@ -424,6 +426,44 @@ SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi
  tag       | XML tag         | </myns:foo-bar_baz.blurfl> | {}             |              | 
 (9 rows)
 
+-- check parsing of URLs
+SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
+  alias   |  description  |                 token                  | dictionaries | dictionary |                 lexemes                  
+----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
+ protocol | Protocol head | http://                                | {}           |            | 
+ url      | URL           | www.harewoodsolutions.co.uk/press.aspx | {simple}     | simple     | {www.harewoodsolutions.co.uk/press.aspx}
+ host     | Host          | www.harewoodsolutions.co.uk            | {simple}     | simple     | {www.harewoodsolutions.co.uk}
+ url_path | URL path      | /press.aspx                            | {simple}     | simple     | {/press.aspx}
+ tag      | XML tag       | </span>                                | {}           |            | 
+(5 rows)
+
+SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
+  alias   |  description  |           token            | dictionaries | dictionary |           lexemes            
+----------+---------------+----------------------------+--------------+------------+------------------------------
+ protocol | Protocol head | http://                    | {}           |            | 
+ url      | URL           | aew.wer0c.ewr/id?ad=qwe&dw | {simple}     | simple     | {aew.wer0c.ewr/id?ad=qwe&dw}
+ host     | Host          | aew.wer0c.ewr              | {simple}     | simple     | {aew.wer0c.ewr}
+ url_path | URL path      | /id?ad=qwe&dw              | {simple}     | simple     | {/id?ad=qwe&dw}
+ tag      | XML tag       | <span>                     | {}           |            | 
+(5 rows)
+
+SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
+  alias   |  description  |        token         | dictionaries | dictionary |        lexemes         
+----------+---------------+----------------------+--------------+------------+------------------------
+ protocol | Protocol head | http://              | {}           |            | 
+ url      | URL           | 5aew.werc.ewr:8100/? | {simple}     | simple     | {5aew.werc.ewr:8100/?}
+ host     | Host          | 5aew.werc.ewr:8100   | {simple}     | simple     | {5aew.werc.ewr:8100}
+ url_path | URL path      | /?                   | {simple}     | simple     | {/?}
+(4 rows)
+
+SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
+  alias   | description |         token          | dictionaries | dictionary |         lexemes          
+----------+-------------+------------------------+--------------+------------+--------------------------
+ url      | URL         | 5aew.werc.ewr:8100/?xx | {simple}     | simple     | {5aew.werc.ewr:8100/?xx}
+ host     | Host        | 5aew.werc.ewr:8100     | {simple}     | simple     | {5aew.werc.ewr:8100}
+ url_path | URL path    | /?xx                   | {simple}     | simple     | {/?xx}
+(3 rows)
+
 -- to_tsquery
 SELECT to_tsquery('english', 'qwe & sKies ');
   to_tsquery   
index 3467b1f6de115424cc5bb9e4b225bd6a090a0e79..3c0a7dd82a819f217ebbceb8299b1cf7c7f79c45 100644 (file)
@@ -105,6 +105,12 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae
 
 SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi&#245;jkl</myns:foo-bar_baz.blurfl>');
 
+-- check parsing of URLs
+SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
+SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
+SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
+SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
+
 -- to_tsquery
 
 SELECT to_tsquery('english', 'qwe & sKies ');