granicus.if.org Git - postgresql/commitdiff
Change descriptions of entity and tag objects to "XML entity" and "XML tag".
author: Andrew Dunstan <andrew@dunslane.net>
Tue, 20 Nov 2007 02:25:22 +0000 (02:25 +0000)
committer: Andrew Dunstan <andrew@dunslane.net>
Tue, 20 Nov 2007 02:25:22 +0000 (02:25 +0000)
Allow tag and entity names that follow XML rules. Provide for hexadecimal
as well as decimal numeric entities. Adjust code names to coincide with
new descriptions.

doc/src/sgml/textsearch.sgml
src/backend/tsearch/wparser_def.c
src/test/regress/expected/tsearch.out

index b43872cca5c3898f2dc790a3e8ddc0b1eb923953..61583df3a2105add6aa329763bb0eed9c9197e81 100644 (file)
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.36 2007/11/16 03:23:07 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.37 2007/11/20 02:25:22 adunstan Exp $ -->
 
 <chapter id="textsearch">
  <title id="textsearch-title">Full Text Search</title>
@@ -1862,12 +1862,12 @@ LIMIT 10;
      </row>
      <row>
       <entry><literal>tag</></entry>
-      <entry>HTML tag</entry>
-      <entry><literal>&lt;A HREF="dictionaries.html"&gt;</literal></entry>
+      <entry>XML tag</entry>
+      <entry><literal>&lt;a href="dictionaries.html"&gt;</literal></entry>
      </row>
      <row>
       <entry><literal>entity</></entry>
-      <entry>HTML entity</entry>
+      <entry>XML entity</entry>
       <entry><literal>&amp;amp;</literal></entry>
      </row>
      <row>
index 3f95f60579ea9b1cab5ef276449ae0ddd79c164b..b80175456d2ee3ef0699eef2f3eeeac58c24bff0 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -50,7 +50,7 @@
 #define DECIMAL                        20
 #define SIGNEDINT              21
 #define UNSIGNEDINT            22
-#define HTMLENTITY             23
+#define XMLENTITY              23
 
 #define LASTNUM                        23
 
@@ -95,7 +95,7 @@ static const char *const lex_descr[] = {
        "Hyphenated word part, all letters",
        "Hyphenated word part, all ASCII",
        "Space symbols",
-       "HTML tag",
+       "XML tag",
        "Protocol head",
        "Hyphenated word, letters and digits",
        "Hyphenated word, all ASCII",
@@ -105,7 +105,7 @@ static const char *const lex_descr[] = {
        "Decimal notation",
        "Signed integer",
        "Unsigned integer",
-       "HTML entity"
+       "XML entity"
 };
 
 
@@ -132,11 +132,13 @@ typedef enum
        TPS_InMantissaFirst,
        TPS_InMantissaSign,
        TPS_InMantissa,
-       TPS_InHTMLEntityFirst,
-       TPS_InHTMLEntity,
-       TPS_InHTMLEntityNumFirst,
-       TPS_InHTMLEntityNum,
-       TPS_InHTMLEntityEnd,
+       TPS_InXMLEntityFirst,
+       TPS_InXMLEntity,
+       TPS_InXMLEntityNumFirst,
+       TPS_InXMLEntityNum,
+       TPS_InXMLEntityHexNumFirst,
+       TPS_InXMLEntityHexNum,
+       TPS_InXMLEntityEnd,
        TPS_InTagFirst,
        TPS_InXMLBegin,
        TPS_InTagCloseFirst,
@@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = {
        {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
        {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
-       {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL},
+       {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
        {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
@@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = {
        {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
+static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
-       {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
+       {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
+       {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHTMLEntity[] = {
+static const TParserStateActionItem actionTPS_InXMLEntity[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
-       {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
+       {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
+       {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = {
+static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
+       {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHTMLEntityNum[] = {
+static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-       {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
-       {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
+       {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static const TParserStateActionItem actionTPS_InHTMLEntityEnd[] = {
-       {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL}
+static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
+       {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+       {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
+       {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
+       {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
+       {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+       {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
+       {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
+       {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
+       {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
 };
 
 static const TParserStateActionItem actionTPS_InTagFirst[] = {
@@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = {
 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        /* <?xml ... */
+    /* XXX do we want states for the m and l?  Right now this accepts <?xZ */
        {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
-       {p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1278,11 +1301,13 @@ static const TParserStateAction Actions[] = {
        TPARSERSTATEACTION(TPS_InMantissaFirst),
        TPARSERSTATEACTION(TPS_InMantissaSign),
        TPARSERSTATEACTION(TPS_InMantissa),
-       TPARSERSTATEACTION(TPS_InHTMLEntityFirst),
-       TPARSERSTATEACTION(TPS_InHTMLEntity),
-       TPARSERSTATEACTION(TPS_InHTMLEntityNumFirst),
-       TPARSERSTATEACTION(TPS_InHTMLEntityNum),
-       TPARSERSTATEACTION(TPS_InHTMLEntityEnd),
+       TPARSERSTATEACTION(TPS_InXMLEntityFirst),
+       TPARSERSTATEACTION(TPS_InXMLEntity),
+       TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
+       TPARSERSTATEACTION(TPS_InXMLEntityNum),
+       TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
+       TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
+       TPARSERSTATEACTION(TPS_InXMLEntityEnd),
        TPARSERSTATEACTION(TPS_InTagFirst),
        TPARSERSTATEACTION(TPS_InXMLBegin),
        TPARSERSTATEACTION(TPS_InTagCloseFirst),
@@ -1556,9 +1581,9 @@ prsd_end(PG_FUNCTION_ARGS)
 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
 
-#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
+#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
 #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
-#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
 #define NOENDTOKEN(x)  ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
 
@@ -1839,7 +1864,7 @@ prsd_headline(PG_FUNCTION_ARGS)
                }
                else
                {
-                       if (HTMLHLIDIGNORE(prs->words[i].type))
+                       if (XMLHLIDIGNORE(prs->words[i].type))
                                prs->words[i].replace = 1;
                }
 
index b6f8f05d228506add6dbe6fcaa27b12b59fa84bb..eb004020758e8b826acbc598078b5b6ca4f42254 100644 (file)
@@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default');
     10 | hword_part      | Hyphenated word part, all letters
     11 | hword_asciipart | Hyphenated word part, all ASCII
     12 | blank           | Space symbols
-    13 | tag             | HTML tag
+    13 | tag             | XML tag
     14 | protocol        | Protocol head
     15 | numhword        | Hyphenated word, letters and digits
     16 | asciihword      | Hyphenated word, all ASCII
@@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default');
     20 | float           | Decimal notation
     21 | int             | Signed integer
     22 | uint            | Unsigned integer
-    23 | entity          | HTML entity
+    23 | entity          | XML entity
 (23 rows)
 
 SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">