granicus.if.org Git - postgresql/commitdiff
August 13, 2002
author Bruce Momjian <bruce@momjian.us>
Thu, 15 Aug 2002 03:02:08 +0000 (03:02 +0000)
committer Bruce Momjian <bruce@momjian.us>
Thu, 15 Aug 2002 03:02:08 +0000 (03:02 +0000)
         Use parser of OpenFTS v0.33.

--
Teodor Sigaev

contrib/tsearch/README.tsearch
contrib/tsearch/deflex.h
contrib/tsearch/expected/tsearch.out
contrib/tsearch/morph.c
contrib/tsearch/parser.l

index c63ae91edd096cfbf4646e565f4cd1c41b08e85e..a57df55eea79f39d9f4a7bcb9a39185680266152 100644 (file)
@@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
 All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
 (oleg@sai.msu.su).
 
+CHANGES:
+
+August 13, 2002
+       Use parser of OpenFTS v0.33.
+
 IMPORTANT NOTICE:
 
 This is a first step of our work on integration of OpenFTS
index f9d6847167988e8a11aae2226b134757ecf65b02..17c4fdf1ec3e765bfb71bb0f98c85b6c3805512e 100644 (file)
@@ -2,28 +2,33 @@
 #define __DEFLEX_H__
 
 /* rememder !!!! */
-#define LASTNUM                19
+#define LASTNUM                23
 
 #define LATWORD                1
-#define NONLATINWORD   2
+#define CYRWORD                2
 #define UWORD          3
 #define EMAIL          4
 #define FURL           5
 #define HOST           6
-#define FLOAT          7
-#define FINT           8
-#define PARTWORD       9
-#define NONLATINPARTWORD       10
-#define LATPARTWORD            11
-#define SPACE          12
-#define SYMTAG         13
-#define HTTP           14
-#define DEFISWORD      15
-#define DEFISLATWORD   16
-#define DEFISNONLATINWORD      17
+#define SCIENTIFIC     7
+#define VERSIONNUMBER  8
+#define PARTHYPHENWORD         9       
+#define CYRPARTHYPHENWORD      10      
+#define LATPARTHYPHENWORD      11      
+#define SPACE          12
+#define TAG            13
+#define HTTP           14
+#define HYPHENWORD     15
+#define LATHYPHENWORD  16
+#define CYRHYPHENWORD  17
 #define URI            18
 #define FILEPATH       19
+#define DECIMAL                20
+#define SIGNEDINT      21
+#define UNSIGNEDINT    22
+#define HTMLENTITY     23
 
 extern const char *descr[];
 
 #endif
+
index f75b429bcbb436f671a14fe6017753cd5fb0931d..0b12765d8f6cf0335703f54fe0c2007685867ad2 100644 (file)
@@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
 select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                            txt2txtidx                                                                                                                                                                                                                                                                                                                                             
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
+                                                                                                                                                                                                                                                                                                                                                   txt2txtidx                                                                                                                                                                                                                                                                                                                                                   
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
 (1 row)
 
 select txtidxsize(txt2txtidx('345 qw'));
@@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e
 <i <b> wow  < jqw <> qwerty'));
  txtidxsize 
 ------------
-         52
+         53
 (1 row)
 
 insert into test_txtidx (a) values ('345 qwerty');
index 60797b07e92441c7f218c16d775e22e658a249ed..b29a3f6779dbe57e786f04312a7fb72d7fcb928d 100644 (file)
@@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
        {NODICT, NODICT},                       /* EMAIL                */
        {NODICT, NODICT},                       /* FURL                 */
        {NODICT, NODICT},                       /* HOST                 */
-       {NODICT, NODICT},                       /* FLOAT                */
-       {NODICT, NODICT},                       /* FINT                 */
-       {BYLOCALE, DEFAULTDICT},        /* PARTWORD             */
-       {BYLOCALE, NODICT},                     /* NONLATINPARTWORD */
-       {DEFAULTDICT, NODICT},          /* LATPARTWORD          */
+       {NODICT, NODICT},                       /* SCIENTIFIC           */
+       {NODICT, NODICT},                       /* VERSIONNUMBER                */
+       {BYLOCALE, DEFAULTDICT},        /* PARTHYPHENWORD               */
+       {BYLOCALE, NODICT},                     /* CYRPARTHYPHENWORD */
+       {DEFAULTDICT, NODICT},          /* LATPARTHYPHENWORD            */
        {STOPLEXEM, NODICT},            /* SPACE                */
-       {STOPLEXEM, NODICT},            /* SYMTAG               */
+       {STOPLEXEM, NODICT},            /* TAG          */
        {STOPLEXEM, NODICT},            /* HTTP                 */
-       {BYLOCALE, DEFAULTDICT},        /* DEFISWORD            */
-       {DEFAULTDICT, NODICT},          /* DEFISLATWORD         */
-       {BYLOCALE, NODICT},                     /* DEFISNONLATINWORD    */
+       {BYLOCALE, DEFAULTDICT},        /* HYPHENWORD           */
+       {DEFAULTDICT, NODICT},          /* LATHYPHENWORD                */
+       {BYLOCALE, NODICT},                     /* CYRHYPHENWORD        */
        {NODICT, NODICT},                       /* URI                  */
-       {NODICT, NODICT}                        /* FILEPATH             */
+       {NODICT, NODICT},                       /* FILEPATH             */
+       {NODICT, NODICT},                       /* DECIMAL              */
+       {NODICT, NODICT},                       /* SIGNEDINT            */
+       {NODICT, NODICT},                       /* UNSIGNEDINT          */
+       {STOPLEXEM, NODICT}                     /* HTMLENTITY           */
 };
 
 static bool inited = false;
index 6081fd4c7bec02bcbada539f43a6d65a3eb7cb5f..f30fbcd4f4608a8b104c1b1ce678e1c7a31af5bf 100644 (file)
@@ -5,18 +5,17 @@
 
 /* postgres allocation function */
 #include "postgres.h"
-#define free   pfree
-#define malloc palloc
+#define free    pfree
+#define malloc  palloc
 #define realloc repalloc
 
 #ifdef strdup
 #undef strdup
 #endif
-#define strdup pstrdup
-
+#define strdup  pstrdup
 
 char *token = NULL;  /* pointer to token */
-char *s     = NULL;  /* for returning full defis-word */
+char *s     = NULL;  /* to return WHOLE hyphenated-word */
 
 YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
 
@@ -57,21 +56,21 @@ int bytestoread = 0;        /* for limiting read from filehandle */
 %option nounput
 %option noyywrap
 
-
-/* parser's state for parsing defis-word */
+/* parser's state for parsing hyphenated-word */
 %x DELIM  
 /* parser's state for parsing URL*/
 %x URL  
 %x SERVER  
 
-/* parser's state for parsing filepath */
-
+/* parser's state for parsing TAGS */
 %x INTAG
 %x QINTAG
+%x INCOMMENT
+%x INSCRIPT
 
-/* NONLATIN char */
-NONLATINALNUM  [0-9\200-\377]
-NONLATINALPHA  [\200-\377]
+/* cyrillic koi8 char */
+CYRALNUM       [0-9\200-\377]
+CYRALPHA       [\200-\377]
 ALPHA          [a-zA-Z\200-\377]
 ALNUM          [0-9a-zA-Z\200-\377]
 
@@ -81,66 +80,59 @@ URI         [-_[:alnum:]/%,\.;=&?#]+
 
 %%
 
-"<"[[:alpha:]] { BEGIN INTAG;
-       token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
- }
-
-"</"[[:alpha:]]        { BEGIN INTAG;
-       token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
- }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
 
-"<>" {
+<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+       BEGIN INITIAL; 
+       *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; 
        token = tsearch_yytext;
        tokenlen = tsearch_yyleng;
-       return SYMTAG;
+       return SPACE;
 }
 
-"<"[^>[:alpha:]] { 
+"<!--" { BEGIN INCOMMENT; }
+
+<INCOMMENT>"-->"       { 
+       BEGIN INITIAL;
+       *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; 
        token = tsearch_yytext;
        tokenlen = tsearch_yyleng;
        return SPACE;
 }
 
-<INTAG>"\""    { BEGIN QINTAG;
-       token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
- }
 
-<QINTAG>"\\\"" {
-       token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
-}
+"<"[\![:alpha:]]       { BEGIN INTAG; }
 
-<QINTAG>"\""   { BEGIN INTAG;
-       token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
- }
+"</"[[:alpha:]]        { BEGIN INTAG; }
 
-<QINTAG>.|\n   {
+<INTAG>"\""    { BEGIN QINTAG; }
+
+<QINTAG>"\\\"" ;
+
+<QINTAG>"\""   { BEGIN INTAG; }
+
+<INTAG>">"     { 
+       BEGIN INITIAL;
        token = tsearch_yytext;
-       tokenlen = tsearch_yyleng;
-       return SYMTAG;
+       *tsearch_yytext=' '; 
+       token = tsearch_yytext;
+       tokenlen = 1;
+       return TAG;
 }
 
-<INTAG>">"     { BEGIN INITIAL;
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n  ;
+
+\&(quot|amp|nbsp|lt|gt)\;   {
        token = tsearch_yytext;
        tokenlen = tsearch_yyleng;
-       return SYMTAG;
- }
+       return HTMLENTITY;
+}
 
-<INTAG>.|\n    {
+\&\#[0-9][0-9]?[0-9]?\; {
        token = tsearch_yytext;
        tokenlen = tsearch_yyleng;
-       return SYMTAG;
+       return HTMLENTITY;
 }
-
  
 [-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
        token = tsearch_yytext; 
@@ -148,22 +140,34 @@ URI               [-_[:alnum:]/%,\.;=&?#]+
        return EMAIL; 
 }
 
-<DELIM,INITIAL>[0-9]   /* digit's and point (might be a version) */ { 
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */   { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return FINT; 
+       return SCIENTIFIC; 
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+       token = tsearch_yytext;
+       tokenlen = tsearch_yyleng;
+       return VERSIONNUMBER;
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+       token = tsearch_yytext;
+       tokenlen = tsearch_yyleng;
+       return DECIMAL;
 }
 
-<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9]     /* digit's and point (might be a version) */ { 
+[+-][0-9]+ { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return FINT; 
+       return SIGNEDINT; 
 }
 
-[+-]?[0-9\.]+[eE][+-]?[0-9]+  /* float */      { 
+<DELIM,INITIAL>[0-9]+ { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return FLOAT; 
+       return UNSIGNEDINT; 
 }
 
 http"://"        { 
@@ -208,52 +212,58 @@ ftp"://"        {
        return FILEPATH;
 }
 
-({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */      {
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */        {
        BEGIN DELIM;
        if (s) { free(s); s=NULL; } 
        s = strdup( tsearch_yytext );
        tokenlen = tsearch_yyleng;
        yyless( 0 );
        token = s;
-       return DEFISNONLATINWORD;
+       return CYRHYPHENWORD;
 }
 
-([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */      {
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */      {
         BEGIN DELIM;
        if (s) { free(s); s=NULL; } 
-       tokenlen = tsearch_yyleng;
        s = strdup( tsearch_yytext );
+       tokenlen = tsearch_yyleng;
        yyless( 0 );
        token = s;
-       return DEFISLATWORD;
+       return LATHYPHENWORD;
 }
 
-({ALNUM}+-)+{ALPHA}+ /* composite-word */      {
+({ALNUM}+-)+{ALNUM}+ /* composite-word */      {
        BEGIN DELIM;
        if (s) { free(s); s=NULL; } 
        s = strdup( tsearch_yytext );
        tokenlen = tsearch_yyleng;
        yyless( 0 );
        token = s;
-       return DEFISWORD;
+       return HYPHENWORD;
+}
+
+<DELIM>\+?[0-9]+\.[0-9]+ {
+       token = tsearch_yytext;
+       tokenlen = tsearch_yyleng;
+       return DECIMAL;
 }
 
-<DELIM>{NONLATINALNUM}+  /* one word in composite-word */      { 
+<DELIM>{CYRALPHA}+  /* one word in composite-word */   { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return NONLATINPARTWORD; 
+       return CYRPARTHYPHENWORD; 
 }
 
-<DELIM>[[:alnum:]]+  /* one word in composite-word */  { 
+<DELIM>[[:alpha:]]+  /* one word in composite-word */  { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return LATPARTWORD; 
+       return LATPARTHYPHENWORD; 
 }
 
 <DELIM>{ALNUM}+  /* one word in composite-word */      { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return PARTWORD; 
+       return PARTHYPHENWORD; 
 }
 
 <DELIM>-  { 
@@ -264,17 +274,16 @@ ftp"://"        {
 
 <DELIM,SERVER,URL>.|\n /* return in basic state */     {
        BEGIN INITIAL;
-       tokenlen = tsearch_yyleng;
        yyless( 0 );
 }
 
-{NONLATINALNUM}+ /* normal word */     { 
+{CYRALPHA}+ /* normal word */  { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
-       return NONLATINWORD; 
+       return CYRWORD; 
 }
 
-[[:alnum:]]+ /* normal word */ { 
+[[:alpha:]]+ /* normal word */ { 
        token = tsearch_yytext; 
        tokenlen = tsearch_yyleng;
        return LATWORD; 
@@ -286,7 +295,13 @@ ftp"://"        {
        return UWORD; 
 }
 
-.|\n {
+[ \r\n\t]+ {
+       token = tsearch_yytext;
+       tokenlen = tsearch_yyleng;
+       return SPACE;
+}
+
+. {
        token = tsearch_yytext;
        tokenlen = tsearch_yyleng;
        return SPACE;