]> granicus.if.org Git - postgresql/commitdiff
Text parser rewritten:
authorTeodor Sigaev <teodor@sigaev.ru>
Mon, 21 Nov 2005 12:27:57 +0000 (12:27 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Mon, 21 Nov 2005 12:27:57 +0000 (12:27 +0000)
        - supports multibyte encodings
        - more strict rules for lexemes
        - flex isn't used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operations for tsquery, for combining
          a tsquery from its parts:  'foo & bar' || 'asd' => 'foo & bar | asd'

15 files changed:
contrib/tsearch2/Makefile
contrib/tsearch2/expected/tsearch2.out
contrib/tsearch2/query.c
contrib/tsearch2/query_support.c
contrib/tsearch2/sql/tsearch2.sql
contrib/tsearch2/ts_locale.c [new file with mode: 0644]
contrib/tsearch2/ts_locale.h [new file with mode: 0644]
contrib/tsearch2/tsearch.sql.in
contrib/tsearch2/wordparser/Makefile
contrib/tsearch2/wordparser/deflex.c
contrib/tsearch2/wordparser/deflex.h
contrib/tsearch2/wordparser/parser.c [new file with mode: 0644]
contrib/tsearch2/wordparser/parser.h
contrib/tsearch2/wordparser/parser.l [deleted file]
contrib/tsearch2/wparser_def.c

index 4901b611ee1e0648f914d8190213b5a533a89492..2ef904ddb4e01629b80c70b8f0a67caf7bce5dcd 100644 (file)
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
 
 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
@@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
        wparser.o wparser_def.o \
        ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
        tsvector_op.o rank.o ts_stat.o \
-       query_util.o query_support.o query_rewrite.o query_gist.o
+       query_util.o query_support.o query_rewrite.o query_gist.o \
+       ts_locale.o
 
 SUBDIRS     := snowball ispell wordparser
 SUBDIROBJS  := $(SUBDIRS:%=%/SUBSYS.o)
index 296c0ac676f874dd6e2ed3ed7996df6a8eb8ff0f..a98c2216a8da55d0074aa1c35e3a4190532fa376 100644 (file)
@@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE:  argument type tsvector is only a shell
 psql:tsearch2.sql:396: NOTICE:  type "tsquery" is not yet defined
 DETAIL:  Creating a shell type definition.
 psql:tsearch2.sql:401: NOTICE:  argument type tsquery is only a shell
-psql:tsearch2.sql:544: NOTICE:  type "gtsvector" is not yet defined
+psql:tsearch2.sql:559: NOTICE:  type "gtsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:549: NOTICE:  argument type gtsvector is only a shell
-psql:tsearch2.sql:998: NOTICE:  type "gtsq" is not yet defined
+psql:tsearch2.sql:564: NOTICE:  argument type gtsvector is only a shell
+psql:tsearch2.sql:1054: NOTICE:  type "gtsq" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:1003: NOTICE:  argument type gtsq is only a shell
+psql:tsearch2.sql:1059: NOTICE:  argument type gtsq is only a shell
 --tsvector
 SELECT '1'::tsvector;
  tsvector 
@@ -653,7 +653,7 @@ select * from token_type('default');
     11 | lpart_hword  | Latin part of hyphenated word
     12 | blank        | Space symbols
     13 | tag          | HTML Tag
-    14 | http         | HTTP head
+    14 | protocol     | Protocol head
     15 | hword        | Hyphenated word
     16 | lhword       | Latin hyphenated word
     17 | nlhword      | Non-latin hyphenated word
@@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
 -------+--------------------------------------
     22 | 345
     12 |  
-     4 | qwe@efd.r
-    12 |  
-    12 | '
-    12 |  
+     1 | qwe
+    12 | @
+    19 | efd.r
+    12 |  
     14 | http://
      6 | www.com
-    12 | /
-    12 |  
+    12 | / 
     14 | http://
      5 | aew.werc.ewr/?ad=qwe&dw
      6 | aew.werc.ewr
@@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      6 | 4aew.werc.ewr
     12 |  
     14 | http://
-     5 | 5aew.werc.ewr:8100/?
-     6 | 5aew.werc.ewr
-    18 | :8100/?
-    12 |   
+     6 | 5aew.werc.ewr:8100
+    12 | /?  
      1 | ad
     12 | =
      1 | qwe
@@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      1 | dw
     12 |  
      5 | 6aew.werc.ewr:8100/?ad=qwe&dw
-     6 | 6aew.werc.ewr
-    18 | :8100/?ad=qwe&dw
+     6 | 6aew.werc.ewr:8100
+    18 | /?ad=qwe&dw
     12 |  
      5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
-     6 | 7aew.werc.ewr
-    18 | :8100/?ad=qwe&dw=%20%32
+     6 | 7aew.werc.ewr:8100
+    18 | /?ad=qwe&dw=%20%32
     12 |  
      7 | +4.0e-10
     12 |  
@@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
      1 | jf
     12 |  
      1 | sdjk
-    13 | <we hjwer <werrwe>
+    12 | <
+     1 | we
     12 |  
-     3 | ewr1
-    12 | >
+     1 | hjwer
+    12 |  
+    13 | <werrwe>
     12 |  
+     3 | ewr1
+    12 | > 
      3 | ewri2
     12 |  
     13 | <a href="qwe<qwe>">
@@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
     12 |  
     19 | /wqe-324/ewr
     12 |  
-     6 | gist.h
-    12 |  
-     6 | gist.h.c
+    19 | gist.h
     12 |  
-     6 | gist.c
-    12 | .
+    19 | gist.h.c
     12 |  
+    19 | gist.c
+    12 | . 
      1 | readline
     12 |  
     20 | 4.2
     12 |  
     20 | 4.2
-    12 | .
-    12 |  
+    12 | . 
     20 | 4.2
-    12 | ,
-    12 |  
-    15 | readline-4
+    12 | , 
+    15 | readline-4.2
     11 | readline
     12 | -
     20 | 4.2
     12 |  
-    15 | readline-4
+    15 | readline-4.2
     11 | readline
     12 | -
     20 | 4.2
-    12 | .
-    12 |  
+    12 | . 
     22 | 234
     12 |  
 
-    13 | <i <b>
+    12 | <
+     1 | i
+    12 |  
+    13 | <b>
     12 |  
      1 | wow
     12 |   
-    12 | <
-    12 |  
+    12 | < 
      1 | jqw
     12 |  
-    12 | <
-    12 | >
-    12 |  
+    12 | <> 
      1 | qwerty
-(138 rows)
+(135 rows)
 
 SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                                                                                                                               to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                                
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24
+                                                                                                                                                                                                                                                                                                                                                                                                                             to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                              
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
 (1 row)
 
 SELECT length(to_tsvector('default', '345 qw'));
@@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae
 <i <b> wow  < jqw <> qwerty'));
  length 
 --------
-     53
+     51
 (1 row)
 
 select to_tsquery('default', 'qwe & sKies '); 
@@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj');
  '1' & 'fghj'
 (1 row)
 
+select plainto_tsquery('default', 'the and z 1))& fghj');
+  plainto_tsquery   
+--------------------
+ 'z' & '1' & 'fghj'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
+       ?column?        
+-----------------------
+ 'foo' & 'bar' & 'asd'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
+           ?column?           
+------------------------------
+ 'foo' & 'bar' | 'asd' & 'fg'
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
+             ?column?              
+-----------------------------------
+ 'foo' & 'bar' | !( 'asd' & 'fg' )
+(1 row)
+
+select plainto_tsquery('default', 'foo bar') && 'asd | fg';
+             ?column?             
+----------------------------------
+ 'foo' & 'bar' & ( 'asd' | 'fg' )
+(1 row)
+
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
  ?column? 
 ----------
index e6f1ae3a8984fb36086edd3bb6da04fce160cfbe..e312cf6af7166f00ffde947583200552dd5cc35b 100644 (file)
@@ -51,10 +51,20 @@ Datum               to_tsquery_name(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(to_tsquery_current);
 Datum          to_tsquery_current(PG_FUNCTION_ARGS);
 
+PG_FUNCTION_INFO_V1(plainto_tsquery);
+Datum          plainto_tsquery(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(plainto_tsquery_name);
+Datum          plainto_tsquery_name(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(plainto_tsquery_current);
+Datum          plainto_tsquery_current(PG_FUNCTION_ARGS);
+
 /* parser's states */
 #define WAITOPERAND 1
 #define WAITOPERATOR   2
 #define WAITFIRSTOPERAND 3
+#define WAITSINGLEOPERAND 4
 
 /*
  * node of query tree, also used
@@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                                else if (*(state->buf) != ' ')
                                        return ERR;
                                break;
+                       case WAITSINGLEOPERAND:
+                               if ( *(state->buf) == '\0' ) 
+                                       return END;
+                               *strval = state->buf;
+                               *lenval = strlen( state->buf );
+                               state->buf += strlen( state->buf );
+                               state->count++;
+                               return VAL;     
                        default:
                                return ERR;
                                break;
@@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
  * input
  */
 static QUERYTYPE *
-                       queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id)
+queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
 {
        QPRS_STATE      state;
        int4            i;
@@ -599,7 +617,7 @@ static QUERYTYPE *
 
        /* init state */
        state.buf = buf;
-       state.state = WAITFIRSTOPERAND;
+       state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
        state.count = 0;
        state.num = 0;
        state.str = NULL;
@@ -679,7 +697,7 @@ Datum
 tsquery_in(PG_FUNCTION_ARGS)
 {
        SET_FUNCOID();
-       PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0));
+       PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
 }
 
 /*
@@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
        str = text2char(in);
        PG_FREE_IF_COPY(in, 1);
 
-       query = queryin(str, pushval_morph, PG_GETARG_INT32(0));
+       query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
        
        if ( query->size == 0 )
                PG_RETURN_POINTER(query);
@@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
                                                                                Int32GetDatum(get_currcfg()),
                                                                                PG_GETARG_DATUM(0)));
 }
+
+Datum
+plainto_tsquery(PG_FUNCTION_ARGS) /* arg 0: int32 handed to queryin() as cfg_id; arg 1: text to convert */
+{
+       text       *in = PG_GETARG_TEXT_P(1);
+       char       *str;
+       QUERYTYPE  *query;
+       ITEM       *res;
+       int4            len;
+
+       SET_FUNCOID();
+
+       str = text2char(in);
+       PG_FREE_IF_COPY(in, 1);
+
+       query = queryin(str, pushval_morph, PG_GETARG_INT32(0), true); /* isplain = true: parsing starts in WAITSINGLEOPERAND */
+       
+       if ( query->size == 0 ) /* nothing was produced: return the empty query unchanged */
+               PG_RETURN_POINTER(query);
+
+       res = clean_fakeval_v2(GETQUERY(query), &len);
+       if (!res) /* cleanup left nothing: turn the result into an empty query */
+       {
+               query->len = HDRSIZEQT;
+               query->size = 0;
+               PG_RETURN_POINTER(query);
+       }
+       memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(ITEM)); /* overwrite the item array with the cleaned items */
+       pfree(res);
+       PG_RETURN_POINTER(query);
+}
+
+Datum
+plainto_tsquery_name(PG_FUNCTION_ARGS) /* arg 0: configuration name (text); arg 1: forwarded untouched */
+{
+       text       *name = PG_GETARG_TEXT_P(0);
+       Datum           res;
+
+       SET_FUNCOID();
+       res = DirectFunctionCall2(plainto_tsquery, /* delegate to the two-argument form above */
+                                                         Int32GetDatum(name2id_cfg(name)), /* name resolved to an id via name2id_cfg() */
+                                                         PG_GETARG_DATUM(1));
+
+       PG_FREE_IF_COPY(name, 0);
+       PG_RETURN_DATUM(res);
+}
+
+Datum
+plainto_tsquery_current(PG_FUNCTION_ARGS) /* single-argument form: arg 0 is forwarded as the text argument */
+{
+       SET_FUNCOID();
+       PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery, /* delegate to the two-argument form */
+                                                                               Int32GetDatum(get_currcfg()), /* id supplied by get_currcfg() */
+                                                                               PG_GETARG_DATUM(0)));
+}
+
index c973def7d4df67472e42f3a1b3db395edd473707..edc2d48fcfbe7f31c54d1b660238bcb1fef29a04 100644 (file)
@@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
        PG_RETURN_INT32(nnode);
 }
 
+static QTNode* 
+join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) { /* build a two-child operator node over the trees of a and b */
+       QTNode  *res=(QTNode*)palloc0( sizeof(QTNode) );
+
+       res->flags |= QTN_NEEDFREE;
+
+       res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
+       res->valnode->type = OPR; /* valnode->val (the operator character) is left zeroed here */
+
+       res->child = (QTNode**)palloc0( sizeof(QTNode*)*2 );
+       res->child[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) ); /* note the order: b is stored first ... */
+       res->child[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) ); /* ... and a second */
+       res->nchild = 2;
+
+       return res;
+}
+
+PG_FUNCTION_INFO_V1(tsquery_and);
+Datum           tsquery_and(PG_FUNCTION_ARGS);
+
+Datum
+tsquery_and(PG_FUNCTION_ARGS) { /* combine two tsqueries under a '&' operator node */
+       QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+       QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
+       QTNode  *res;
+       QUERYTYPE  *query;
+
+       if ( a->size == 0 ) { /* a is empty: the result is just b */
+               PG_FREE_IF_COPY(a,0); /* a was fetched from argument 0, so compare against that slot */
+               PG_RETURN_POINTER(b);
+       } else if ( b->size == 0 ) { /* b is empty: the result is just a */
+               PG_FREE_IF_COPY(b,1);
+               PG_RETURN_POINTER(a);
+       }
+
+       res = join_tsqueries(a, b);
+
+       res->valnode->val = '&'; /* join_tsqueries() leaves the operator character unset */
+
+       query = QTN2QT( res, PlainMemory );
+
+       QTNFree(res);
+       PG_FREE_IF_COPY(a,0);
+       PG_FREE_IF_COPY(b,1);
+
+       PG_RETURN_POINTER(query);
+}
+
+PG_FUNCTION_INFO_V1(tsquery_or);
+Datum           tsquery_or(PG_FUNCTION_ARGS);
+
+Datum
+tsquery_or(PG_FUNCTION_ARGS) { /* combine two tsqueries under a '|' operator node */
+       QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+       QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
+       QTNode  *res;
+       QUERYTYPE  *query;
+
+       if ( a->size == 0 ) { /* a is empty: the result is just b */
+               PG_FREE_IF_COPY(a,0); /* a was fetched from argument 0, so compare against that slot */
+               PG_RETURN_POINTER(b);
+       } else if ( b->size == 0 ) { /* b is empty: the result is just a */
+               PG_FREE_IF_COPY(b,1);
+               PG_RETURN_POINTER(a);
+       }
+
+       res = join_tsqueries(a, b);
+
+       res->valnode->val = '|'; /* join_tsqueries() leaves the operator character unset */
+
+       query = QTN2QT( res, PlainMemory );
+
+       QTNFree(res);
+       PG_FREE_IF_COPY(a,0);
+       PG_FREE_IF_COPY(b,1);
+
+       PG_RETURN_POINTER(query);
+}
+
+PG_FUNCTION_INFO_V1(tsquery_not);
+Datum           tsquery_not(PG_FUNCTION_ARGS);
+
+Datum
+tsquery_not(PG_FUNCTION_ARGS) { /* wrap a tsquery in a one-child '!' operator node */
+       QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
+       QTNode  *res;
+       QUERYTYPE  *query;
+
+       if ( a->size == 0 ) 
+               PG_RETURN_POINTER(a); /* an empty query is returned as is (the detoasted copy) */
+
+       res=(QTNode*)palloc0( sizeof(QTNode) );
+
+       res->flags |= QTN_NEEDFREE;
+
+       res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
+       res->valnode->type = OPR;
+       res->valnode->val = '!';
+
+       res->child = (QTNode**)palloc0( sizeof(QTNode*) ); /* room for exactly one child */
+       res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
+       res->nchild = 1;
+
+       query = QTN2QT( res, PlainMemory );
+
+       QTNFree(res);
+       PG_FREE_IF_COPY(a,0);
+
+       PG_RETURN_POINTER(query);
+}
+
 static int
 CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
        if ( a->size != b->size ) {
index 0923ce7a19755022e181fcaea6d567152dfa3c0a..bd0baa3b41d4d88603a28194634119f75fab56ce 100644 (file)
@@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
 select to_tsquery('default', '(asd&and)|fghj');
 select to_tsquery('default', '(asd&!and)|fghj');
 select to_tsquery('default', '(the|and&(i&1))&fghj');
+
+select plainto_tsquery('default', 'the and z 1))& fghj');
+select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
+select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
+select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
+select plainto_tsquery('default', 'foo bar') && 'asd | fg';
+
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
 select 'a b:89  ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c
new file mode 100644 (file)
index 0000000..b84681f
--- /dev/null
@@ -0,0 +1,61 @@
+#include "ts_locale.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
+
+
+#if defined(TS_USE_WIDE) && defined(WIN32)
+
+size_t
+wchar2char( const char *to, const wchar_t *from, size_t len ) { /* NOTE(review): 'to' is written through, so its const qualifier looks wrong; change it together with the prototype in ts_locale.h */
+       if (GetDatabaseEncoding() == PG_UTF8) { /* UTF-8 database: convert with the Win32 API instead of wcstombs() */
+               int     r;
+
+               if (len==0)
+                       return 0;
+
+               r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, /* -1: 'from' is NUL-terminated (as wcstombs() below also requires); len bounds the output */
+                               NULL, NULL);
+
+               
+               if ( r<=0 ) /* the API reports failure as 0 */
+                       ereport(ERROR,
+                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                                       errmsg("UTF-16 to UTF-8 translation failed: %lu",
+                                               GetLastError())));
+
+               return r - 1; /* the API counts the terminating NUL it wrote; wcstombs() does not */
+       }
+
+       return wcstombs(to, from, len);
+}
+
+size_t 
+char2wchar( const wchar_t *to, const char *from, size_t len ) { /* NOTE(review): 'to' is written through, so its const qualifier looks wrong; change it together with the prototype in ts_locale.h */
+       if (GetDatabaseEncoding() == PG_UTF8) { /* UTF-8 database: convert with the Win32 API instead of mbstowcs() */
+               int     r;
+
+               if (len==0)
+                       return 0;
+
+               r = MultiByteToWideChar(CP_UTF8, 0, from, len, /* len is used both as the input byte count ... */
+                       to, len); /* ... and as the output capacity in wide characters */
+
+               if (!r) {
+                       pg_verifymbstr(from, len, false); /* called with noError = false before the generic error below */
+                       ereport(ERROR,
+                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                               errmsg("invalid multibyte character for locale"),
+                               errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+               }
+
+               Assert(r <= len); /* the result cannot exceed the capacity passed above */
+
+               return r;
+       }
+       
+       return mbstowcs(to, from, len);
+}
+
+#endif
diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h
new file mode 100644 (file)
index 0000000..a7ce6f1
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef __TSLOCALE_H__
+#define __TSLOCALE_H__
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <limits.h>
+
+/*
+ * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
+ * declare them in <wchar.h>.
+ */
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+#ifdef HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
+
+#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
+#define TS_USE_WIDE
+
+#ifdef WIN32
+
+size_t wchar2char( const char *to, const wchar_t *from, size_t len );
+size_t char2wchar( const wchar_t *to, const char *from, size_t len );
+
+#else /* WIN32 */
+
+/* correct mbstowcs */
+#define char2wchar mbstowcs
+#define wchar2char wcstombs
+
+#endif /* WIN32 */
+#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */ 
+
+#endif  /* __TSLOCALE_H__ */
index 9bdf641e121d239c00ec6cd5eecd6faef3895436..4fdf974d0d1defe50f07fa97d905dfb3a558e119 100644 (file)
@@ -427,6 +427,21 @@ RETURNS tsquery
 AS 'MODULE_PATHNAME','to_tsquery_current'
 LANGUAGE 'c' with (isstrict,iscachable);
 
+CREATE FUNCTION plainto_tsquery(oid, text) -- form taking an oid as first argument
+RETURNS tsquery
+AS 'MODULE_PATHNAME' -- no link symbol given: the C symbol defaults to the SQL function name
+LANGUAGE 'c' with (isstrict,iscachable);
+
+CREATE FUNCTION plainto_tsquery(text, text) -- form taking a text first argument
+RETURNS tsquery
+AS 'MODULE_PATHNAME','plainto_tsquery_name'
+LANGUAGE 'c' with (isstrict,iscachable);
+
+CREATE FUNCTION plainto_tsquery(text) -- single-argument form
+RETURNS tsquery
+AS 'MODULE_PATHNAME','plainto_tsquery_current'
+LANGUAGE 'c' with (isstrict,iscachable);
+
 --operations
 CREATE FUNCTION exectsq(tsvector, tsquery)
 RETURNS bool
@@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
         language 'C'
         with (isstrict,iscachable);
 
+CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery) -- support function for the && operator below
+        returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_and'
+        language 'C'
+        with (isstrict,iscachable);
+
+CREATE OPERATOR && (
+        LEFTARG = tsquery,
+        RIGHTARG = tsquery,
+        PROCEDURE = tsquery_and,
+        COMMUTATOR = '&&',
+        RESTRICT = contsel, -- NOTE(review): selectivity estimators are meant for boolean operators; this one returns tsquery - confirm
+        JOIN = contjoinsel
+);
+
+CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery) -- support function for the || operator below
+        returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_or'
+        language 'C'
+        with (isstrict,iscachable);
+
+CREATE OPERATOR || (
+        LEFTARG = tsquery,
+        RIGHTARG = tsquery,
+        PROCEDURE = tsquery_or,
+        COMMUTATOR = '||',
+        RESTRICT = contsel, -- NOTE(review): same concern as for && above in this statement's sibling - returns tsquery, not boolean
+        JOIN = contjoinsel
+);
+
+CREATE OR REPLACE FUNCTION tsquery_not(tsquery) -- support function for the !! operator below
+        returns tsquery
+        as 'MODULE_PATHNAME', 'tsquery_not'
+        language 'C'
+        with (isstrict,iscachable);
+
+CREATE OPERATOR !! ( -- prefix operator: only RIGHTARG is given
+        RIGHTARG = tsquery,
+        PROCEDURE = tsquery_not
+);
+
 --------------rewrite subsystem
 
 CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
index 0070970e2165e054eea0d1afe88c528c02ee2e6d..c4eceba60bb22b2e515908786b8b2c43a7608fe3 100644 (file)
@@ -1,8 +1,8 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
 
 SUBOBJS =  parser.o deflex.o
 
-EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
+EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
 
 PG_CPPFLAGS = -I$(srcdir)/..
 
@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
 
 all: SUBSYS.o
 
-parser.c: parser.l
-ifdef FLEX
-       $(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
-else
-       @$(missing) flex $< $@
-endif
-
 SUBSYS.o: $(SUBOBJS)
        $(LD) $(LDREL) $(LDOUT) $@ $^
 
index bbf3271b666f4682143cc69f200e8759b084f15a..8f93d277a1e21a0942a2068d8ffaeff3362f4c85 100644 (file)
@@ -15,7 +15,7 @@ const char *lex_descr[] = {
        "Latin part of hyphenated word",
        "Space symbols",
        "HTML Tag",
-       "HTTP head",
+       "Protocol head",
        "Hyphenated word",
        "Latin hyphenated word",
        "Non-latin hyphenated word",
@@ -42,7 +42,7 @@ const char *tok_alias[] = {
        "lpart_hword",
        "blank",
        "tag",
-       "http",
+       "protocol",
        "hword",
        "lhword",
        "nlhword",
index 651d1f9e77301352fac86b64faa7c2eff87c9141..893f8430515ea4990f41e466e3c91cfea55e6a88 100644 (file)
@@ -17,7 +17,7 @@
 #define LATPARTHYPHENWORD      11
 #define SPACE          12
 #define TAG                    13
-#define HTTP           14
+#define PROTOCOL               14
 #define HYPHENWORD     15
 #define LATHYPHENWORD  16
 #define CYRHYPHENWORD  17
diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c
new file mode 100644 (file)
index 0000000..e414a86
--- /dev/null
@@ -0,0 +1,1028 @@
+#include "postgres.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
+
+#include "deflex.h"
+#include "parser.h"
+#include "ts_locale.h"
+
+
+/*
+ * Allocate a new parser position.  When prev is given, the new position
+ * starts out as a copy of it; otherwise it is zero-filled.  In both cases
+ * the result is linked back to prev and its pushedAtAction is reset.
+ */
+static TParserPosition*
+newTParserPosition(TParserPosition *prev) {
+       TParserPosition *pos = (TParserPosition*)palloc(sizeof(TParserPosition));
+
+       if ( prev == NULL )
+               memset(pos, 0, sizeof(TParserPosition));
+       else
+               memcpy(pos, prev, sizeof(TParserPosition));
+
+       pos->pushedAtAction = NULL;
+       pos->prev = prev;
+
+       return pos;
+}
+
+/*
+ * Create a parser over the buffer str of len bytes.  The buffer is
+ * referenced, not copied.  The parser starts in state TPS_Base.
+ */
+TParser*
+TParserInit( char *str, int len ) {
+       TParser *parser = (TParser*)palloc0( sizeof(TParser) );
+
+       parser->str = str;
+       parser->lenstr = len;
+       parser->charmaxlen = pg_database_encoding_max_length();
+
+       /* byte-wise processing unless the wide-char path is chosen below */
+       parser->usewide = false;
+
+#ifdef TS_USE_WIDE
+       /*
+        * Use wide char code only when max encoding length > 1 and ctype != C.
+        * Some operating systems fail with multi-byte encodings and a C locale.
+        * Also, for a C locale there is no need to process as multibyte.
+        * (Rule taken from backend/utils/adt/oracle_compat.c.)
+        */
+       if ( parser->charmaxlen > 1 && !lc_ctype_is_c() ) {
+               parser->usewide = true;
+               parser->wstr = (wchar_t*)palloc( sizeof(wchar_t) * parser->lenstr );
+               parser->lenwstr = char2wchar( parser->wstr, parser->str, parser->lenstr );
+       }
+#endif
+
+       parser->state = newTParserPosition(NULL);
+       parser->state->state = TPS_Base;
+
+       return parser;
+}
+
+/*
+ * Release a parser: every position on the state chain, the wide-char
+ * copy of the input (if one was made) and the parser struct itself.
+ * The input buffer passed to TParserInit is not freed.
+ */
+void
+TParserClose( TParser* prs ) {
+       TParserPosition *older;
+
+       for ( ; prs->state != NULL; prs->state = older ) {
+               older = prs->state->prev;
+               pfree( prs->state );
+       }
+
+       if ( prs->wstr != NULL )
+               pfree( prs->wstr );
+       pfree( prs );
+}
+
+/*
+ * Support functions equivalent to the is*() macros, but working with
+ * any possible encoding and locale.
+ *
+ * p_iswhat(type) expands to two predicates over the character at the
+ * parser's current position: p_is<type>() and its negation
+ * p_isnot<type>().
+ */
+
+#ifdef TS_USE_WIDE 
+
+/*
+ * Wide build: classify with isw<type>() on the wide-char copy when
+ * prs->usewide is set, otherwise with is<type>() on the current byte.
+ */
+#define p_iswhat(type)                                                                                 \
+static int                                                                                     \
+p_is##type(TParser *prs) {                                                                     \
+       Assert( prs->state );                                                                   \
+       return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
+               is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );               \
+}                                                                                              \
+                                                                                               \
+static int                                                                                     \
+p_isnot##type(TParser *prs) {                                                                  \
+       return !p_is##type(prs);                                                                \
+}
+
+
+
+/*
+ * Returns 1 if the current character is the single-byte character c.
+ * p_iseq should be used only for ascii symbols.
+ */
+
+static int
+p_iseq(TParser *prs, char c) {
+       Assert( prs->state );
+       return ( ( prs->state->charlen==1 && *( prs->str + prs->state->posbyte ) == c ) ) ? 1 : 0;
+}
+
+#else /* TS_USE_WIDE */
+
+/* Single-byte build: classify the current byte with is<type>(). */
+#define p_iswhat(type)                                                                                 \
+static int                                                                                     \
+p_is##type(TParser *prs) {                                                                     \
+       Assert( prs->state );                                                                   \
+       return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );                  \
+}                                                                                              \
+                                                                                               \
+static int                                                                                     \
+p_isnot##type(TParser *prs) {                                                                  \
+       return !p_is##type(prs);                                                                \
+}
+
+
+/* Returns 1 if the current byte equals c, 0 otherwise. */
+static int
+p_iseq(TParser *prs, char c) {
+       Assert( prs->state );
+       return ( *( prs->str + prs->state->posbyte ) == c ) ? 1 : 0;
+}
+
+#endif /* TS_USE_WIDE */
+
+/*
+ * Instantiate p_is<type>() / p_isnot<type>() for each character class
+ * listed here.
+ */
+p_iswhat(alnum)
+p_iswhat(alpha)
+p_iswhat(digit)
+p_iswhat(lower)
+p_iswhat(print)
+p_iswhat(punct)
+p_iswhat(space)
+p_iswhat(upper)
+p_iswhat(xdigit)
+
+/*
+ * Returns 1 when the byte position has reached the end of the input or
+ * the current character length is zero, 0 otherwise.
+ */
+static int
+p_isEOF(TParser *prs) {
+       Assert( prs->state );
+
+       if ( prs->state->posbyte == prs->lenstr )
+               return 1;
+
+       return ( prs->state->charlen == 0 ) ? 1 : 0;
+}
+
+/* Returns 1 if the current character equals prs->c, 0 otherwise. */
+static int
+p_iseqC(TParser *prs) {
+       int     same = p_iseq(prs, prs->c);
+
+       return same;
+}
+
+/* Returns 1 if the current character differs from prs->c, 0 otherwise. */
+static int
+p_isneC(TParser *prs) {
+       return p_iseq(prs, prs->c) ? 0 : 1;
+}
+
+/*
+ * Returns 1 if the current character is one byte long and that byte
+ * satisfies isascii(), 0 otherwise.
+ */
+static int
+p_isascii(TParser *prs) {
+       if ( prs->state->charlen != 1 )
+               return 0;
+
+       return isascii( (unsigned char) *( prs->str + prs->state->posbyte ) ) ? 1 : 0;
+}
+
+/* Returns 1 if the current character is alphabetic and ASCII. */
+static int
+p_islatin(TParser *prs) {
+       if ( !p_isalpha(prs) )
+               return 0;
+
+       return p_isascii(prs) ? 1 : 0;
+}
+
+/* Returns 1 if the current character is alphabetic but not ASCII. */
+static int
+p_isnonlatin(TParser *prs) {
+       if ( !p_isalpha(prs) )
+               return 0;
+
+       return p_isascii(prs) ? 0 : 1;
+}
+
+/*
+ * References every generated p_is*() / p_isnot*() predicate (plus
+ * p_isEOF, p_iseqC and p_isneC) once; judging by its name this exists
+ * only to silence "defined but not used" warnings for static functions
+ * that no action table happens to reference.
+ *
+ * It must never actually be called: each predicate dereferences its
+ * TParser argument, and NULL is passed here.
+ */
+void _make_compiler_happy(void);
+void
+_make_compiler_happy(void) {
+       p_isalnum(NULL);        p_isnotalnum(NULL);
+       p_isalpha(NULL);        p_isnotalpha(NULL);
+       p_isdigit(NULL);        p_isnotdigit(NULL);
+       p_islower(NULL);        p_isnotlower(NULL);
+       p_isprint(NULL);        p_isnotprint(NULL);
+       p_ispunct(NULL);        p_isnotpunct(NULL);
+       p_isspace(NULL);        p_isnotspace(NULL);
+       p_isupper(NULL);        p_isnotupper(NULL);
+       p_isxdigit(NULL);       p_isnotxdigit(NULL);
+       p_isEOF(NULL);  
+       p_iseqC(NULL);  p_isneC(NULL);
+}
+
+
+/*
+ * Special handler for tags: switches prs->ignore on when the lexeme
+ * collected so far is "<script" or "<style", and off again for
+ * "</script" or "</style" (compared case-insensitively).  The lexeme
+ * length in characters selects which names are tried.
+ */
+static void
+SpecialTags(TParser *prs) {
+       if ( prs->state->lencharlexeme == 8 ) {
+               /* </script */
+               if ( pg_strncasecmp( prs->lexeme, "</script", 8 ) == 0 )
+                       prs->ignore = false;
+       } else if ( prs->state->lencharlexeme == 7 ) {
+               /* </style takes precedence, then <script */
+               if ( pg_strncasecmp( prs->lexeme, "</style", 7 ) == 0 )
+                       prs->ignore = false;
+               else if ( pg_strncasecmp( prs->lexeme, "<script", 7 ) == 0 )
+                       prs->ignore = true;
+       } else if ( prs->state->lencharlexeme == 6 ) {
+               /* <style */
+               if ( pg_strncasecmp( prs->lexeme, "<style", 6 ) == 0 )
+                       prs->ignore = true;
+       }
+}
+
+/*
+ * Special handler: rewinds the current position (chars and bytes) by
+ * the length of the lexeme collected so far and sets prs->wanthost.
+ */
+static void
+SpecialFURL(TParser *prs) {
+       prs->state->poschar -= prs->state->lencharlexeme;
+       prs->state->posbyte -= prs->state->lenbytelexeme;
+       prs->wanthost = true;
+}
+
+/*
+ * Special handler: rewinds the current position (chars and bytes) by
+ * the length of the lexeme collected so far.
+ */
+static void
+SpecialHyphen(TParser *prs) {
+       prs->state->poschar -= prs->state->lencharlexeme;
+       prs->state->posbyte -= prs->state->lenbytelexeme;
+}
+
+/*
+ * Returns 1 if prs->wanthost was set, and clears the flag, so the
+ * predicate fires at most once per time the flag is raised.
+ */
+static int
+p_isstophost(TParser *prs) {
+       int     hostWanted = ( prs->wanthost ) ? 1 : 0;
+
+       prs->wanthost = false;
+
+       return hostWanted;
+}
+
+/* Returns 1 while prs->ignore is set, 0 otherwise. */
+static int
+p_isignore(TParser *prs) {
+       if ( prs->ignore )
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Runs a temporary parser over the rest of the input, starting at the
+ * current byte position.  If its first token is a HOST, the current
+ * position and lexeme lengths are advanced past that token and 1 is
+ * returned; otherwise nothing is changed and 0 is returned.
+ */
+static int
+p_ishost(TParser *prs) {
+       TParser *sub = TParserInit( prs->str+prs->state->posbyte, prs->lenstr - prs->state->posbyte );
+       int     matched = ( TParserGet(sub) && sub->type == HOST ) ? 1 : 0;
+
+       if ( matched ) {
+               /* absorb the sub-parser's lexeme into our own */
+               prs->state->lenbytelexeme += sub->lenbytelexeme;
+               prs->state->lencharlexeme += sub->lencharlexeme;
+               prs->state->posbyte += sub->lenbytelexeme;
+               prs->state->poschar += sub->lencharlexeme;
+               prs->state->charlen =  sub->state->charlen;
+       }
+
+       TParserClose(sub);
+
+       return matched;
+}
+
+/*
+ * Runs a temporary parser over the rest of the input, started in state
+ * TPS_InFileFirst.  If its first token is a URI or FILEPATH, the current
+ * position and lexeme lengths are advanced past that token and 1 is
+ * returned; otherwise nothing is changed and 0 is returned.
+ */
+static int
+p_isURI(TParser *prs) {
+       TParser *sub = TParserInit( prs->str+prs->state->posbyte, prs->lenstr - prs->state->posbyte );
+       int     matched = 0;
+
+       /* stack a position on top of the initial one and start it in TPS_InFileFirst */
+       sub->state = newTParserPosition( sub->state );
+       sub->state->state = TPS_InFileFirst;
+
+       if ( TParserGet(sub) && (sub->type == URI || sub->type == FILEPATH) ) {
+               /* absorb the sub-parser's lexeme into our own */
+               prs->state->lenbytelexeme += sub->lenbytelexeme;
+               prs->state->lencharlexeme += sub->lencharlexeme;
+               prs->state->posbyte += sub->lenbytelexeme;
+               prs->state->poschar += sub->lencharlexeme;
+               prs->state->charlen =  sub->state->charlen;
+               matched = 1;
+       }
+
+       TParserClose(sub);
+
+       return matched;
+}
+
+/*
+ * Table of state/action of parser
+ *
+ * The A_* values are bit flags for the third column of the action
+ * tables below and may be OR-ed together (e.g. A_BINGO|A_CLRALL).
+ * A_NEXT is zero, i.e. "no flag set".
+ * NOTE(review): the exact effect of each flag is implemented by the
+ * table interpreter, which is outside this section -- confirm there.
+ */
+
+#define A_NEXT         0x0000
+#define A_BINGO                0x0001
+#define A_POP          0x0002
+#define A_PUSH         0x0004
+#define A_RERUN                0x0008
+#define A_CLEAR                0x0010
+#define A_MERGE                0x0020
+#define A_CLRALL       0x0040
+
+/*
+ * Actions for the initial state TPS_Base.
+ *
+ * Columns, as used throughout these tables: predicate (NULL in the last,
+ * catch-all row), character argument for p_iseqC rows, A_* flags, next
+ * state, token type (non-zero on A_BINGO rows), special handler.
+ * NOTE(review): rows appear to be tried top to bottom, so row order is
+ * significant -- confirm against the table interpreter.
+ */
+static TParserStateActionItem actionTPS_Base[] = {
+       {p_isEOF,       0,      A_NEXT,                 TPS_Null,               0,      NULL},
+       {p_iseqC,       '<',    A_PUSH,                 TPS_InTagFirst,         0,      NULL},
+       {p_isignore,    0,      A_NEXT,                 TPS_InSpace,            0,      NULL},
+       {p_islatin,     0,      A_NEXT,                 TPS_InLatWord,          0,      NULL},
+       {p_isnonlatin,  0,      A_NEXT,                 TPS_InCyrWord,          0,      NULL},
+       {p_isdigit,     0,      A_NEXT,                 TPS_InUnsignedInt,      0,      NULL},
+       {p_iseqC,       '-',    A_PUSH,                 TPS_InSignedIntFirst,   0,      NULL},
+       {p_iseqC,       '+',    A_PUSH,                 TPS_InSignedIntFirst,   0,      NULL},
+       {p_iseqC,       '&',    A_PUSH,                 TPS_InHTMLEntityFirst,  0,      NULL},
+       {p_iseqC,       '/',    A_PUSH,                 TPS_InFileFirst,        0,      NULL},
+       {NULL,          0,      A_NEXT,                 TPS_InSpace,            0,      NULL} 
+}; 
+
+
+/* Inside a word of mixed content; yields UWORD at EOF or on any other character. */
+static TParserStateActionItem actionTPS_InUWord[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               UWORD,          NULL},
+       {p_isalnum,     0,      A_NEXT,         TPS_InUWord,            0,              NULL},
+       {p_iseqC,       '@',    A_PUSH,         TPS_InEmail,            0,              NULL},
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFileFirst,        0,              NULL},
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenUWordFirst, 0,              NULL},
+       {NULL,          0,      A_BINGO,        TPS_Base,               UWORD,          NULL}
+};
+
+/*
+ * Inside a word of ASCII letters; yields LATWORD by default.
+ * '.' and '-' each appear twice with A_PUSH and different target states.
+ * NOTE(review): presumably the second row is the fallback tried after
+ * the first pushed state pops back -- confirm.
+ */
+static TParserStateActionItem actionTPS_InLatWord[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               LATWORD,        NULL},
+       {p_islatin,     0,      A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHostFirstDomen,   0,              NULL},
+       {p_iseqC,       '.',    A_PUSH,         TPS_InFileFirst,        0,              NULL},
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHostFirstAN,      0,              NULL},
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenLatWordFirst,0,             NULL},
+       {p_iseqC,       '@',    A_PUSH,         TPS_InEmail,            0,              NULL},
+       {p_iseqC,       ':',    A_PUSH,         TPS_InProtocolFirst,    0,              NULL},
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFileFirst,        0,              NULL},
+       {p_isdigit,     0,      A_PUSH,         TPS_InHost,             0,              NULL},
+       {p_isalnum,     0,      A_NEXT,         TPS_InUWord,            0,              NULL},
+       {NULL,          0,      A_BINGO,        TPS_Base,               LATWORD,        NULL}
+};
+
+/* Inside a word of non-ASCII letters; yields CYRWORD by default. */
+static TParserStateActionItem actionTPS_InCyrWord[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               CYRWORD,        NULL},
+       {p_isnonlatin,  0,      A_NEXT,         TPS_Null,               0,              NULL},
+       {p_isalnum,     0,      A_NEXT,         TPS_InUWord,            0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenCyrWordFirst,0,             NULL},
+       {NULL,          0,      A_BINGO,        TPS_Base,               CYRWORD,        NULL}
+};
+/*
+ * Inside a run of digits with no sign; yields UNSIGNEDINT by default.
+ * '.' is tried first as the start of a host domain, then as a decimal
+ * point (two A_PUSH rows).
+ */
+static TParserStateActionItem actionTPS_InUnsignedInt[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               UNSIGNEDINT,    NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHostFirstDomen,   0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InUDecimalFirst,    0,              NULL}, 
+       {p_iseqC,       'e',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {p_iseqC,       'E',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {p_islatin,     0,      A_PUSH,         TPS_InHost,             0,              NULL}, 
+       {p_isalpha,     0,      A_NEXT,         TPS_InUWord,            0,              NULL}, 
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFileFirst,        0,              NULL},
+       {NULL,          0,      A_BINGO,        TPS_Base,               UNSIGNEDINT,    NULL}
+};
+/* Just after a leading '+' or '-': only a digit continues, anything else pops. */
+static TParserStateActionItem actionTPS_InSignedIntFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT|A_CLEAR, TPS_InSignedInt,        0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Inside the digits of a signed integer; yields SIGNEDINT by default. */
+static TParserStateActionItem actionTPS_InSignedInt[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               SIGNEDINT,      NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InDecimalFirst,     0,              NULL}, 
+       {p_iseqC,       'e',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {p_iseqC,       'E',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               SIGNEDINT,      NULL}
+};
+/*
+ * Inside a run of non-alphanumeric characters; yields SPACE.  The run is
+ * ended before '<', '-', '+', '&' and '/' -- the same characters that
+ * have their own rows in actionTPS_Base -- unless prs->ignore is set,
+ * in which case everything except '<' is consumed.
+ */
+static TParserStateActionItem actionTPS_InSpace[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_iseqC,       '<',    A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_isignore,    0,      A_NEXT,         TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '-',    A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_iseqC,       '+',    A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_iseqC,       '&',    A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_iseqC,       '/',    A_BINGO,        TPS_Base,               SPACE,          NULL}, 
+       {p_isnotalnum,  0,      A_NEXT,         TPS_InSpace,            0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               SPACE,          NULL} 
+};
+
+/*
+ * Decimal, version-number and scientific-notation states.  Each
+ * "...First" state is entered right after a separator ('.', 'e'/'E')
+ * and pops unless a digit (or, for the mantissa, a sign) follows.
+ */
+
+/* Just after the '.' of an unsigned number. */
+static TParserStateActionItem actionTPS_InUDecimalFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InUDecimal,         0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Fraction digits of an unsigned number; yields DECIMAL, a further '.' may start a version number. */
+static TParserStateActionItem actionTPS_InUDecimal[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               DECIMAL,        NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InUDecimal,         0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InVersionFirst,     0,              NULL}, 
+       {p_iseqC,       'e',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {p_iseqC,       'E',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               DECIMAL,        NULL}
+};
+
+/* Just after the '.' of a signed number. */
+static TParserStateActionItem actionTPS_InDecimalFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InDecimal,          0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Fraction digits of a signed number; yields DECIMAL (no version-number continuation here). */
+static TParserStateActionItem actionTPS_InDecimal[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               DECIMAL,        NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InDecimal,          0,              NULL}, 
+       {p_iseqC,       'e',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {p_iseqC,       'E',    A_PUSH,         TPS_InMantissaFirst,    0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               DECIMAL,        NULL}
+};
+
+/* Just after a further '.' following a decimal or version component. */
+static TParserStateActionItem actionTPS_InVersionFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InVersion,          0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Digits of a version component; yields VERSIONNUMBER. */
+static TParserStateActionItem actionTPS_InVersion[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               VERSIONNUMBER,  NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InVersion,          0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InVersionFirst,     0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               VERSIONNUMBER,  NULL}
+};
+
+/* Just after 'e' or 'E': a digit or a sign may follow. */
+static TParserStateActionItem actionTPS_InMantissaFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InMantissa,         0,              NULL}, 
+       {p_iseqC,       '+',    A_NEXT,         TPS_InMantissaSign,     0,              NULL}, 
+       {p_iseqC,       '-',    A_NEXT,         TPS_InMantissaSign,     0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Just after the sign that follows 'e' or 'E': a digit is required. */
+static TParserStateActionItem actionTPS_InMantissaSign[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InMantissa,         0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Digits after 'e'/'E'; yields SCIENTIFIC. */
+static TParserStateActionItem actionTPS_InMantissa[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               SCIENTIFIC,     NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InMantissa,         0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               SCIENTIFIC,     NULL}
+};
+
+/*
+ * HTML entity states, entered from TPS_Base on '&'.  Accepted forms are
+ * '&' + ASCII letters + ';' and '&#' + digits + ';'.  Anything else,
+ * including EOF before the ';', pops.
+ */
+
+/* Just after '&'. */
+static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '#',    A_NEXT,         TPS_InHTMLEntityNumFirst,0,             NULL},
+       {p_islatin,     0,      A_NEXT,         TPS_InHTMLEntity,       0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside the letters of a named entity. */
+static TParserStateActionItem actionTPS_InHTMLEntity[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHTMLEntity,       0,              NULL}, 
+       {p_iseqC,       ';',    A_NEXT,         TPS_InHTMLEntityEnd,    0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Just after "&#": at least one digit is required. */
+static TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_isdigit,     0,      A_NEXT,         TPS_InHTMLEntityNum,    0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside the digits of a numeric entity. */
+static TParserStateActionItem actionTPS_InHTMLEntityNum[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHTMLEntityNum,    0,              NULL}, 
+       {p_iseqC,       ';',    A_NEXT,         TPS_InHTMLEntityEnd,    0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* After the closing ';': unconditionally yields HTMLENTITY. */
+static TParserStateActionItem actionTPS_InHTMLEntityEnd[] = {
+       {NULL,          0,      A_BINGO|A_CLEAR,TPS_Base,               HTMLENTITY,     NULL}
+};
+
+/*
+ * HTML tag states, entered from TPS_Base on '<'.  A complete tag yields
+ * TAG; input that does not look like a tag pops.
+ */
+
+/* Just after '<': expects '/', '!' or an ASCII letter. */
+static TParserStateActionItem actionTPS_InTagFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '/',    A_PUSH,         TPS_InTagCloseFirst,    0,              NULL},
+       {p_iseqC,       '!',    A_PUSH,         TPS_InCommentFirst,     0,              NULL},
+       {p_islatin,     0,      A_PUSH,         TPS_InTag,              0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Just after "</": an ASCII letter is required. */
+static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_islatin,     0,      A_NEXT,         TPS_InTag,              0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/*
+ * Inside a tag.  SpecialTags is attached to the rows for '>' and for
+ * whitespace, i.e. the two rows that can follow the tag name.
+ */
+static TParserStateActionItem actionTPS_InTag[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '>',    A_NEXT,         TPS_InTagEnd,           0,              SpecialTags}, 
+       {p_iseqC,       '\'',   A_NEXT,         TPS_InTagEscapeK,       0,              NULL},
+       {p_iseqC,       '"',    A_NEXT,         TPS_InTagEscapeKK,      0,              NULL},
+       {p_islatin,     0,      A_NEXT,         TPS_Null,               0,              NULL},
+       {p_isdigit,     0,      A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '=',    A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '#',    A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '%',    A_NEXT,         TPS_Null,               0,              NULL},
+       {p_isspace,     0,      A_NEXT,         TPS_Null,               0,              SpecialTags},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside a single-quoted string within a tag. */
+static TParserStateActionItem actionTPS_InTagEscapeK[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '\\',   A_PUSH,         TPS_InTagBackSleshed,   0,              NULL},
+       {p_iseqC,       '\'',   A_NEXT,         TPS_InTag,              0,              NULL},
+       {NULL,          0,      A_NEXT,         TPS_InTagEscapeK,       0,              NULL}
+};
+
+/* Inside a double-quoted string within a tag. */
+static TParserStateActionItem actionTPS_InTagEscapeKK[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '\\',   A_PUSH,         TPS_InTagBackSleshed,   0,              NULL},
+       {p_iseqC,       '"',    A_NEXT,         TPS_InTag,              0,              NULL},
+       {NULL,          0,      A_NEXT,         TPS_InTagEscapeKK,      0,              NULL}
+};
+
+/* Just after a backslash inside a quoted string: any character is merged (A_MERGE). */
+static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {NULL,          0,      A_MERGE,        TPS_Null,               0,              NULL}
+};
+
+/* After the closing '>': unconditionally yields TAG. */
+static TParserStateActionItem actionTPS_InTagEnd[] = {
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_Base,              TAG,            NULL}
+};
+
+/*
+ * HTML comment states, entered from actionTPS_InTagFirst on '!'.
+ * A complete comment is reported with token type TAG.
+ */
+
+/* Just after "<!": expects the first '-'. */
+static TParserStateActionItem actionTPS_InCommentFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_InCommentLast,      0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Just after "<!-": expects the second '-'. */
+static TParserStateActionItem actionTPS_InCommentLast[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_InComment,          0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside the comment body: everything is consumed, '-' may begin the terminator. */
+static TParserStateActionItem actionTPS_InComment[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_InCloseCommentFirst,0,              NULL},
+       {NULL,          0,      A_NEXT,         TPS_Null,               0,              NULL}
+};
+
+/* After one '-' in the body: a second '-' is needed, otherwise back to the body. */
+static TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_InCloseCommentLast, 0,              NULL},
+       {NULL,          0,      A_NEXT,         TPS_InComment,          0,              NULL}
+};
+
+/* After "--" in the body: '>' ends the comment, further '-' characters keep waiting. */
+static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_iseqC,       '-',    A_NEXT,         TPS_Null,               0,              NULL},
+       {p_iseqC,       '>',    A_NEXT,         TPS_InCommentEnd,       0,              NULL},
+       {NULL,          0,      A_NEXT,         TPS_InComment,          0,              NULL}
+};
+
+/* After the closing "-->": unconditionally yields TAG. */
+static TParserStateActionItem actionTPS_InCommentEnd[] = {
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_Base,              TAG,            NULL}
+};
+
+/*
+ * Just after a '.' inside a host name: an ASCII letter starts a domain
+ * component, a digit continues as a generic host component, anything
+ * else pops.
+ */
+static TParserStateActionItem actionTPS_InHostFirstDomen[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHostDomenSecond,  0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHost,             0,              NULL}, 
+       /*
+        * Disabled rows, kept for reference (block comment because C++-style
+        * comments are not valid C89):
+        *   {p_iseqC, '-', A_POP, TPS_InHostFirstAN,    0, NULL},
+        *   {p_iseqC, '.', A_POP, TPS_InHostFirstDomen, 0, NULL},
+        */
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/*
+ * Remaining host-name states.  Only actionTPS_InHostDomen and
+ * actionTPS_InPort can yield a HOST token; the other states pop when
+ * the input stops looking like a host.
+ */
+
+/* After the first letter of a domain component: a second letter is needed before HOST can be yielded. */
+static TParserStateActionItem actionTPS_InHostDomenSecond[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHostDomen,        0,              NULL}, 
+       {p_isdigit,     0,      A_PUSH,         TPS_InHost,             0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHostFirstAN,      0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHostFirstDomen,   0,              NULL}, 
+       {p_iseqC,       '@',    A_PUSH,         TPS_InEmail,            0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/*
+ * Inside a domain component of two or more letters; yields HOST.
+ * p_isdigit appears twice: first with A_PUSH, then with A_POP.
+ * NOTE(review): presumably the second row is reached only after the
+ * pushed TPS_InHost attempt has failed -- confirm.
+ * When p_isstophost fires, HOST is yielded and parsing continues in
+ * TPS_InURIStart instead of TPS_Base.
+ */
+static TParserStateActionItem actionTPS_InHostDomen[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_Base,              HOST,           NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHostDomen,        0,              NULL}, 
+       {p_isdigit,     0,      A_PUSH,         TPS_InHost,             0,              NULL}, 
+       {p_iseqC,       ':',    A_PUSH,         TPS_InPortFirst,        0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHostFirstAN,      0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHostFirstDomen,   0,              NULL}, 
+       {p_iseqC,       '@',    A_PUSH,         TPS_InEmail,            0,              NULL}, 
+       {p_isdigit,     0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_isstophost,  0,      A_BINGO|A_CLRALL,TPS_InURIStart,        HOST,           NULL},
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFURL,             0,              NULL},
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_Base,              HOST,           NULL}
+};
+
+/* Just after ':' following a host: at least one digit is required. */
+static TParserStateActionItem actionTPS_InPortFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL},
+       {p_isdigit,     0,      A_NEXT,         TPS_InPort,             0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside the port digits; yields HOST (the port is part of the HOST token). */
+static TParserStateActionItem actionTPS_InPort[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_Base,              HOST,           NULL},
+       {p_isdigit,     0,      A_NEXT,         TPS_InPort,             0,              NULL},
+       {p_isstophost,  0,      A_BINGO|A_CLRALL,TPS_InURIStart,        HOST,           NULL},
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFURL,             0,              NULL},
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_Base,              HOST,           NULL}
+};
+
+/* Just after '-' inside a host name: a digit or ASCII letter is required. */
+static TParserStateActionItem actionTPS_InHostFirstAN[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHost,             0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHost,             0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside a host component of digits and ASCII letters; never yields a token itself. */
+static TParserStateActionItem actionTPS_InHost[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHost,             0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHost,             0,              NULL}, 
+       {p_iseqC,       '@',    A_PUSH,         TPS_InEmail,            0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHostFirstDomen,   0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHostFirstAN,      0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/*
+ * Just after '@': yields EMAIL if p_ishost recognises a host at the
+ * current position (p_ishost itself advances past that host); otherwise
+ * pops.
+ */
+static TParserStateActionItem actionTPS_InEmail[] = {
+       {p_ishost,      0,      A_BINGO|A_CLRALL, TPS_Base,             EMAIL,          NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* File-path states; a recognised path yields FILEPATH. */
+
+/*
+ * Start of a path component (entered e.g. after '/'): accepts an ASCII
+ * letter, digit, '.' or '_'; '?' may begin a URI query part instead.
+ */
+static TParserStateActionItem actionTPS_InFileFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '.',    A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '_',    A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '?',    A_PUSH,         TPS_InURIFirst,         0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+
+/* Inside a path component; yields FILEPATH at EOF or on any character not listed. */
+static TParserStateActionItem actionTPS_InFile[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               FILEPATH,       NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InFile,             0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InFileNext,         0,              NULL}, 
+       {p_iseqC,       '_',    A_NEXT,         TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '-',    A_NEXT,         TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '/',    A_PUSH,         TPS_InFileFirst,        0,              NULL}, 
+       {p_iseqC,       '?',    A_PUSH,         TPS_InURIFirst,         0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               FILEPATH,       NULL}
+};
+
+/* Just after a '.' inside a path component: a letter, digit or '_' must follow (a second '.' pops). */
+static TParserStateActionItem actionTPS_InFileNext[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {p_iseqC,       '_',    A_CLEAR,        TPS_InFile,             0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * First character after '?' (pushed from TPS_InFileFirst / TPS_InFile).
+ * EOF, a double quote or a single quote pops; any other p_isnotspace
+ * character discards the pushed position and continues in TPS_InURI;
+ * everything else pops.
+ */
+static TParserStateActionItem actionTPS_InURIFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '"',    A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '\'',   A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isnotspace,  0,      A_CLEAR,        TPS_InURI,              0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL},
+};
+static TParserStateActionItem actionTPS_InURIStart[] = { /* single default row: always advance into TPS_InURI */
+       {NULL,          0,      A_NEXT,         TPS_InURI,              0,              NULL}
+};
+/*
+ * URI body: p_isnotspace characters are consumed.  EOF, a double quote, a
+ * single quote or any other character emits a URI lexeme and returns to
+ * TPS_Base.  The quote rows precede p_isnotspace so that quotes end the URI.
+ */
+static TParserStateActionItem actionTPS_InURI[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               URI,            NULL}, 
+       {p_iseqC,       '"',    A_BINGO,        TPS_Base,               URI,            NULL}, 
+       {p_iseqC,       '\'',   A_BINGO,        TPS_Base,               URI,            NULL}, 
+       {p_isnotspace,  0,      A_NEXT,         TPS_InURI,              0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_Base,               URI,            NULL}
+};
+/*
+ * Full-URL attempt: when p_isURI accepts, a FURL lexeme is emitted (with
+ * the SpecialFURL handler run first), all pushed positions are dropped
+ * and the parser returns to TPS_Base; EOF or a failed test pops.
+ */
+static TParserStateActionItem actionTPS_InFURL[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isURI,       0,      A_BINGO|A_CLRALL,TPS_Base,              FURL,           SpecialFURL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Protocol tail, step 1: expects '/' and moves to TPS_InProtocolSecond; anything else pops. */
+static TParserStateActionItem actionTPS_InProtocolFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '/',    A_NEXT,         TPS_InProtocolSecond,   0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Protocol tail, step 2: expects a second '/' and moves to TPS_InProtocolEnd; anything else pops. */
+static TParserStateActionItem actionTPS_InProtocolSecond[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_iseqC,       '/',    A_NEXT,         TPS_InProtocolEnd,      0,              NULL},
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Protocol tail complete: unconditionally emit PROTOCOL, drop all pushed positions, return to TPS_Base. */
+static TParserStateActionItem actionTPS_InProtocolEnd[] = {
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_Base,              PROTOCOL,       NULL}
+};
+/*
+ * First character after '-' in a hyphenated word made of p_islatin
+ * characters so far (pushed from TPS_InHyphenLatWord).  The class of this
+ * character selects the word kind to continue with; EOF or any other
+ * character pops.
+ */
+static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenLatWord,    0,              NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValue,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, /* NOTE(review): same test as the row above, which matches first; looks unreachable - confirm */
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * Body of a hyphenated word that is all p_islatin so far.  A p_isnonlatin
+ * character or a digit downgrades to the mixed TPS_InHyphenUWord; '-'
+ * pushes and tries another part.  EOF or any other character emits
+ * LATHYPHENWORD (running SpecialHyphen), drops all pushed positions and
+ * switches to TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     LATHYPHENWORD,  SpecialHyphen}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenLatWord,    0,              NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenLatWordFirst,0,             NULL}, 
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     LATHYPHENWORD,  SpecialHyphen}
+};
+/*
+ * First character after '-' in a hyphenated word made of p_isnonlatin
+ * characters so far (pushed from TPS_InHyphenCyrWord); mirrors
+ * actionTPS_InHyphenLatWordFirst with the two letter classes swapped.
+ */
+static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenCyrWord,    0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValue,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, /* NOTE(review): same test as the row above, which matches first; looks unreachable - confirm */
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * Body of a hyphenated word that is all p_isnonlatin so far; emits
+ * CYRHYPHENWORD (running SpecialHyphen) on EOF or any unlisted character
+ * and switches to TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     CYRHYPHENWORD,  SpecialHyphen}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenCyrWord,    0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenCyrWordFirst,0,             NULL}, 
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     CYRHYPHENWORD,  SpecialHyphen}
+};
+/*
+ * First character after '-' in a mixed hyphenated word (pushed from
+ * TPS_InHyphenUWord, TPS_InHyphenValue or TPS_InHyphenValueExact).  The
+ * digit row comes first, so a digit goes to TPS_InHyphenValue and only
+ * the remaining p_isalnum characters go to TPS_InHyphenUWord.
+ */
+static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValue,      0,              NULL}, 
+       {p_isalnum,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * Body of a mixed hyphenated word: p_isalnum characters are consumed, '-'
+ * pushes and tries another part, EOF or anything else emits HYPHENWORD
+ * (running SpecialHyphen) and switches to TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenUWord[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}, 
+       {p_isalnum,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenUWordFirst,0,               NULL}, 
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}
+};
+/*
+ * First character after '.' inside a numeric part of a hyphenated word
+ * (pushed from TPS_InHyphenValue / TPS_InHyphenValueExact): a digit
+ * continues in TPS_InHyphenValueExact; EOF or anything else pops.
+ */
+static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValueExact, 0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * Numeric part of a hyphenated word.  Digits are consumed; '.' and '-'
+ * push and try a continuation; a p_isalpha character turns the part into
+ * a mixed word (TPS_InHyphenUWord).  EOF or anything else emits
+ * HYPHENWORD (running SpecialHyphen) and switches to TPS_InParseHyphen.
+ */
+static TParserStateActionItem actionTPS_InHyphenValue[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValue,      0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHyphenValueFirst, 0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenUWordFirst,0,               NULL}, 
+       {p_isalpha,     0,      A_NEXT,         TPS_InHyphenUWord,      0,              NULL}, 
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}
+};
+/*
+ * Numeric part after at least one '.': same as actionTPS_InHyphenValue
+ * except that there is no p_isalpha row, so a letter ends the lexeme here.
+ */
+static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
+       {p_isEOF,       0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenValueExact, 0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHyphenValueFirst, 0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InHyphenUWordFirst, 0,              NULL}, 
+       {NULL,          0,      A_BINGO|A_CLRALL,TPS_InParseHyphen,     HYPHENWORD,     SpecialHyphen}
+};
+/*
+ * State the parser is left in after a whole hyphenated word was emitted
+ * (the tostate of the *HYPHENWORD rows).  It dispatches on the class of
+ * the next character to emit the individual parts; EOF or an unlisted
+ * character re-runs the same character in TPS_Base without advancing.
+ * NOTE(review): presumably SpecialHyphen repositions the parser at the
+ * start of the word so the parts can be re-scanned - not visible here.
+ */
+static TParserStateActionItem actionTPS_InParseHyphen[] = {
+       {p_isEOF,       0,      A_RERUN,        TPS_Base,               0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenLatWordPart,0,              NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenCyrWordPart,0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUnsignedInt,0,              NULL}, 
+       {p_iseqC,       '-',    A_PUSH,         TPS_InParseHyphenHyphen,0,              NULL}, 
+       {NULL,          0,      A_RERUN,        TPS_Base,               0,              NULL}
+};
+/*
+ * Character after a '-' between parts (pushed from TPS_InParseHyphen):
+ * when it is p_isalnum, a SPACE lexeme is emitted, the pushed position is
+ * discarded and part parsing continues; otherwise pop.
+ */
+static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isalnum,     0,      A_BINGO|A_CLEAR,TPS_InParseHyphen,      SPACE,          NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/*
+ * Part consisting of p_isnonlatin characters: a p_islatin character or a
+ * digit downgrades to the mixed part state; the part is emitted as
+ * CYRPARTHYPHENWORD at EOF (to TPS_Base) or at any other character (back
+ * to TPS_InParseHyphen).
+ */
+static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               CYRPARTHYPHENWORD,NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenCyrWordPart,0,              NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      CYRPARTHYPHENWORD,NULL}
+};
+/*
+ * Part consisting of p_islatin characters: mirrors
+ * actionTPS_InHyphenCyrWordPart and emits LATPARTHYPHENWORD.
+ */
+static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               LATPARTHYPHENWORD,NULL}, 
+       {p_islatin,     0,      A_NEXT,         TPS_InHyphenLatWordPart,0,              NULL}, 
+       {p_isnonlatin,  0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      LATPARTHYPHENWORD,NULL}
+};
+/* Mixed part: consumes p_isalnum characters and emits PARTHYPHENWORD when they end. */
+static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               PARTHYPHENWORD, NULL}, 
+       {p_isalnum,     0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      PARTHYPHENWORD, NULL}
+};
+/*
+ * Part starting with digits: a p_isalpha character turns it into a mixed
+ * part, '.' pushes and tries a decimal fraction; otherwise the digits are
+ * emitted as UNSIGNEDINT.
+ */
+static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               UNSIGNEDINT,    NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHyphenUnsignedInt,0,              NULL}, 
+       {p_isalpha,     0,      A_NEXT,         TPS_InHyphenUWordPart,  0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHDecimalPartFirst,0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      UNSIGNEDINT,    NULL}
+};
+/* After '.' in a numeric part: a digit discards the pushed position and enters TPS_InHDecimalPart; else pop. */
+static TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InHDecimalPart,     0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Fraction digits: a further '.' pushes and tries a version number; otherwise emits DECIMAL. */
+static TParserStateActionItem actionTPS_InHDecimalPart[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               DECIMAL,        NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHDecimalPart,     0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHVersionPartFirst,0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      DECIMAL,        NULL}
+};
+/* After a second or later '.': a digit discards the pushed position and enters TPS_InHVersionPart; else pop. */
+static TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
+       {p_isEOF,       0,      A_POP,          TPS_Null,               0,              NULL}, 
+       {p_isdigit,     0,      A_CLEAR,        TPS_InHVersionPart,     0,              NULL}, 
+       {NULL,          0,      A_POP,          TPS_Null,               0,              NULL}
+};
+/* Version-number digits: each further '.' pushes and tries another component; otherwise emits VERSIONNUMBER. */
+static TParserStateActionItem actionTPS_InHVersionPart[] = {
+       {p_isEOF,       0,      A_BINGO,        TPS_Base,               VERSIONNUMBER,  NULL}, 
+       {p_isdigit,     0,      A_NEXT,         TPS_InHVersionPart,     0,              NULL}, 
+       {p_iseqC,       '.',    A_PUSH,         TPS_InHVersionPartFirst,0,              NULL}, 
+       {NULL,          0,      A_BINGO,        TPS_InParseHyphen,      VERSIONNUMBER,  NULL}
+};
+
+/* 
+ * Dispatch table mapping every parser state to its list of action rows.
+ *
+ * The order must be the same as in typedef enum {} TParserState:
+ * TParserGet() indexes this array directly with the state value and only
+ * asserts that the stored state matches.  The final TPS_Null entry has no
+ * action list.
+ */
+
+static const TParserStateAction Actions[] = {
+       { TPS_Base,                     actionTPS_Base }, 
+       { TPS_InUWord,                  actionTPS_InUWord },
+       { TPS_InLatWord,                actionTPS_InLatWord },
+       { TPS_InCyrWord,                actionTPS_InCyrWord },
+       { TPS_InUnsignedInt,            actionTPS_InUnsignedInt },
+       { TPS_InSignedIntFirst,         actionTPS_InSignedIntFirst },
+       { TPS_InSignedInt,              actionTPS_InSignedInt },
+       { TPS_InSpace,                  actionTPS_InSpace },    
+       { TPS_InUDecimalFirst,          actionTPS_InUDecimalFirst },    
+       { TPS_InUDecimal,               actionTPS_InUDecimal }, 
+       { TPS_InDecimalFirst,           actionTPS_InDecimalFirst },     
+       { TPS_InDecimal,                actionTPS_InDecimal },  
+       { TPS_InVersionFirst,           actionTPS_InVersionFirst },     
+       { TPS_InVersion,                actionTPS_InVersion },  
+       { TPS_InMantissaFirst,          actionTPS_InMantissaFirst },    
+       { TPS_InMantissaSign,           actionTPS_InMantissaSign },     
+       { TPS_InMantissa,               actionTPS_InMantissa }, 
+       { TPS_InHTMLEntityFirst,        actionTPS_InHTMLEntityFirst },  
+       { TPS_InHTMLEntity,             actionTPS_InHTMLEntity },       
+       { TPS_InHTMLEntityNumFirst,     actionTPS_InHTMLEntityNumFirst },       
+       { TPS_InHTMLEntityNum,          actionTPS_InHTMLEntityNum },    
+       { TPS_InHTMLEntityEnd,          actionTPS_InHTMLEntityEnd },    
+       { TPS_InTagFirst,               actionTPS_InTagFirst }, 
+       { TPS_InTagCloseFirst,          actionTPS_InTagCloseFirst },    
+       { TPS_InTag,                    actionTPS_InTag },      
+       { TPS_InTagEscapeK,             actionTPS_InTagEscapeK },       
+       { TPS_InTagEscapeKK,            actionTPS_InTagEscapeKK },      
+       { TPS_InTagBackSleshed,         actionTPS_InTagBackSleshed },
+       { TPS_InTagEnd,                 actionTPS_InTagEnd },   
+       { TPS_InCommentFirst,           actionTPS_InCommentFirst },     
+       { TPS_InCommentLast,            actionTPS_InCommentLast },      
+       { TPS_InComment,                actionTPS_InComment },  
+       { TPS_InCloseCommentFirst,      actionTPS_InCloseCommentFirst },        
+       { TPS_InCloseCommentLast,       actionTPS_InCloseCommentLast }, 
+       { TPS_InCommentEnd,             actionTPS_InCommentEnd },       
+       { TPS_InHostFirstDomen,         actionTPS_InHostFirstDomen },   
+       { TPS_InHostDomenSecond,        actionTPS_InHostDomenSecond },  
+       { TPS_InHostDomen,              actionTPS_InHostDomen },        
+       { TPS_InPortFirst,              actionTPS_InPortFirst },        
+       { TPS_InPort,                   actionTPS_InPort },     
+       { TPS_InHostFirstAN,            actionTPS_InHostFirstAN },      
+       { TPS_InHost,                   actionTPS_InHost },     
+       { TPS_InEmail,                  actionTPS_InEmail },    
+       { TPS_InFileFirst,              actionTPS_InFileFirst },        
+       { TPS_InFile,                   actionTPS_InFile },     
+       { TPS_InFileNext,               actionTPS_InFileNext }, 
+       { TPS_InURIFirst,               actionTPS_InURIFirst }, 
+       { TPS_InURIStart,               actionTPS_InURIStart }, 
+       { TPS_InURI,                    actionTPS_InURI },      
+       { TPS_InFURL,                   actionTPS_InFURL },     
+       { TPS_InProtocolFirst,          actionTPS_InProtocolFirst },    
+       { TPS_InProtocolSecond,         actionTPS_InProtocolSecond },   
+       { TPS_InProtocolEnd,            actionTPS_InProtocolEnd },      
+       { TPS_InHyphenLatWordFirst,     actionTPS_InHyphenLatWordFirst },       
+       { TPS_InHyphenLatWord,          actionTPS_InHyphenLatWord },    
+       { TPS_InHyphenCyrWordFirst,     actionTPS_InHyphenCyrWordFirst },       
+       { TPS_InHyphenCyrWord,          actionTPS_InHyphenCyrWord },    
+       { TPS_InHyphenUWordFirst,       actionTPS_InHyphenUWordFirst }, 
+       { TPS_InHyphenUWord,            actionTPS_InHyphenUWord },      
+       { TPS_InHyphenValueFirst,       actionTPS_InHyphenValueFirst }, 
+       { TPS_InHyphenValue,            actionTPS_InHyphenValue },      
+       { TPS_InHyphenValueExact,       actionTPS_InHyphenValueExact }, 
+       { TPS_InParseHyphen,            actionTPS_InParseHyphen },      
+       { TPS_InParseHyphenHyphen,      actionTPS_InParseHyphenHyphen },        
+       { TPS_InHyphenCyrWordPart,      actionTPS_InHyphenCyrWordPart },        
+       { TPS_InHyphenLatWordPart,      actionTPS_InHyphenLatWordPart },        
+       { TPS_InHyphenUWordPart,        actionTPS_InHyphenUWordPart },  
+       { TPS_InHyphenUnsignedInt,      actionTPS_InHyphenUnsignedInt },        
+       { TPS_InHDecimalPartFirst,      actionTPS_InHDecimalPartFirst },        
+       { TPS_InHDecimalPart,           actionTPS_InHDecimalPart },     
+       { TPS_InHVersionPartFirst,      actionTPS_InHVersionPartFirst },        
+       { TPS_InHVersionPart,           actionTPS_InHVersionPart },     
+       { TPS_Null,                     NULL }
+};
+
+
+/*
+ * TParserGet - extract the next lexeme from the parser's input string.
+ *
+ * Runs the state machine described by Actions[] from the current position
+ * until an action row carrying A_BINGO fires or the input is exhausted.
+ * On success the lexeme is described by prs->lexeme, prs->lenbytelexeme,
+ * prs->lencharlexeme and prs->type.
+ *
+ * Returns true when a lexeme was found, false when the position is already
+ * at the end of the string or the scan ended without an A_BINGO row.
+ */
+bool
+TParserGet( TParser *prs ) {
+       TParserStateActionItem *item=NULL;
+
+       /* must be checked before prs->state is dereferenced for the first time */
+       Assert( prs->state );
+
+       if ( prs->state->posbyte >= prs->lenstr ) 
+               return false;
+
+       prs->lexeme    = prs->str + prs->state->posbyte;
+       prs->state->pushedAtAction = NULL;
+
+       /* look at string; posbyte == lenstr is visited once more so EOF rows can fire */
+       while (prs->state->posbyte <= prs->lenstr) {
+               /* at end of input there is no current character */
+               if ( prs->state->posbyte == prs->lenstr ) 
+                       prs->state->charlen = 0;
+               else
+                       prs->state->charlen = ( prs->charmaxlen == 1 ) ? prs->charmaxlen : 
+                               pg_mblen( prs->str + prs->state->posbyte );
+
+               Assert( prs->state->posbyte + prs->state->charlen <= prs->lenstr ); 
+               Assert( prs->state->state >=TPS_Base && prs->state->state < TPS_Null );
+               Assert( Actions[ prs->state->state ].state == prs->state->state ); 
+
+               item = Actions[ prs->state->state ].action;
+               Assert(item!=NULL);
+
+               /* after a pop, resume the scan at the row that did the push */
+               if ( item < prs->state->pushedAtAction )
+                       item =  prs->state->pushedAtAction;
+
+               /*
+                * find action by character class; a row with a NULL isclass
+                * ends the list and acts as the default action
+                */
+               while( item->isclass ) {
+                       prs->c = item->c;
+                       if ( item->isclass(prs)!=0 ) {
+                               if ( item > prs->state->pushedAtAction ) /* remember: after pushing we were by false way */ 
+                                       break;
+                       } 
+                       item++;
+               }
+
+               prs->state->pushedAtAction = NULL;
+
+               /* call special handler if exists */
+               if ( item->special )
+                       item->special(prs);
+
+               /* BINGO, lexeme is found: publish its length/type and reset the counters */
+               if ( item->flags & A_BINGO ) {
+                       Assert( item->type>0 );
+                       prs->lenbytelexeme = prs->state->lenbytelexeme;
+                       prs->lencharlexeme = prs->state->lencharlexeme;
+                       prs->state->lenbytelexeme = prs->state->lencharlexeme = 0;
+                       prs->type = item->type;
+               } 
+
+               /* do various actions by flags; only the first matching one is applied */
+               if ( item->flags & A_POP ) {  /* pop stored state in stack */
+                       TParserPosition *ptr = prs->state->prev;
+                       pfree( prs->state );
+                       prs->state = ptr;
+                       Assert( prs->state );
+               } else if ( item->flags & A_PUSH ) { /* push (store) state in stack */ 
+                       prs->state->pushedAtAction = item; /* remember where we push */
+                       prs->state = newTParserPosition( prs->state );
+               } else if ( item->flags & A_CLEAR ) { /* clear previous pushed state */
+                       TParserPosition *ptr;
+                       Assert( prs->state->prev );
+                       ptr = prs->state->prev->prev;
+                       pfree( prs->state->prev );
+                       prs->state->prev = ptr;
+               } else if ( item->flags & A_CLRALL ) { /* clear all previous pushed state */
+                       TParserPosition *ptr;
+                       while( prs->state->prev ) {
+                               ptr = prs->state->prev->prev;
+                               pfree( prs->state->prev );
+                               prs->state->prev = ptr;
+                       }
+               } else if ( item->flags & A_MERGE ) { /* merge posinfo with current and pushed state */
+                       TParserPosition *ptr = prs->state;
+                       Assert( prs->state->prev );
+                       prs->state = prs->state->prev;
+
+                       prs->state->posbyte = ptr->posbyte;
+                       prs->state->poschar = ptr->poschar;
+                       prs->state->charlen = ptr->charlen;
+                       prs->state->lenbytelexeme = ptr->lenbytelexeme;
+                       prs->state->lencharlexeme = ptr->lencharlexeme;
+                       pfree(ptr);     
+               }
+
+               /* set new state if pointed */
+               if ( item->tostate != TPS_Null ) 
+                       prs->state->state = item->tostate;
+
+               /* stop on a found lexeme, or at end of input unless the row asks for a rerun */ 
+               if ( (item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN)==0 ) ) 
+                       break;
+
+               /* go to beginning of loop if we should rerun or we just restore state */
+               if ( item->flags & ( A_RERUN | A_POP ) )
+                       continue;
+       
+               /* move forward by one character, counting it into the lexeme */      
+               if ( prs->state->charlen ) {
+                       prs->state->posbyte += prs->state->charlen;
+                       prs->state->lenbytelexeme += prs->state->charlen;
+                       prs->state->poschar ++;
+                       prs->state->lencharlexeme ++;
+               }
+       } 
+
+       return (item && (item->flags & A_BINGO)) ? true : false;
+}
+
+
index 3f0e0cd6359ff66f4f91050a215703d4319c6ab9..ee5b3b7ab5471e6dc194647db98bcd3542977464 100644 (file)
 #ifndef __PARSER_H__
 #define __PARSER_H__
 
-extern char *token;
-extern int     tokenlen;
-int                    tsearch2_yylex(void);
-void           tsearch2_start_parse_str(char *, int);
-void           tsearch2_end_parse(void);
+#include <ctype.h>
+#include <limits.h>
+#include "ts_locale.h"
+/*
+ * States of the word parser's state machine.  The numeric value is used
+ * as an index into the Actions[] table by TParserGet(), so the order here
+ * must match the order of that table.
+ */
+typedef enum {
+       TPS_Base = 0,
+       TPS_InUWord,
+       TPS_InLatWord,
+       TPS_InCyrWord,
+       TPS_InUnsignedInt,
+       TPS_InSignedIntFirst,
+       TPS_InSignedInt,
+       TPS_InSpace,
+       TPS_InUDecimalFirst,
+       TPS_InUDecimal,
+       TPS_InDecimalFirst,
+       TPS_InDecimal,
+       TPS_InVersionFirst,
+       TPS_InVersion,
+       TPS_InMantissaFirst,
+       TPS_InMantissaSign,
+       TPS_InMantissa,
+       TPS_InHTMLEntityFirst,
+       TPS_InHTMLEntity,
+       TPS_InHTMLEntityNumFirst,
+       TPS_InHTMLEntityNum,
+       TPS_InHTMLEntityEnd,
+       TPS_InTagFirst,
+       TPS_InTagCloseFirst,
+       TPS_InTag,
+       TPS_InTagEscapeK,
+       TPS_InTagEscapeKK,
+       TPS_InTagBackSleshed,
+       TPS_InTagEnd,
+       TPS_InCommentFirst,
+       TPS_InCommentLast,
+       TPS_InComment,
+       TPS_InCloseCommentFirst,
+       TPS_InCloseCommentLast,
+       TPS_InCommentEnd,
+       TPS_InHostFirstDomen,
+       TPS_InHostDomenSecond,
+       TPS_InHostDomen,
+       TPS_InPortFirst,
+       TPS_InPort,
+       TPS_InHostFirstAN,
+       TPS_InHost,
+       TPS_InEmail,
+       TPS_InFileFirst,
+       TPS_InFile,
+       TPS_InFileNext,
+       TPS_InURIFirst,
+       TPS_InURIStart,
+       TPS_InURI,
+       TPS_InFURL,
+       TPS_InProtocolFirst,
+       TPS_InProtocolSecond,
+       TPS_InProtocolEnd,
+       TPS_InHyphenLatWordFirst,
+       TPS_InHyphenLatWord,
+       TPS_InHyphenCyrWordFirst,
+       TPS_InHyphenCyrWord,
+       TPS_InHyphenUWordFirst,
+       TPS_InHyphenUWord,
+       TPS_InHyphenValueFirst,
+       TPS_InHyphenValue,
+       TPS_InHyphenValueExact,
+       TPS_InParseHyphen,
+       TPS_InParseHyphenHyphen,
+       TPS_InHyphenCyrWordPart,
+       TPS_InHyphenLatWordPart,
+       TPS_InHyphenUWordPart,
+       TPS_InHyphenUnsignedInt,
+       TPS_InHDecimalPartFirst,
+       TPS_InHDecimalPart,
+       TPS_InHVersionPartFirst,
+       TPS_InHVersionPart,
+       TPS_Null  /* last state (fake value); as a tostate it means "keep the current state" */
+} TParserState;
+
+/* forward declaration */
+struct TParser;
+
+/* Callback types used by the rows of the state tables. */
+typedef int (*TParserCharTest)(struct TParser*);  /* any p_is* functions except p_iseq; nonzero means the row matches */
+typedef void (*TParserSpecial)(struct TParser*);  /* special handler for special cases... */
+/* One row of a state's action list; TParserGet() tries the rows in order. */
+typedef struct {
+        TParserCharTest isclass;  /* class test; NULL ends the list and makes this the default row */
+        char            c;        /* copied into TParser.c before isclass is called */
+        uint16          flags;    /* A_* action bits */
+        TParserState    tostate;  /* state to switch to, or TPS_Null to leave the state unchanged */
+        int             type;     /* lexeme type reported when flags contain A_BINGO */
+        TParserSpecial  special;  /* optional handler run when the row is selected, or NULL */
+} TParserStateActionItem;
+/* Entry of the Actions[] dispatch table: a state and its list of action rows. */
+typedef struct {
+        TParserState            state;    /* state this entry belongs to; asserted to equal its index */
+        TParserStateActionItem  *action;  /* first row of the state's action list */
+} TParserStateAction;
+/* Parser position; positions form a stack through prev (see A_PUSH / A_POP in TParserGet). */
+typedef struct TParserPosition {
+       int             posbyte; /* position of parser in bytes */
+       int             poschar; /* position of parser in characters */
+       int             charlen; /* length of current char */
+       int             lenbytelexeme; /* length in bytes of the lexeme collected so far */
+       int             lencharlexeme; /* length in characters of the lexeme collected so far */
+       TParserState    state; /* current state of the state machine */
+       struct TParserPosition  *prev; /* previously pushed position, or NULL */
+       int             flags;
+       TParserStateActionItem  *pushedAtAction; /* row that performed the last push from this position */
+} TParserPosition;
+/* Parser handle: input string, current state stack and the last lexeme found. */
+typedef struct TParser {
+       /* string and position information */
+       char            *str;  /* multibyte string */
+       int             lenstr; /* length of mbstring */
+       wchar_t         *wstr;  /* wide character string */ 
+       int             lenwstr; /* length of wstr */
+
+       /* State of parse */
+       int             charmaxlen; /* when 1, every character is taken as one byte and pg_mblen() is skipped */
+       bool            usewide;
+       TParserPosition *state; /* top of the position stack */
+       bool            ignore;
+       bool            wanthost;
+
+       /* argument for the class test being run: copied from the action row's c field */
+       char c;
+
+       /* out: filled in by TParserGet() */
+       char            *lexeme; /* start of the lexeme inside str */
+       int             lenbytelexeme; /* lexeme length in bytes */
+       int             lencharlexeme; /* lexeme length in characters */
+       int             type; /* lexeme type taken from the A_BINGO row */
+       
+} TParser;
+
+/* Parser entry points; TParserGet() returns true when a lexeme was produced and false otherwise. */
+TParser* TParserInit( char *, int );
+bool   TParserGet( TParser* );
+void   TParserClose( TParser* );
 
 #endif
diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l
deleted file mode 100644 (file)
index a7cb468..0000000
+++ /dev/null
@@ -1,346 +0,0 @@
-%{
-#include "postgres.h"
-
-#include "deflex.h"
-#include "parser.h"
-#include "common.h"
-
-/* Avoid exit() on fatal scanner errors */
-#undef fprintf
-#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)
-
-char *token = NULL;  /* pointer to token */
-int tokenlen;
-static char *s     = NULL;  /* to return WHOLE hyphenated-word */
-
-YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
-
-typedef struct {
-       int tlen;
-       int clen;
-       char *str;
-} TagStorage;
-
-static TagStorage ts={0,0,NULL};
-
-static void
-addTag(void)
-{
-       while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
-               ts.tlen*=2;
-               ts.str=realloc(ts.str,ts.tlen);
-               if (!ts.str)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-        }
-        memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
-        ts.clen+=tsearch2_yyleng;
-       ts.str[ts.clen]='\0';
-}
-
-static void
-startTag(void)
-{
-       if ( ts.str==NULL ) {
-               ts.tlen=tsearch2_yyleng+1;
-               ts.str=malloc(ts.tlen);
-               if (!ts.str)
-                       ereport(ERROR,
-                                (errcode(ERRCODE_OUT_OF_MEMORY),
-                                 errmsg("out of memory")));
-       }
-       ts.clen=0;
-       ts.str[0]='\0';
-       addTag();
-}
-
-%}
-
-%option 8bit
-%option never-interactive
-%option nodefault
-%option nounput
-%option noyywrap
-
-/* parser's state for parsing hyphenated-word */
-%x DELIM  
-/* parser's state for parsing URL*/
-%x URL  
-%x SERVER  
-
-/* parser's state for parsing TAGS */
-%x INTAG
-%x QINTAG
-%x INCOMMENT
-%x INSCRIPT
-
-/* cyrillic koi8 char */
-CYRALNUM       [0-9\200-\377]
-CYRALPHA       [\200-\377]
-ALPHA          [a-zA-Z\200-\377]
-ALNUM          [0-9a-zA-Z\200-\377]
-
-
-HOSTNAME       ([-_[:alnum:]]+\.)+[[:alpha:]]+
-URI            [-_[:alnum:]/%,\.;=&?#]+
-
-%%
-
-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
-
-<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
-       BEGIN INITIAL; 
-       addTag();
-       token = ts.str;
-       tokenlen = ts.clen;
-       return TAG;
-}
-
-"<!--" { BEGIN INCOMMENT; startTag(); }
-
-<INCOMMENT>"-->"       { 
-       BEGIN INITIAL;
-       addTag();
-       token = ts.str;
-       tokenlen = ts.clen;
-       return TAG;
-}
-
-
-"<"[\![:alpha:]]       { BEGIN INTAG; startTag(); }
-
-"</"[[:alpha:]]        { BEGIN INTAG; startTag(); }
-
-<INTAG>"\""    { BEGIN QINTAG; addTag(); }
-
-<QINTAG>"\\\"" { addTag(); }
-
-<QINTAG>"\""   { BEGIN INTAG; addTag(); }
-
-<INTAG>">"     { 
-       BEGIN INITIAL;
-       addTag();
-       token = ts.str;
-       tokenlen = ts.clen;
-       return TAG;
-}
-
-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }    
-
-\&(quot|amp|nbsp|lt|gt)\;   {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return HTMLENTITY;
-}
-
-\&\#[0-9][0-9]?[0-9]?\; {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return HTMLENTITY;
-}
-[-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return EMAIL; 
-}
-
-[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */   { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return SCIENTIFIC; 
-}
-
-[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return VERSIONNUMBER;
-}
-
-[+-]?[0-9]+\.[0-9]+ {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return DECIMAL;
-}
-
-[+-][0-9]+ { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return SIGNEDINT; 
-}
-
-<DELIM,INITIAL>[0-9]+ { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return UNSIGNEDINT; 
-}
-
-http"://"        { 
-       BEGIN URL; 
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return HTTP;
-}
-
-ftp"://"        { 
-       BEGIN URL; 
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return HTTP;
-}
-
-<URL,INITIAL>{HOSTNAME}[/:]{URI} { 
-       BEGIN SERVER;
-       if (s) { free(s); s=NULL; } 
-       s = strdup( tsearch2_yytext ); 
-       tokenlen = tsearch2_yyleng;
-       yyless( 0 ); 
-       token = s;
-       return FURL;
-}
-
-<SERVER,URL,INITIAL>{HOSTNAME} {
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return HOST;
-}
-
-<SERVER>[/:]{URI}      {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return URI;
-}
-
-[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return FILEPATH;
-}
-
-({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */        {
-       BEGIN DELIM;
-       if (s) { free(s); s=NULL; } 
-       s = strdup( tsearch2_yytext );
-       tokenlen = tsearch2_yyleng;
-       yyless( 0 );
-       token = s;
-       return CYRHYPHENWORD;
-}
-
-([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */      {
-        BEGIN DELIM;
-       if (s) { free(s); s=NULL; } 
-       s = strdup( tsearch2_yytext );
-       tokenlen = tsearch2_yyleng;
-       yyless( 0 );
-       token = s;
-       return LATHYPHENWORD;
-}
-
-({ALNUM}+-)+{ALNUM}+ /* composite-word */      {
-       BEGIN DELIM;
-       if (s) { free(s); s=NULL; } 
-       s = strdup( tsearch2_yytext );
-       tokenlen = tsearch2_yyleng;
-       yyless( 0 );
-       token = s;
-       return HYPHENWORD;
-}
-
-<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return VERSIONNUMBER;
-}
-
-<DELIM>\+?[0-9]+\.[0-9]+ {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return DECIMAL;
-}
-
-<DELIM>{CYRALPHA}+  /* one word in composite-word */   { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return CYRPARTHYPHENWORD; 
-}
-
-<DELIM>[[:alpha:]]+  /* one word in composite-word */  { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return LATPARTHYPHENWORD; 
-}
-
-<DELIM>{ALNUM}+  /* one word in composite-word */      { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return PARTHYPHENWORD; 
-}
-
-<DELIM>-  { 
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return SPACE;
-}
-
-<DELIM,SERVER,URL>.|\n /* return in basic state */     {
-       BEGIN INITIAL;
-       yyless( 0 );
-}
-
-{CYRALPHA}+ /* normal word */  { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return CYRWORD; 
-}
-
-[[:alpha:]]+ /* normal word */ { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return LATWORD; 
-}
-
-{ALNUM}+ /* normal word */     { 
-       token = tsearch2_yytext; 
-       tokenlen = tsearch2_yyleng;
-       return UWORD; 
-}
-
-[ \r\n\t]+ {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return SPACE;
-}
-
-. {
-       token = tsearch2_yytext;
-       tokenlen = tsearch2_yyleng;
-       return SPACE;
-} 
-
-%%
-
-/* clearing after parsing from string */
-void
-tsearch2_end_parse(void)
-{
-       if (s)
-       {
-               free(s);
-               s = NULL;
-       } 
-       tsearch2_yy_delete_buffer( buf );
-       buf = NULL;
-} 
-
-/* start parse from string */
-void
-tsearch2_start_parse_str(char* str, int limit)
-{
-       if (buf)
-               tsearch2_end_parse();
-       buf = tsearch2_yy_scan_bytes( str, limit );
-       tsearch2_yy_switch_to_buffer( buf );
-       BEGIN INITIAL;
-}
index 6686257887222aa2face214f4d3dfd24cb7a8190..897ff2795e27690f7f0885eb16d82840718fe826 100644 (file)
@@ -39,8 +39,7 @@ Datum         prsd_start(PG_FUNCTION_ARGS);
 Datum
 prsd_start(PG_FUNCTION_ARGS)
 {
-       tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
-       PG_RETURN_POINTER(NULL);
+       PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1))); /* return the TParser built from arguments 0 and 1 instead of NULL */
 }
 
 PG_FUNCTION_INFO_V1(prsd_getlexeme);
@@ -48,14 +47,17 @@ Datum               prsd_getlexeme(PG_FUNCTION_ARGS);
 Datum
 prsd_getlexeme(PG_FUNCTION_ARGS)
 {
-       /* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
+       TParser *p=(TParser*)PG_GETARG_POINTER(0); /* parser is now passed in as argument 0 */
         char      **t = (char **) PG_GETARG_POINTER(1);
         int                *tlen = (int *) PG_GETARG_POINTER(2);
-       int                     type = tsearch2_yylex();
 
-       *t = token;
-       *tlen = tokenlen;
-       PG_RETURN_INT32(type);
+       if ( !TParserGet(p) ) 
+               PG_RETURN_INT32(0); /* TParserGet() returned false: report type 0 and leave *t and *tlen untouched */
+
+       *t = p->lexeme; 
+       *tlen = p->lenbytelexeme; /* length comes from lenbytelexeme, not lencharlexeme */
+
+       PG_RETURN_INT32(p->type);
 }
 
 PG_FUNCTION_INFO_V1(prsd_end);
@@ -63,8 +65,8 @@ Datum         prsd_end(PG_FUNCTION_ARGS);
 Datum
 prsd_end(PG_FUNCTION_ARGS)
 {
-       /* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
-       tsearch2_end_parse();
+       TParser *p=(TParser*)PG_GETARG_POINTER(0);
+       TParserClose(p); /* close the parser received as argument 0 */
         PG_RETURN_VOID();
 }