]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/wparser_def.c
Implement SEMI and ANTI joins in the planner and executor. (Semijoins replace
[postgresql] / src / backend / tsearch / wparser_def.c
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  *              Default text search parser
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.15 2008/06/17 16:09:06 momjian Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23
24
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
27
28
29 /*
 * Output token categories: the numeric type codes a token can carry.
 *
 * The entries of tok_alias[] and lex_descr[] below are written in this same
 * numeric order, with slot 0 left empty (presumably so the arrays can be
 * indexed directly by these codes -- the lookup code is not in view here).
 */
30
31 #define ASCIIWORD               1
32 #define WORD_T                  2
33 #define NUMWORD                 3
34 #define EMAIL                   4
35 #define URL_T                   5
36 #define HOST                    6
37 #define SCIENTIFIC              7
38 #define VERSIONNUMBER   8
39 #define NUMPARTHWORD    9
40 #define PARTHWORD               10
41 #define ASCIIPARTHWORD  11
42 #define SPACE                   12
43 #define TAG_T                   13
44 #define PROTOCOL                14
45 #define NUMHWORD                15
46 #define ASCIIHWORD              16
47 #define HWORD                   17
48 #define URLPATH                 18
49 #define FILEPATH                19
50 #define DECIMAL                 20
51 #define SIGNEDINT               21
52 #define UNSIGNEDINT             22
53 #define XMLENTITY               23
54
/* Highest category number; currently the same value as XMLENTITY. */
55 #define LASTNUM                 23
56
/*
 * Short alias for each token category, listed in the numeric order of the
 * category #defines above.  Slot 0 is an empty placeholder because the
 * categories start at 1.
 */
57 static const char *const tok_alias[] = {
58         "",
59         "asciiword",
60         "word",
61         "numword",
62         "email",
63         "url",
64         "host",
65         "sfloat",
66         "version",
67         "hword_numpart",
68         "hword_part",
69         "hword_asciipart",
70         "blank",
71         "tag",
72         "protocol",
73         "numhword",
74         "asciihword",
75         "hword",
76         "url_path",
77         "file",
78         "float",
79         "int",
80         "uint",
81         "entity"
82 };
83
/*
 * Human-readable description of each token category, in the same order as
 * tok_alias[]; slot 0 is again an empty placeholder.
 */
84 static const char *const lex_descr[] = {
85         "",
86         "Word, all ASCII",
87         "Word, all letters",
88         "Word, letters and digits",
89         "Email address",
90         "URL",
91         "Host",
92         "Scientific notation",
93         "Version number",
94         "Hyphenated word part, letters and digits",
95         "Hyphenated word part, all letters",
96         "Hyphenated word part, all ASCII",
97         "Space symbols",
98         "XML tag",
99         "Protocol head",
100         "Hyphenated word, letters and digits",
101         "Hyphenated word, all ASCII",
102         "Hyphenated word, all letters",
103         "URL path",
104         "File or path name",
105         "Decimal notation",
106         "Signed integer",
107         "Unsigned integer",
108         "XML entity"
109 };
110
111
112 /*
 * Parser states.
 *
 * TPS_Base (0) is the state a freshly initialised parser starts in (see
 * TParserInit).  Each state has an action table named actionTPS_<state>.
 * TPS_Null is not a real state: it terminates the list and also appears as
 * the "tostate" of many action-table rows.
 */
113
114 typedef enum
115 {
116         TPS_Base = 0,
117         TPS_InNumWord,
118         TPS_InAsciiWord,
119         TPS_InWord,
120         TPS_InUnsignedInt,
121         TPS_InSignedIntFirst,
122         TPS_InSignedInt,
123         TPS_InSpace,
124         TPS_InUDecimalFirst,
125         TPS_InUDecimal,
126         TPS_InDecimalFirst,
127         TPS_InDecimal,
128         TPS_InVerVersion,
129         TPS_InSVerVersion,
130         TPS_InVersionFirst,
131         TPS_InVersion,
132         TPS_InMantissaFirst,
133         TPS_InMantissaSign,
134         TPS_InMantissa,
135         TPS_InXMLEntityFirst,
136         TPS_InXMLEntity,
137         TPS_InXMLEntityNumFirst,
138         TPS_InXMLEntityNum,
139         TPS_InXMLEntityHexNumFirst,
140         TPS_InXMLEntityHexNum,
141         TPS_InXMLEntityEnd,
142         TPS_InTagFirst,
143         TPS_InXMLBegin,
144         TPS_InTagCloseFirst,
145         TPS_InTagName,
146         TPS_InTagBeginEnd,
147         TPS_InTag,
148         TPS_InTagEscapeK,
149         TPS_InTagEscapeKK,
150         TPS_InTagBackSleshed,
151         TPS_InTagEnd,
152         TPS_InCommentFirst,
153         TPS_InCommentLast,
154         TPS_InComment,
155         TPS_InCloseCommentFirst,
156         TPS_InCloseCommentLast,
157         TPS_InCommentEnd,
158         TPS_InHostFirstDomain,
159         TPS_InHostDomainSecond,
160         TPS_InHostDomain,
161         TPS_InPortFirst,
162         TPS_InPort,
163         TPS_InHostFirstAN,
164         TPS_InHost,
165         TPS_InEmail,
166         TPS_InFileFirst,
167         TPS_InFileTwiddle,
168         TPS_InPathFirst,
169         TPS_InPathFirstFirst,
170         TPS_InPathSecond,
171         TPS_InFile,
172         TPS_InFileNext,
173         TPS_InURLPathFirst,
174         TPS_InURLPathStart,
175         TPS_InURLPath,
176         TPS_InFURL,
177         TPS_InProtocolFirst,
178         TPS_InProtocolSecond,
179         TPS_InProtocolEnd,
180         TPS_InHyphenAsciiWordFirst,
181         TPS_InHyphenAsciiWord,
182         TPS_InHyphenWordFirst,
183         TPS_InHyphenWord,
184         TPS_InHyphenNumWordFirst,
185         TPS_InHyphenNumWord,
186         TPS_InHyphenDigitLookahead,
187         TPS_InParseHyphen,
188         TPS_InParseHyphenHyphen,
189         TPS_InHyphenWordPart,
190         TPS_InHyphenAsciiWordPart,
191         TPS_InHyphenNumWordPart,
192         TPS_InHyphenUnsignedInt,
193         TPS_Null                                        /* last state (fake value, not a real state) */
194 } TParserState;
195
196 /* forward declaration, needed by the function-pointer typedefs below */
197 struct TParser;
198
/*
 * Character test applied to the parser's current position.  Every p_is*
 * function except p_iseq fits this signature (p_iseq takes an extra char
 * argument and is wrapped by p_iseqC / p_isneC).
 */
199 typedef int (*TParserCharTest) (struct TParser *);
/*
 * Signature of the special-case handlers (the Special* functions in this
 * file fit it).  Where they are invoked is not visible in this part of the
 * file.
 */
201 typedef void (*TParserSpecial) (struct TParser *);
203
/*
 * One row of a state's action table (see the actionTPS_* arrays).
 *
 * NOTE(review): the code that walks these rows (TParserGet) is not in view;
 * the field notes below describe how the rows are filled in, not how they
 * are interpreted.
 */
204 typedef struct
205 {
206         TParserCharTest isclass;        /* test to apply; NULL in the final row of each table shown here */
207         char            c;                      /* literal character given alongside p_iseqC rows, 0 otherwise
                                                 * (presumably copied to TParser.c before the test -- confirm) */
208         uint16          flags;          /* bitwise OR of the A_* flag bits */
209         TParserState tostate;   /* target state; TPS_Null in many rows */
210         int                     type;           /* token category (e.g. NUMWORD) on A_BINGO rows, else 0 */
211         TParserSpecial special; /* optional special-case handler, or NULL */
212 } TParserStateActionItem;
213
214 /*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it is the absence of every other flag rather than a
 * testable bit.  The remaining values are distinct single bits and are
 * OR-ed together in the tables (e.g. A_BINGO | A_CLRALL).  Their exact
 * effect is defined by the table interpreter, which is not in view here.
 */
215 #define A_NEXT          0x0000
216 #define A_BINGO         0x0001
217 #define A_POP           0x0002
218 #define A_PUSH          0x0004
219 #define A_RERUN         0x0008
220 #define A_CLEAR         0x0010
221 #define A_MERGE         0x0020
222 #define A_CLRALL        0x0040
223
/*
 * A snapshot of where the parser is in the input, plus the state it is in.
 * Positions are chained through "prev"; newTParserPosition() pushes a new
 * one and TParserClose() frees the whole chain.
 */
224 typedef struct TParserPosition
225 {
226         int                     posbyte;                /* position of parser in bytes */
227         int                     poschar;                /* position of parser in characters */
228         int                     charlen;                /* length of current char */
229         int                     lenbytetoken;   /* length of token-so-far in bytes */
230         int                     lenchartoken;   /* and in chars */
231         TParserState state;             /* parser state at this position */
232         struct TParserPosition *prev;   /* older position, or NULL at the bottom */
233         const TParserStateActionItem *pushedAtAction;   /* always reset to NULL by
                                                                                                 * newTParserPosition(); set
                                                                                                 * elsewhere (not in view) */
234 } TParserPosition;
235
/*
 * Complete state of one parser instance: the input, the current position
 * chain, a few mode flags, and the most recently produced token.
 */
236 typedef struct TParser
237 {
238         /* string and position information */
239         char       *str;                        /* multibyte string */
240         int                     lenstr;                 /* length of mbstring */
241 #ifdef USE_WIDE_UPPER_LOWER
242         wchar_t    *wstr;                       /* wide character string */
243         int                     lenwstr;                /* length of wstr, as returned by char2wchar() */
244 #endif
245
246         /* State of parse */
247         int                     charmaxlen;             /* pg_database_encoding_max_length() */
248         bool            usewide;                /* true when wstr is used for character tests */
249         TParserPosition *state;         /* newest position in the chain */
250         bool            ignore;                 /* toggled by SpecialTags() on script/style tags */
251         bool            wanthost;               /* set by SpecialFURL(), consumed by p_isstophost() */
252
253         /* silly char */
254         char            c;                              /* character compared by p_iseqC() / p_isneC() */
255
256         /* out */
257         char       *token;
258         int                     lenbytetoken;
259         int                     lenchartoken;
260         int                     type;
261 } TParser;
262
263
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
266
267
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
270 {
271         TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
272
273         if (prev)
274                 memcpy(res, prev, sizeof(TParserPosition));
275         else
276                 memset(res, 0, sizeof(TParserPosition));
277
278         res->prev = prev;
279
280         res->pushedAtAction = NULL;
281
282         return res;
283 }
284
285 static TParser *
286 TParserInit(char *str, int len)
287 {
288         TParser    *prs = (TParser *) palloc0(sizeof(TParser));
289
290         prs->charmaxlen = pg_database_encoding_max_length();
291         prs->str = str;
292         prs->lenstr = len;
293
294 #ifdef USE_WIDE_UPPER_LOWER
295
296         /*
297          * Use wide char code only when max encoding length > 1.
298          */
299         if (prs->charmaxlen > 1)
300         {
301                 prs->usewide = true;
302                 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303                 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304                                                                   prs->str, prs->lenstr);
305         }
306         else
307 #endif
308                 prs->usewide = false;
309
310         prs->state = newTParserPosition(NULL);
311         prs->state->state = TPS_Base;
312
313 #ifdef WPARSER_TRACE
314         fprintf(stderr, "parsing \"%.*s\"\n", len, str);
315 #endif
316
317         return prs;
318 }
319
320 static void
321 TParserClose(TParser *prs)
322 {
323         while (prs->state)
324         {
325                 TParserPosition *ptr = prs->state->prev;
326
327                 pfree(prs->state);
328                 prs->state = ptr;
329         }
330
331 #ifdef USE_WIDE_UPPER_LOWER
332         if (prs->wstr)
333                 pfree(prs->wstr);
334 #endif
335
336         pfree(prs);
337 }
338
339 /*
340  * Character-type support functions, equivalent to the is* macros but
341  * working with any encoding and locale. Note that with a multibyte
342  * encoding and the C locale, the isw* functions may fail or give wrong
343  * results. Note also that the combination of a multibyte encoding and
344  * the C locale is often used for Asian languages.
345  */
346
347 #ifdef USE_WIDE_UPPER_LOWER
348
/*
 * p_iswhat(type) defines p_is<type>() and p_isnot<type>(), which classify
 * the character at the parser's current position with the matching
 * is<type>() / isw<type>() function.
 *
 * Wide path, C locale: the isw* functions are unreliable there, so only
 * ASCII code points are handed to is<type>().  Any code point above 0x7f
 * is reported as not belonging to the class; masking it down to its low
 * byte (as was done previously) would classify it as whatever unrelated
 * ASCII character shares that byte.  The hand-written p_isalnum() and
 * p_isalpha() guard non-ASCII code points the same way, but answer 1.
 *
 * Wide path, other locales: the wide character is converted to wint_t by
 * value; reinterpreting the wchar_t pointer as a wint_t pointer would read
 * the wrong number of bytes wherever the two types differ in size.
 *
 * Returns nonzero when the character is in the class, zero otherwise.
 */
#define p_iswhat(type)                                                                  \
static int                                                                              \
p_is##type(TParser *prs) {                                                              \
        Assert( prs->state );                                                           \
        if ( prs->usewide )                                                             \
        {                                                                               \
                wchar_t wc = *( prs->wstr + prs->state->poschar );                      \
                                                                                        \
                if ( lc_ctype_is_c() )                                                  \
                {                                                                       \
                        unsigned int c = (unsigned int) wc;                             \
                                                                                        \
                        if ( c > 0x7f )                                                 \
                                return 0;                                               \
                        return is##type( c );                                           \
                }                                                                       \
                                                                                        \
                return isw##type( (wint_t) wc );                                        \
        }                                                                               \
                                                                                        \
        return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );         \
}                                                                                       \
                                                                                        \
static int                                                                              \
p_isnot##type(TParser *prs) {                                                           \
        return !p_is##type(prs);                                                        \
}
368
369 static int
370 p_isalnum(TParser *prs)
371 {
372         Assert(prs->state);
373
374         if (prs->usewide)
375         {
376                 if (lc_ctype_is_c())
377                 {
378                         unsigned int c = *(prs->wstr + prs->state->poschar);
379
380                         /*
381                          * any non-ascii symbol with multibyte encoding with C-locale is
382                          * an alpha character
383                          */
384                         if (c > 0x7f)
385                                 return 1;
386
387                         return isalnum(0xff & c);
388                 }
389
390                 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
391         }
392
393         return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
394 }
395 static int
396 p_isnotalnum(TParser *prs)
397 {
398         return !p_isalnum(prs);
399 }
400
401 static int
402 p_isalpha(TParser *prs)
403 {
404         Assert(prs->state);
405
406         if (prs->usewide)
407         {
408                 if (lc_ctype_is_c())
409                 {
410                         unsigned int c = *(prs->wstr + prs->state->poschar);
411
412                         /*
413                          * any non-ascii symbol with multibyte encoding with C-locale is
414                          * an alpha character
415                          */
416                         if (c > 0x7f)
417                                 return 1;
418
419                         return isalpha(0xff & c);
420                 }
421
422                 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
423         }
424
425         return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
426 }
427
428 static int
429 p_isnotalpha(TParser *prs)
430 {
431         return !p_isalpha(prs);
432 }
433
434 /* p_iseq should be used only for ascii symbols */
435
436 static int
437 p_iseq(TParser *prs, char c)
438 {
439         Assert(prs->state);
440         return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
441 }
442 #else                                                   /* USE_WIDE_UPPER_LOWER */
443
/*
 * Byte-wise variant of p_iswhat(type): defines p_is<type>(), which hands the
 * current byte (as unsigned char) to is<type>(), and p_isnot<type>(), its
 * logical negation.
 */
#define p_iswhat(type)                                                          \
static int                                                                      \
p_is##type(TParser *prs)                                                        \
{                                                                               \
        Assert(prs->state);                                                     \
        return is##type((unsigned char) prs->str[prs->state->posbyte]);         \
}                                                                               \
                                                                                \
static int                                                                      \
p_isnot##type(TParser *prs)                                                     \
{                                                                               \
        return p_is##type(prs) ? 0 : 1;                                         \
}
455
456
457 static int
458 p_iseq(TParser *prs, char c)
459 {
460         Assert(prs->state);
461         return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
462 }
463
/* Byte-wise build only: alnum/alpha come from the generic macro as well. */
464 p_iswhat(alnum)
465 p_iswhat(alpha)
466 #endif   /* USE_WIDE_UPPER_LOWER */
467
/*
 * Classes generated in both builds.  Each line defines a p_is<class>() and
 * a p_isnot<class>() function.
 */
468 p_iswhat(digit)
469 p_iswhat(lower)
470 p_iswhat(print)
471 p_iswhat(punct)
472 p_iswhat(space)
473 p_iswhat(upper)
474 p_iswhat(xdigit)
475
476 static int
477 p_isEOF(TParser *prs)
478 {
479         Assert(prs->state);
480         return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
481 }
482
483 static int
484 p_iseqC(TParser *prs)
485 {
486         return p_iseq(prs, prs->c);
487 }
488
489 static int
490 p_isneC(TParser *prs)
491 {
492         return !p_iseq(prs, prs->c);
493 }
494
495 static int
496 p_isascii(TParser *prs)
497 {
498         return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
499 }
500
501 static int
502 p_isasclet(TParser *prs)
503 {
504         return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
505 }
506
507
508 /* deliberately suppress unused-function complaints for the above */
/*
 * References every generated character-test function once so that none of
 * them is reported as unused.  Exists only for the compiler's benefit: the
 * NULL arguments would not survive an actual call.
 */
void            _make_compiler_happy(void);

void
_make_compiler_happy(void)
{
        (void) p_isalnum(NULL);
        (void) p_isnotalnum(NULL);
        (void) p_isalpha(NULL);
        (void) p_isnotalpha(NULL);
        (void) p_isdigit(NULL);
        (void) p_isnotdigit(NULL);
        (void) p_islower(NULL);
        (void) p_isnotlower(NULL);
        (void) p_isprint(NULL);
        (void) p_isnotprint(NULL);
        (void) p_ispunct(NULL);
        (void) p_isnotpunct(NULL);
        (void) p_isspace(NULL);
        (void) p_isnotspace(NULL);
        (void) p_isupper(NULL);
        (void) p_isnotupper(NULL);
        (void) p_isxdigit(NULL);
        (void) p_isnotxdigit(NULL);
        (void) p_isEOF(NULL);
        (void) p_iseqC(NULL);
        (void) p_isneC(NULL);
}
535
536
537 static void
538 SpecialTags(TParser *prs)
539 {
540         switch (prs->state->lenchartoken)
541         {
542                 case 8:                 /* </script */
543                         if (pg_strncasecmp(prs->token, "</script", 8) == 0)
544                                 prs->ignore = false;
545                         break;
546                 case 7:                 /* <script || </style */
547                         if (pg_strncasecmp(prs->token, "</style", 7) == 0)
548                                 prs->ignore = false;
549                         else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
550                                 prs->ignore = true;
551                         break;
552                 case 6:                 /* <style */
553                         if (pg_strncasecmp(prs->token, "<style", 6) == 0)
554                                 prs->ignore = true;
555                         break;
556                 default:
557                         break;
558         }
559 }
560
561 static void
562 SpecialFURL(TParser *prs)
563 {
564         prs->wanthost = true;
565         prs->state->posbyte -= prs->state->lenbytetoken;
566         prs->state->poschar -= prs->state->lenchartoken;
567 }
568
569 static void
570 SpecialHyphen(TParser *prs)
571 {
572         prs->state->posbyte -= prs->state->lenbytetoken;
573         prs->state->poschar -= prs->state->lenchartoken;
574 }
575
576 static void
577 SpecialVerVersion(TParser *prs)
578 {
579         prs->state->posbyte -= prs->state->lenbytetoken;
580         prs->state->poschar -= prs->state->lenchartoken;
581         prs->state->lenbytetoken = 0;
582         prs->state->lenchartoken = 0;
583 }
584
585 static int
586 p_isstophost(TParser *prs)
587 {
588         if (prs->wanthost)
589         {
590                 prs->wanthost = false;
591                 return 1;
592         }
593         return 0;
594 }
595
596 static int
597 p_isignore(TParser *prs)
598 {
599         return (prs->ignore) ? 1 : 0;
600 }
601
602 static int
603 p_ishost(TParser *prs)
604 {
605         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
606         int                     res = 0;
607
608         if (TParserGet(tmpprs) && tmpprs->type == HOST)
609         {
610                 prs->state->posbyte += tmpprs->lenbytetoken;
611                 prs->state->poschar += tmpprs->lenchartoken;
612                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613                 prs->state->lenchartoken += tmpprs->lenchartoken;
614                 prs->state->charlen = tmpprs->state->charlen;
615                 res = 1;
616         }
617         TParserClose(tmpprs);
618
619         return res;
620 }
621
622 static int
623 p_isURLPath(TParser *prs)
624 {
625         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
626         int                     res = 0;
627
628         tmpprs->state = newTParserPosition(tmpprs->state);
629         tmpprs->state->state = TPS_InFileFirst;
630
631         if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
632         {
633                 prs->state->posbyte += tmpprs->lenbytetoken;
634                 prs->state->poschar += tmpprs->lenchartoken;
635                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636                 prs->state->lenchartoken += tmpprs->lenchartoken;
637                 prs->state->charlen = tmpprs->state->charlen;
638                 res = 1;
639         }
640         TParserClose(tmpprs);
641
642         return res;
643 }
644
645 /*
646  * Table of state/action of parser
647  */
648
/*
 * Rows for TPS_Base, the state a new parser starts in.  Row order matters:
 * e.g. the '<' row precedes the p_isignore row, and p_isasclet precedes the
 * broader p_isalpha.  The final row has no test and targets TPS_InSpace.
 */
649 static const TParserStateActionItem actionTPS_Base[] = {
650         {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651         {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652         {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653         {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654         {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655         {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656         {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657         {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658         {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659         {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661         {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662         {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
663 };
664
665
/*
 * Rows for TPS_InNumWord.  Alphanumerics keep the state; '@', '/', '.' and
 * '-' are A_PUSH rows into the email, file and hyphenated-numword states;
 * EOF and the test-less final row are A_BINGO rows of type NUMWORD.
 */
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667         {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668         {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672         {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673         {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
674 };
675
/*
 * Rows for TPS_InAsciiWord.  Note the deliberate duplicates: '.', '-' and
 * p_isdigit each appear twice with different targets (presumably tried in
 * order as alternatives by the table interpreter -- not visible here).
 * EOF and the test-less final row are A_BINGO rows of type ASCIIWORD.
 */
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677         {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682         {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684         {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687         {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688         {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689         {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
690 };
691
/*
 * Rows for TPS_InWord.  A digit moves to TPS_InNumWord, '-' is an A_PUSH
 * row into TPS_InHyphenWordFirst; EOF and the test-less final row are
 * A_BINGO rows of type WORD_T.
 */
692 static const TParserStateActionItem actionTPS_InWord[] = {
693         {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694         {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695         {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696         {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697         {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
698 };
699
/*
 * Rows for TPS_InUnsignedInt.  '.' appears twice (host domain, then
 * unsigned decimal); 'e'/'E' are A_PUSH rows into TPS_InMantissaFirst;
 * letters lead to the host or numword states.  EOF and the test-less final
 * row are A_BINGO rows of type UNSIGNEDINT.
 */
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701         {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704         {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707         {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708         {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710         {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
711 };
712
/*
 * Rows for TPS_InSignedIntFirst (entered from TPS_Base on '-' or '+').
 * Only a digit continues, to TPS_InSignedInt with A_CLEAR; EOF and anything
 * else are A_POP rows.
 */
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715         {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716         {NULL, 0, A_POP, TPS_Null, 0, NULL}
717 };
718
/*
 * Rows for TPS_InSignedInt.  '.' and 'e'/'E' are A_PUSH rows into the
 * decimal and mantissa states; EOF and the test-less final row are A_BINGO
 * rows of type SIGNEDINT.
 */
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720         {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722         {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725         {NULL, 0, A_BINGO, TPS_Base, SPACE == SPACE ? SIGNEDINT : SIGNEDINT, NULL}
726 };
727
/*
 * Rows for TPS_InSpace.  '<', '-', '+', '&' and '/' are A_BINGO rows of
 * type SPACE (they are the characters TPS_Base has dedicated rows for);
 * while p_isignore holds, or the character is not alphanumeric, the state
 * is kept.  EOF and the test-less final row are also SPACE A_BINGO rows.
 */
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729         {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730         {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731         {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732         {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733         {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734         {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735         {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736         {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737         {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
738 };
739
/*
 * Rows for TPS_InUDecimalFirst (entered after '.' in an unsigned integer).
 * A digit is an A_CLEAR row into TPS_InUDecimal; everything else is A_POP.
 */
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742         {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743         {NULL, 0, A_POP, TPS_Null, 0, NULL}
744 };
745
/*
 * Rows for TPS_InUDecimal.  A further '.' is an A_PUSH row into
 * TPS_InVersionFirst, 'e'/'E' into TPS_InMantissaFirst; EOF and the
 * test-less final row are A_BINGO rows of type DECIMAL.
 */
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747         {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748         {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749         {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752         {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
753 };
754
/*
 * Rows for TPS_InDecimalFirst (entered after '.' in a signed integer).
 * A digit is an A_CLEAR row into TPS_InDecimal; everything else is A_POP.
 */
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757         {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758         {NULL, 0, A_POP, TPS_Null, 0, NULL}
759 };
760
/*
 * Rows for TPS_InDecimal (the signed counterpart of TPS_InUDecimal).  A
 * further '.' is an A_PUSH row into TPS_InVerVersion rather than
 * TPS_InVersionFirst; EOF and the test-less final row are A_BINGO rows of
 * type DECIMAL.
 */
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762         {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763         {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764         {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767         {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
768 };
769
/*
 * Rows for TPS_InVerVersion.  A digit is an A_RERUN row into
 * TPS_InSVerVersion that also names SpecialVerVersion (which rewinds the
 * position by the token length and zeroes the token lengths); everything
 * else is A_POP.
 */
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772         {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773         {NULL, 0, A_POP, TPS_Null, 0, NULL}
774 };
775
/*
 * Rows for TPS_InSVerVersion.  A digit is an A_BINGO | A_CLRALL row of type
 * SPACE that targets TPS_InUnsignedInt (not TPS_Base); EOF is A_POP and the
 * test-less final row is a plain A_NEXT.
 */
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778         {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779         {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
780 };
781
782
/*
 * Rows for TPS_InVersionFirst (entered after a '.' inside a decimal or a
 * version).  A digit is an A_CLEAR row into TPS_InVersion; everything else
 * is A_POP.
 */
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785         {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786         {NULL, 0, A_POP, TPS_Null, 0, NULL}
787 };
788
/*
 * Rows for TPS_InVersion.  Each further '.' is an A_PUSH row back into
 * TPS_InVersionFirst; EOF and the test-less final row are A_BINGO rows of
 * type VERSIONNUMBER.
 */
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790         {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791         {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792         {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793         {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
794 };
795
/*
 * Rows for TPS_InMantissaFirst (entered after 'e'/'E' in a number).  A
 * digit is an A_CLEAR row into TPS_InMantissa, a sign moves to
 * TPS_InMantissaSign; everything else is A_POP.
 */
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798         {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799         {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800         {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801         {NULL, 0, A_POP, TPS_Null, 0, NULL}
802 };
803
/*
 * Rows for TPS_InMantissaSign (after the exponent's sign).  A digit is an
 * A_CLEAR row into TPS_InMantissa; everything else is A_POP.
 */
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806         {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807         {NULL, 0, A_POP, TPS_Null, 0, NULL}
808 };
809
/*
 * Rows for TPS_InMantissa.  Digits keep the state; EOF and the test-less
 * final row are A_BINGO rows of type SCIENTIFIC.
 */
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811         {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812         {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813         {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
814 };
815
/*
 * Rows for TPS_InXMLEntityFirst (entered from TPS_Base on '&').  '#' leads
 * to the numeric-entity states; an ASCII letter, ':' or '_' leads to
 * TPS_InXMLEntity; everything else is A_POP.
 */
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818         {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819         {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821         {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822         {NULL, 0, A_POP, TPS_Null, 0, NULL}
823 };
824
/*
 * Rows for TPS_InXMLEntity.  Alphanumerics and ':', '_', '.', '-' keep the
 * state; ';' moves to TPS_InXMLEntityEnd; everything else is A_POP.
 */
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827         {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829         {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830         {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
831         {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
832         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
833         {NULL, 0, A_POP, TPS_Null, 0, NULL}
834 };
835
/*
 * Rows for TPS_InXMLEntityNumFirst (after "&#").  'x'/'X' lead to the
 * hexadecimal states, a digit to TPS_InXMLEntityNum; everything else is
 * A_POP.
 */
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
837         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
838         {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
839         {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
840         {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
841         {NULL, 0, A_POP, TPS_Null, 0, NULL}
842 };
843
/*
 * Rows for TPS_InXMLEntityHexNumFirst (after "&#x").  A hex digit moves to
 * TPS_InXMLEntityHexNum; everything else is A_POP.
 */
844 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
845         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
846         {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
847         {NULL, 0, A_POP, TPS_Null, 0, NULL}
848 };
849
/*
 * TPS_InXMLEntityNum: remaining characters of a decimal numeric entity.
 * p_isdigit matches are consumed, ';' moves to TPS_InXMLEntityEnd; EOF or
 * anything else pops.
 */
850 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
851         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
852         {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
853         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
854         {NULL, 0, A_POP, TPS_Null, 0, NULL}
855 };
856
/*
 * TPS_InXMLEntityHexNum: remaining characters of a hexadecimal numeric
 * entity.  p_isxdigit matches are consumed, ';' moves to
 * TPS_InXMLEntityEnd; EOF or anything else pops.
 */
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
858         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
859         {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
860         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
861         {NULL, 0, A_POP, TPS_Null, 0, NULL}
862 };
863
/*
 * TPS_InXMLEntityEnd: reached after the terminating ';' was consumed.  The
 * single unconditional rule emits an XMLENTITY token, discards the most
 * recently pushed position (A_CLEAR) and returns to TPS_Base.
 */
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
865         {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
866 };
867
/*
 * TPS_InTagFirst: first character of a candidate tag.  Every accepting rule
 * uses A_PUSH, so a failure further on pops back here and the search
 * continues with the rule following the one that pushed.
 *   '/'                     -> closing tag   (TPS_InTagCloseFirst)
 *   '!'                     -> comment/DOCTYPE (TPS_InCommentFirst)
 *   '?'                     -> XML prolog    (TPS_InXMLBegin)
 *   p_isasclet, ':' or '_'  -> tag name      (TPS_InTagName)
 * EOF or anything else pops.
 */
868 static const TParserStateActionItem actionTPS_InTagFirst[] = {
869         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
870         {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
871         {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
872         {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
873         {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
874         {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
875         {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
876         {NULL, 0, A_POP, TPS_Null, 0, NULL}
877 };
878
/*
 * TPS_InXMLBegin: character following '?' at the start of a tag.  Only 'x'
 * is accepted, after which the rest is handled by the generic TPS_InTag
 * state; EOF or anything else pops.
 */
879 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
880         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
881         /* <?xml ... */
882         /* XXX do we want states for the 'm' and 'l'?  Right now this accepts <?xZ */
883         {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
884         {NULL, 0, A_POP, TPS_Null, 0, NULL}
885 };
886
/*
 * TPS_InTagCloseFirst: character following '/' at the start of a tag.  A
 * p_isasclet match begins the tag name; EOF or anything else pops.
 */
887 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
888         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
889         {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
890         {NULL, 0, A_POP, TPS_Null, 0, NULL}
891 };
892
/*
 * TPS_InTagName: inside the tag name.  Name characters (p_isalnum, ':',
 * '_', '.', '-') are consumed without a state change (tostate TPS_Null).
 * '/' expects an immediate '>' (TPS_InTagBeginEnd), '>' finishes the tag,
 * and a p_isspace match moves to the attribute area (TPS_InTag).  The '>'
 * and whitespace rules also invoke the SpecialTags handler.  EOF or
 * anything else pops.
 */
893 static const TParserStateActionItem actionTPS_InTagName[] = {
894         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
895         /* <br/> case */
896         {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
897         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
898         {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
899         {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
900         {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
901         {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
902         {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
903         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
904         {NULL, 0, A_POP, TPS_Null, 0, NULL}
905 };
906
/*
 * TPS_InTagBeginEnd: after the '/' of a self-closing tag.  Only '>' is
 * accepted (moving to TPS_InTagEnd); EOF or anything else pops.
 */
907 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
908         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
909         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
910         {NULL, 0, A_POP, TPS_Null, 0, NULL}
911 };
912
/*
 * TPS_InTag: inside a tag, past its name.  '>' finishes the tag (calling
 * SpecialTags); a single or double quote enters the matching quoted-value
 * state.  The listed unquoted characters, and p_isspace matches (which also
 * call SpecialTags), are consumed without a state change.  EOF or any
 * character not listed pops, i.e. the text is not treated as a tag.
 */
913 static const TParserStateActionItem actionTPS_InTag[] = {
914         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
915         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
916         {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
917         {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
918         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
919         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
920         {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
921         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
922         {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
923         {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
924         {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
925         {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
926         {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
927         {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
928         {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
929         {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
930         {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
931         {NULL, 0, A_POP, TPS_Null, 0, NULL}
932 };
933
/*
 * TPS_InTagEscapeK: inside a single-quoted value within a tag.  A backslash
 * pushes TPS_InTagBackSleshed to handle the following character, the
 * closing quote returns to TPS_InTag, and every other character is
 * consumed.  EOF pops.
 */
934 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
935         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
936         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
937         {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
938         {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
939 };
940
/*
 * TPS_InTagEscapeKK: inside a double-quoted value within a tag.  Same
 * shape as TPS_InTagEscapeK, with '"' as the closing quote.
 */
941 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
942         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
943         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
944         {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
945         {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
946 };
947
/*
 * TPS_InTagBackSleshed: character following a backslash inside a quoted tag
 * value.  Any character is accepted: A_MERGE copies this position into the
 * pushed one and drops the current entry, so parsing continues in the
 * quoted-value state that pushed (tostate TPS_Null leaves that state
 * unchanged) and the escaped character is then consumed.  EOF pops.
 */
948 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
949         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
950         {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
951 };
952
/*
 * TPS_InTagEnd: reached after the closing '>' was consumed.  Unconditionally
 * emits a TAG_T token, discards every pushed position (A_CLRALL) and
 * returns to TPS_Base.
 */
953 static const TParserStateActionItem actionTPS_InTagEnd[] = {
954         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
955 };
956
/*
 * TPS_InCommentFirst: character following '!' at the start of a tag.  '-'
 * begins a comment opener; 'D' or 'd' is taken as the start of a DOCTYPE
 * declaration and handed to the generic TPS_InTag state (the remaining
 * letters are not checked here).  EOF or anything else pops.
 */
957 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
958         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
959         {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
960         /* <!DOCTYPE ...> */
961         {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
962         {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
963         {NULL, 0, A_POP, TPS_Null, 0, NULL}
964 };
965
/*
 * TPS_InCommentLast: expects the second '-' of the comment opener; EOF or
 * anything else pops.
 */
966 static const TParserStateActionItem actionTPS_InCommentLast[] = {
967         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
968         {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
969         {NULL, 0, A_POP, TPS_Null, 0, NULL}
970 };
971
/*
 * TPS_InComment: comment body.  '-' may start the closing sequence
 * (TPS_InCloseCommentFirst); every other character is consumed in place.
 * EOF pops, so an unterminated comment is not recognized as a tag.
 */
972 static const TParserStateActionItem actionTPS_InComment[] = {
973         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
974         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
975         {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
976 };
977
/*
 * TPS_InCloseCommentFirst: one '-' seen inside the comment body.  A second
 * '-' advances to TPS_InCloseCommentLast; any other character returns to
 * the comment body.  EOF pops.
 */
978 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
979         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
980         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
981         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
982 };
983
/*
 * TPS_InCloseCommentLast: "--" seen inside the comment body.  Further '-'
 * characters keep this state, '>' completes the comment
 * (TPS_InCommentEnd), and any other character returns to the comment body.
 * EOF pops.
 */
984 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
985         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
986         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
987         {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
988         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
989 };
990
/*
 * TPS_InCommentEnd: reached after the comment's closing '>' was consumed.
 * Unconditionally emits the comment as a TAG_T token, discards every pushed
 * position and returns to TPS_Base.
 */
991 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
992         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
993 };
994
/*
 * TPS_InHostFirstDomain: first character of a host-name label following a
 * '.'.  A p_isasclet match may start a final domain label
 * (TPS_InHostDomainSecond); a p_isdigit match continues as an ordinary
 * host label (TPS_InHost).  EOF or anything else pops.
 */
995 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
996         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
997         {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
998         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
999         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1000 };
1001
/*
 * TPS_InHostDomainSecond: second character of a candidate domain label.  A
 * second p_isasclet match makes the label long enough to end a host name
 * (TPS_InHostDomain).  Digit, '-', '.' and '@' each push an alternative
 * continuation.  EOF or anything else pops, so a one-letter label cannot
 * end a host name.
 */
1002 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1003         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1004         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1005         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1006         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1007         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1008         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1009         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1010 };
1011
/*
 * TPS_InHostDomain: inside a domain label that is allowed to end the host
 * name.  At EOF, or on input matched by no earlier rule, a HOST token is
 * emitted.  Digit, ':', '-', '.', '@' and '/' each push a longer
 * alternative; if that alternative fails and pops, TParserGet resumes the
 * search with the rule following the one that pushed.
 */
1012 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1013         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1014         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1015         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1016         {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1017         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1018         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1019         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
             /*
              * Not dead code: on a fresh scan the A_PUSH digit rule above always
              * wins, so this rule is reached only when resuming after that pushed
              * attempt popped.  In that case give up instead of emitting HOST.
              */
1020         {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1021         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1022         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1023         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1024 };
1025
/*
 * TPS_InPortFirst: character following the ':' after a host name.  At least
 * one p_isdigit match is required; EOF or anything else pops.
 */
1026 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1027         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1028         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1029         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1030 };
1031
/*
 * TPS_InPort: remaining digits of a port number.  The host (including the
 * port) is emitted as a HOST token at EOF, when p_isstophost matches (the
 * parser then continues in TPS_InURLPathStart), or on any unmatched input.
 * '/' first pushes TPS_InFURL to try for a full URL; if that attempt pops,
 * the final catch-all rule emits HOST.
 */
1032 static const TParserStateActionItem actionTPS_InPort[] = {
1033         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1034         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1035         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1036         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1037         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1038 };
1039
/*
 * TPS_InHostFirstAN: character following a '-' inside a host name.  It must
 * match p_isdigit or p_isasclet, after which TPS_InHost continues; EOF or
 * anything else pops.
 */
1040 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1041         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1042         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1043         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1044         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1045 };
1046
/*
 * TPS_InHost: inside a host-name label that cannot itself end the host
 * name (no rule here emits a token).  p_isdigit and p_isasclet matches are
 * consumed; '@', '.' and '-' push the corresponding follow-up state.  EOF
 * or anything else pops.
 */
1047 static const TParserStateActionItem actionTPS_InHost[] = {
1048         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1049         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1050         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1051         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1052         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1053         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1054         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1055 };
1056
/*
 * TPS_InEmail: entered after an '@'.  If the p_ishost test succeeds, an
 * EMAIL token is emitted, all pushed positions are discarded and the parser
 * returns to TPS_Base; otherwise pop.
 * NOTE(review): p_ishost is defined outside this section; presumably it
 * checks that a complete host name follows -- confirm there.
 */
1057 static const TParserStateActionItem actionTPS_InEmail[] = {
1058         {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1059         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1060 };
1061
/*
 * TPS_InFileFirst: first character of a file-path component.  p_isasclet
 * and p_isdigit matches and '_' start a name (TPS_InFile); '.' goes to
 * TPS_InPathFirst; '?' pushes TPS_InURLPathFirst and '~' pushes
 * TPS_InFileTwiddle.  EOF or anything else pops.
 */
1062 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1063         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1064         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1065         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1066         {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1067         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1068         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1069         {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1070         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1071 };
1072
/*
 * TPS_InFileTwiddle: character following a '~' in a file path.  A name
 * character (p_isasclet, p_isdigit, '_') continues in TPS_InFile; '/'
 * starts the next component (TPS_InFileFirst).  EOF or anything else pops.
 */
1073 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1074         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1075         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1076         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1077         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1078         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1079         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1080 };
1081
/*
 * TPS_InPathFirst: character following a leading '.' of a path component.
 * A name character (p_isasclet, p_isdigit, '_') continues in TPS_InFile, a
 * second '.' goes to TPS_InPathSecond, and '/' starts the next component.
 * EOF or anything else pops.
 */
1082 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1083         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1084         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1085         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1086         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1087         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1088         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1089         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1090 };
1091
/*
 * TPS_InPathFirstFirst: like TPS_InPathFirst but without the name-character
 * rules: only '.' (to TPS_InPathSecond) or '/' (to TPS_InFileFirst) is
 * accepted.  EOF or anything else pops.
 */
1092 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1093         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1094         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1095         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1096         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1097 };
1098
/*
 * TPS_InPathSecond: reached after two consecutive '.' characters.  At EOF
 * or on a p_isspace match a FILEPATH token is emitted.  For '/', the first
 * rule pushes TPS_InFileFirst to try for a longer path; the second '/' rule
 * is reached only when that attempt pops (TParserGet resumes after the
 * pushing rule) and then emits what was scanned so far as FILEPATH.
 * Anything else pops.
 */
1099 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1100         {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1101         {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1102         {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1103         {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1104         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1105 };
1106
/*
 * TPS_InFile: inside a file-name component.  Name characters (p_isasclet,
 * p_isdigit, '_', '-') are consumed.  '.', '/' and '?' each push a state
 * that tries to extend the path; if the extension fails and pops, the
 * search resumes after the pushing rule and normally ends at the final
 * rule.  At EOF, or on unmatched input, a FILEPATH token is emitted.
 */
1107 static const TParserStateActionItem actionTPS_InFile[] = {
1108         {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1109         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1110         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1111         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1112         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1113         {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1114         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1115         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1116         {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1117 };
1118
/*
 * TPS_InFileNext: character following a '.' inside a file name.  If it is
 * a name character (p_isasclet, p_isdigit, '_') the '.' is accepted:
 * A_CLEAR discards the position pushed before the '.', and scanning goes on
 * in TPS_InFile.  EOF or anything else pops back to before the '.'.
 */
1119 static const TParserStateActionItem actionTPS_InFileNext[] = {
1120         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1121         {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1122         {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1123         {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1124         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1125 };
1126
/*
 * TPS_InURLPathFirst: first character after the character that pushed this
 * state ('?' in the file-path tables).  EOF, a double quote or a single
 * quote pops; a p_isnotspace match commits to the URL path (A_CLEAR
 * discards the pushed position) and continues in TPS_InURLPath; anything
 * else pops.
 */
1127 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1128         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1129         {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1130         {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1131         {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1132         {NULL, 0, A_POP, TPS_Null, 0, NULL},
1133 };
1134
/*
 * TPS_InURLPathStart: entered right after a HOST token was emitted because
 * p_isstophost matched.  The single unconditional rule consumes the current
 * character and continues in TPS_InURLPath.
 */
1135 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1136         {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1137 };
1138
/*
 * TPS_InURLPath: body of a URL path.  Characters are consumed while
 * p_isnotspace matches.  A URLPATH token is emitted at EOF, at a double or
 * single quote, or on any other input; the terminating character is not
 * part of the token.
 */
1139 static const TParserStateActionItem actionTPS_InURLPath[] = {
1140         {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1141         {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1142         {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1143         {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1144         {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1145 };
1146
/*
 * TPS_InFURL: entered after a '/' following a host (or host:port).  If the
 * p_isURLPath test succeeds, the SpecialFURL handler is called, a URL_T
 * token is emitted, all pushed positions are discarded and the parser
 * returns to TPS_Base.  EOF or a failed test pops.
 * NOTE(review): p_isURLPath and SpecialFURL are defined outside this
 * section; their exact effect on the token length should be checked there.
 */
1147 static const TParserStateActionItem actionTPS_InFURL[] = {
1148         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1149         {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1150         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1151 };
1152
/*
 * TPS_InProtocolFirst: expects the first '/' of a protocol separator; EOF
 * or anything else pops.
 */
1153 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1154         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155         {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1156         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1157 };
1158
/*
 * TPS_InProtocolSecond: expects the second '/' of a protocol separator; EOF
 * or anything else pops.
 */
1159 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1160         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1161         {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1162         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1163 };
1164
/*
 * TPS_InProtocolEnd: reached after both slashes were consumed.
 * Unconditionally emits a PROTOCOL token, discards every pushed position
 * and returns to TPS_Base.
 */
1165 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1166         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1167 };
1168
/*
 * TPS_InHyphenAsciiWordFirst: first character after a '-' when all parts so
 * far matched p_isasclet.  The class of this character selects how the
 * compound continues: p_isasclet keeps it an ASCII hyphenated word,
 * p_isalpha (tested second, so it only sees characters p_isasclet
 * rejected) makes it a general hyphenated word, and p_isdigit goes to the
 * digit lookahead state.  EOF or anything else pops.
 */
1169 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1170         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1171         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1172         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1173         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1174         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1175 };
1176
/*
 * TPS_InHyphenAsciiWord: inside a part of an ASCII hyphenated word.  At
 * EOF or on unmatched input the whole compound is emitted as ASCIIHWORD:
 * SpecialHyphen is called, all pushed positions are discarded, and the next
 * state is TPS_InParseHyphen, which emits the individual parts.  A '-'
 * pushes TPS_InHyphenAsciiWordFirst to try for another part; other
 * character classes change the compound's kind.
 */
1177 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1178         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1179         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1180         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1181         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1182         {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1183         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1184 };
1185
/*
 * TPS_InHyphenWordFirst: first character after a '-' in a general
 * hyphenated word.  A p_isalpha match continues the word; a p_isdigit match
 * goes to the digit lookahead state.  EOF or anything else pops.
 */
1186 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1187         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1188         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1189         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1190         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1191 };
1192
/*
 * TPS_InHyphenWord: inside a part of a general hyphenated word.  At EOF or
 * on unmatched input the compound is emitted as HWORD (SpecialHyphen
 * called, pushed positions discarded) and TPS_InParseHyphen then emits the
 * parts.  A p_isdigit match turns it into a numeric hyphenated word; '-'
 * pushes TPS_InHyphenWordFirst to try for another part.
 */
1193 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1194         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1195         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1196         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1197         {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1198         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1199 };
1200
/*
 * TPS_InHyphenNumWordFirst: first character after a '-' in a numeric
 * hyphenated word.  A p_isalpha match continues the word; a p_isdigit match
 * goes to the digit lookahead state.  EOF or anything else pops.
 */
1201 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1202         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1203         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1204         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1205         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1206 };
1207
/*
 * TPS_InHyphenNumWord: inside a part of a numeric hyphenated word.
 * p_isalnum matches are consumed and '-' pushes TPS_InHyphenNumWordFirst to
 * try for another part.  At EOF or on unmatched input the compound is
 * emitted as NUMHWORD (SpecialHyphen called, pushed positions discarded)
 * and TPS_InParseHyphen then emits the parts.
 */
1208 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1209         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1210         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1211         {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1212         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1213 };
1214
/*
 * TPS_InHyphenDigitLookahead: a part after '-' that so far consists only of
 * p_isdigit matches.  Further digits keep this state; a p_isalpha match
 * accepts the part and continues in TPS_InHyphenNumWord.  EOF or anything
 * else pops, so a part made up of digits alone is not accepted here.
 */
1215 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1216         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1217         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1218         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1219         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1220 };
1221
/*
 * TPS_InParseHyphen: entered after a whole hyphenated word was emitted, to
 * hand out its parts one by one.  The class of the first character selects
 * the part state; a digit or '-' pushes a lookahead state.  At EOF or on
 * any other input, A_RERUN switches to TPS_Base and re-evaluates the same
 * position there without consuming anything.
 */
1222 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1223         {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1224         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1225         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1226         {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1227         {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1228         {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1229 };
1230
/*
 * TPS_InParseHyphenHyphen: character following a '-' between parts.  If it
 * matches p_isalnum, the text scanned so far (the hyphen) is emitted as a
 * SPACE token, the pushed position is discarded, and part parsing resumes
 * in TPS_InParseHyphen at this character (a BINGO rule does not consume the
 * character it matched).  EOF or anything else pops.
 */
1231 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1232         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233         {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1234         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236
/*
 * TPS_InHyphenWordPart: inside one part of a hyphenated word that contains
 * a non-ASCII letter.  p_isalpha matches are consumed; a p_isdigit match
 * turns the part into a numeric one.  The part is emitted as PARTHWORD at
 * EOF (next state TPS_Base) or on any other input (next state
 * TPS_InParseHyphen, to continue with the following part).
 */
1237 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1238         {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1239         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1240         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1241         {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1242 };
1243
/*
 * TPS_InHyphenAsciiWordPart: inside one part of a hyphenated word that so
 * far matched only p_isasclet.  A p_isalpha match (tested after
 * p_isasclet) or a p_isdigit match changes the kind of part.  The part is
 * emitted as ASCIIPARTHWORD at EOF (next state TPS_Base) or on any other
 * input (next state TPS_InParseHyphen).
 */
1244 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1245         {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1246         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1247         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1248         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1249         {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1250 };
1251
/*
 * TPS_InHyphenNumWordPart: inside one part of a hyphenated word that
 * contains a digit.  p_isalnum matches are consumed; the part is emitted as
 * NUMPARTHWORD at EOF (next state TPS_Base) or on any other input (next
 * state TPS_InParseHyphen).
 */
1252 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1253         {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1254         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1255         {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1256 };
1257
/*
 * TPS_InHyphenUnsignedInt: lookahead pushed by TPS_InParseHyphen when a
 * part starts with a digit.  Further p_isdigit matches keep this state.  A
 * p_isalpha match commits: A_CLEAR discards the pushed position and the
 * part continues in TPS_InHyphenNumWordPart.  EOF or anything else pops, so
 * an all-digit part is not emitted from this state.
 */
1258 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1259         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1261         {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1262         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 };
1264
1265
1266 /*
1267  * main table of per-state parser actions
      *
      * One TParserStateAction describes one parser state: a pointer to the
      * state's rule table plus the state's own enum value.  TParserGet uses the
      * stored enum value only to Assert that the Actions[] entry it indexed is
      * the one belonging to the current state.  With WPARSER_TRACE defined,
      * the state's name is kept as well for the trace output.
1268  */
1269 typedef struct
1270 {
1271         const TParserStateActionItem *action;           /* the actual state info */
1272         TParserState state;                     /* only for Assert crosscheck */
1273 #ifdef WPARSER_TRACE
1274         const char *state_name;         /* only for debug printout */
1275 #endif
1276 } TParserStateAction;
1277
/*
 * TPARSERSTATEACTION(state) builds the initializer for one
 * TParserStateAction entry.  CppConcat(action,state) names the rule table
 * for the state (for example actionTPS_Base for TPS_Base), so every state
 * listed in Actions[] must have a table called action<state>.  In trace
 * builds CppAsString(state) additionally supplies the state's name as a
 * string for the debug printout.
 */
1278 #ifdef WPARSER_TRACE
1279 #define TPARSERSTATEACTION(state) \
1280         { CppConcat(action,state), state, CppAsString(state) }
1281 #else
1282 #define TPARSERSTATEACTION(state) \
1283         { CppConcat(action,state), state }
1284 #endif
1285
1286 /*
1287  * order must be the same as in typedef enum {} TParserState!!
      *
      * TParserGet indexes this array directly with the current state value
      * (Actions[prs->state->state]) and Asserts that the entry's stored state
      * equals the index, so a misplaced entry is caught only in assert-enabled
      * builds.  TPS_Null has no entry here; TParserGet Asserts that the
      * current state is below TPS_Null.
1288  */
1289
1290 static const TParserStateAction Actions[] = {
1291         TPARSERSTATEACTION(TPS_Base),
1292         TPARSERSTATEACTION(TPS_InNumWord),
1293         TPARSERSTATEACTION(TPS_InAsciiWord),
1294         TPARSERSTATEACTION(TPS_InWord),
1295         TPARSERSTATEACTION(TPS_InUnsignedInt),
1296         TPARSERSTATEACTION(TPS_InSignedIntFirst),
1297         TPARSERSTATEACTION(TPS_InSignedInt),
1298         TPARSERSTATEACTION(TPS_InSpace),
1299         TPARSERSTATEACTION(TPS_InUDecimalFirst),
1300         TPARSERSTATEACTION(TPS_InUDecimal),
1301         TPARSERSTATEACTION(TPS_InDecimalFirst),
1302         TPARSERSTATEACTION(TPS_InDecimal),
1303         TPARSERSTATEACTION(TPS_InVerVersion),
1304         TPARSERSTATEACTION(TPS_InSVerVersion),
1305         TPARSERSTATEACTION(TPS_InVersionFirst),
1306         TPARSERSTATEACTION(TPS_InVersion),
1307         TPARSERSTATEACTION(TPS_InMantissaFirst),
1308         TPARSERSTATEACTION(TPS_InMantissaSign),
1309         TPARSERSTATEACTION(TPS_InMantissa),
1310         TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1311         TPARSERSTATEACTION(TPS_InXMLEntity),
1312         TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1313         TPARSERSTATEACTION(TPS_InXMLEntityNum),
1314         TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1315         TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1316         TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1317         TPARSERSTATEACTION(TPS_InTagFirst),
1318         TPARSERSTATEACTION(TPS_InXMLBegin),
1319         TPARSERSTATEACTION(TPS_InTagCloseFirst),
1320         TPARSERSTATEACTION(TPS_InTagName),
1321         TPARSERSTATEACTION(TPS_InTagBeginEnd),
1322         TPARSERSTATEACTION(TPS_InTag),
1323         TPARSERSTATEACTION(TPS_InTagEscapeK),
1324         TPARSERSTATEACTION(TPS_InTagEscapeKK),
1325         TPARSERSTATEACTION(TPS_InTagBackSleshed),
1326         TPARSERSTATEACTION(TPS_InTagEnd),
1327         TPARSERSTATEACTION(TPS_InCommentFirst),
1328         TPARSERSTATEACTION(TPS_InCommentLast),
1329         TPARSERSTATEACTION(TPS_InComment),
1330         TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1331         TPARSERSTATEACTION(TPS_InCloseCommentLast),
1332         TPARSERSTATEACTION(TPS_InCommentEnd),
1333         TPARSERSTATEACTION(TPS_InHostFirstDomain),
1334         TPARSERSTATEACTION(TPS_InHostDomainSecond),
1335         TPARSERSTATEACTION(TPS_InHostDomain),
1336         TPARSERSTATEACTION(TPS_InPortFirst),
1337         TPARSERSTATEACTION(TPS_InPort),
1338         TPARSERSTATEACTION(TPS_InHostFirstAN),
1339         TPARSERSTATEACTION(TPS_InHost),
1340         TPARSERSTATEACTION(TPS_InEmail),
1341         TPARSERSTATEACTION(TPS_InFileFirst),
1342         TPARSERSTATEACTION(TPS_InFileTwiddle),
1343         TPARSERSTATEACTION(TPS_InPathFirst),
1344         TPARSERSTATEACTION(TPS_InPathFirstFirst),
1345         TPARSERSTATEACTION(TPS_InPathSecond),
1346         TPARSERSTATEACTION(TPS_InFile),
1347         TPARSERSTATEACTION(TPS_InFileNext),
1348         TPARSERSTATEACTION(TPS_InURLPathFirst),
1349         TPARSERSTATEACTION(TPS_InURLPathStart),
1350         TPARSERSTATEACTION(TPS_InURLPath),
1351         TPARSERSTATEACTION(TPS_InFURL),
1352         TPARSERSTATEACTION(TPS_InProtocolFirst),
1353         TPARSERSTATEACTION(TPS_InProtocolSecond),
1354         TPARSERSTATEACTION(TPS_InProtocolEnd),
1355         TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1356         TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1357         TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1358         TPARSERSTATEACTION(TPS_InHyphenWord),
1359         TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1360         TPARSERSTATEACTION(TPS_InHyphenNumWord),
1361         TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1362         TPARSERSTATEACTION(TPS_InParseHyphen),
1363         TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1364         TPARSERSTATEACTION(TPS_InHyphenWordPart),
1365         TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1366         TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1367         TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1368 };
1369
1370
/*
 * TParserGet
 *      Run the state machine from the current position until one token has
 *      been recognized.
 *
 * Each pass of the outer loop looks at the character at
 * prs->state->posbyte (charlen 0 stands for end of input), picks the first
 * rule of the current state's table whose isclass test succeeds -- the
 * last rule of every table has a NULL test and therefore always applies --
 * calls the rule's special handler if it has one, and then acts on the
 * rule's flags:
 *
 *   A_BINGO   token complete: its lengths and type are copied into prs
 *   A_POP     drop the current position and return to the pushed one;
 *             rule search there resumes after the rule that pushed
 *   A_PUSH    remember the matching rule in the current position and
 *             continue in a new position from newTParserPosition()
 *   A_CLEAR   discard the most recently pushed position
 *   A_CLRALL  discard all pushed positions
 *   A_MERGE   copy the current position data into the pushed position and
 *             drop the current one
 *   A_RERUN   re-evaluate the same character in the new state
 *
 * POP, PUSH, CLEAR, CLRALL and MERGE are tested in that order in one
 * if/else chain, so at most one of them takes effect per rule.  A rule
 * with none of BINGO, RERUN or POP set (for example a plain A_NEXT rule)
 * consumes the current character.  A BINGO rule leaves the loop before the
 * advance step, so the character it matched is not part of the token.
 *
 * Returns true if a token was found; it is then described by prs->token,
 * prs->lenbytetoken, prs->lenchartoken and prs->type.  Returns false if
 * the input was already exhausted on entry or the loop ended without a
 * BINGO rule.
 */
1371 static bool
1372 TParserGet(TParser *prs)
1373 {
1374         const TParserStateActionItem *item = NULL;
1375
1376         Assert(prs->state);
1377
             /* nothing left to scan */
1378         if (prs->state->posbyte >= prs->lenstr)
1379                 return false;
1380
             /* the token starts at the current position */
1381         prs->token = prs->str + prs->state->posbyte;
1382         prs->state->pushedAtAction = NULL;
1383
1384         /* look at string */
             /* posbyte == lenstr is let through once so EOF rules can fire */
1385         while (prs->state->posbyte <= prs->lenstr)
1386         {
1387                 if (prs->state->posbyte == prs->lenstr)
1388                         prs->state->charlen = 0;
1389                 else
1390                         prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1391                                 pg_mblen(prs->str + prs->state->posbyte);
1392
1393                 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1394                 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1395                 Assert(Actions[prs->state->state].state == prs->state->state);
1396
1397                 if (prs->state->pushedAtAction)
1398                 {
1399                         /* After a POP, pick up at the next test */
1400                         item = prs->state->pushedAtAction + 1;
1401                         prs->state->pushedAtAction = NULL;
1402                 }
1403                 else
1404                 {
                             /* normal case: start with the first rule of the state */
1405                         item = Actions[prs->state->state].action;
1406                         Assert(item != NULL);
1407                 }
1408
1409                 /* find action by character class */
                     /* prs->c carries the rule's character argument to the test */
1410                 while (item->isclass)
1411                 {
1412                         prs->c = item->c;
1413                         if (item->isclass(prs) != 0)
1414                                 break;
1415                         item++;
1416                 }
1417
1418 #ifdef WPARSER_TRACE
1419                 {
1420                         TParserPosition *ptr;
1421
1422                         fprintf(stderr, "state ");
1423                         /* indent according to stack depth */
1424                         for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1425                                 fprintf(stderr, "  ");
1426                         fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1427                         if (prs->state->posbyte < prs->lenstr)
1428                                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1429                         else
1430                                 fprintf(stderr, "at EOF");
1431                         fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1432                                         (int) (item - Actions[prs->state->state].action),
1433                                         (item->flags & A_BINGO) ? " BINGO" : "",
1434                                         (item->flags & A_POP) ? " POP" : "",
1435                                         (item->flags & A_PUSH) ? " PUSH" : "",
1436                                         (item->flags & A_RERUN) ? " RERUN" : "",
1437                                         (item->flags & A_CLEAR) ? " CLEAR" : "",
1438                                         (item->flags & A_MERGE) ? " MERGE" : "",
1439                                         (item->flags & A_CLRALL) ? " CLRALL" : "",
1440                                         (item->tostate != TPS_Null) ? " tostate " : "",
1441                                         (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1442                                         (item->type > 0) ? " type " : "",
1443                                         tok_alias[item->type]);
1444                 }
1445 #endif
1446
1447                 /* call special handler if exists */
1448                 if (item->special)
1449                         item->special(prs);
1450
1451                 /* BINGO, token is found */
1452                 if (item->flags & A_BINGO)
1453                 {
1454                         Assert(item->type > 0);
                             /* publish the token and reset the running lengths */
1455                         prs->lenbytetoken = prs->state->lenbytetoken;
1456                         prs->lenchartoken = prs->state->lenchartoken;
1457                         prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1458                         prs->type = item->type;
1459                 }
1460
1461                 /* do various actions by flags */
1462                 if (item->flags & A_POP)
1463                 {                                               /* pop stored state in stack */
1464                         TParserPosition *ptr = prs->state->prev;
1465
1466                         pfree(prs->state);
1467                         prs->state = ptr;
1468                         Assert(prs->state);
1469                 }
1470                 else if (item->flags & A_PUSH)
1471                 {                                               /* push (store) state in stack */
1472                         prs->state->pushedAtAction = item;      /* remember where we push */
1473                         prs->state = newTParserPosition(prs->state);
1474                 }
1475                 else if (item->flags & A_CLEAR)
1476                 {                                               /* clear previous pushed state */
1477                         TParserPosition *ptr;
1478
1479                         Assert(prs->state->prev);
1480                         ptr = prs->state->prev->prev;
1481                         pfree(prs->state->prev);
1482                         prs->state->prev = ptr;
1483                 }
1484                 else if (item->flags & A_CLRALL)
1485                 {                                               /* clear all previous pushed state */
1486                         TParserPosition *ptr;
1487
1488                         while (prs->state->prev)
1489                         {
1490                                 ptr = prs->state->prev->prev;
1491                                 pfree(prs->state->prev);
1492                                 prs->state->prev = ptr;
1493                         }
1494                 }
1495                 else if (item->flags & A_MERGE)
1496                 {                                               /* merge posinfo with current and pushed state */
1497                         TParserPosition *ptr = prs->state;
1498
1499                         Assert(prs->state->prev);
1500                         prs->state = prs->state->prev;
1501
1502                         prs->state->posbyte = ptr->posbyte;
1503                         prs->state->poschar = ptr->poschar;
1504                         prs->state->charlen = ptr->charlen;
1505                         prs->state->lenbytetoken = ptr->lenbytetoken;
1506                         prs->state->lenchartoken = ptr->lenchartoken;
1507                         pfree(ptr);
1508                 }
1509
1510                 /* set new state if pointed */
                     /* (applies to whichever position is current after the step above) */
1511                 if (item->tostate != TPS_Null)
1512                         prs->state->state = item->tostate;
1513
1514                 /* check for go away */
                     /* stop on a token, or at end of input unless a rerun was requested */
1515                 if ((item->flags & A_BINGO) ||
1516                         (prs->state->posbyte >= prs->lenstr &&
1517                          (item->flags & A_RERUN) == 0))
1518                         break;
1519
1520                 /* go to beginning of loop if we should rerun or we just restore state */
1521                 if (item->flags & (A_RERUN | A_POP))
1522                         continue;
1523
1524                 /* move forward */
                     /* consume one character; token lengths grow with it */
1525                 if (prs->state->charlen)
1526                 {
1527                         prs->state->posbyte += prs->state->charlen;
1528                         prs->state->lenbytetoken += prs->state->charlen;
1529                         prs->state->poschar++;
1530                         prs->state->lenchartoken++;
1531                 }
1532         }
1533
1534         return (item && (item->flags & A_BINGO)) ? true : false;
1535 }
1536
1537 Datum
1538 prsd_lextype(PG_FUNCTION_ARGS)
1539 {
1540         LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1541         int                     i;
1542
1543         for (i = 1; i <= LASTNUM; i++)
1544         {
1545                 descr[i - 1].lexid = i;
1546                 descr[i - 1].alias = pstrdup(tok_alias[i]);
1547                 descr[i - 1].descr = pstrdup(lex_descr[i]);
1548         }
1549
1550         descr[LASTNUM].lexid = 0;
1551
1552         PG_RETURN_POINTER(descr);
1553 }
1554
1555 Datum
1556 prsd_start(PG_FUNCTION_ARGS)
1557 {
1558         PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1559 }
1560
1561 Datum
1562 prsd_nexttoken(PG_FUNCTION_ARGS)
1563 {
1564         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1565         char      **t = (char **) PG_GETARG_POINTER(1);
1566         int                *tlen = (int *) PG_GETARG_POINTER(2);
1567
1568         if (!TParserGet(p))
1569                 PG_RETURN_INT32(0);
1570
1571         *t = p->token;
1572         *tlen = p->lenbytetoken;
1573
1574         PG_RETURN_INT32(p->type);
1575 }
1576
1577 Datum
1578 prsd_end(PG_FUNCTION_ARGS)
1579 {
1580         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1581
1582         TParserClose(p);
1583         PG_RETURN_VOID();
1584 }
1585
/*
 * Token-type classification macros.  Each takes a token type id (one of the
 * "Output token categories" constants) and yields nonzero when the id is a
 * member of the listed set.  Note that the argument may be evaluated more
 * than once.
 */

/* matches SPACE only */
#define LEAVETOKEN(x)   ( (x)==SPACE )
/* matches URL and the three whole-hyphenated-word types */
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* matches SPACE only (same set as LEAVETOKEN) */
#define ENDPUNCTOKEN(x) ( (x)==SPACE )

/* matches tags, protocol heads, whitespace and XML entities */
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* types whose words prsd_headline marks "replace" when HighlightAll is off */
#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* same as HLIDIGNORE minus TAG_T; used by prsd_headline when HighlightAll is on */
#define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* types that prsd_headline does not count as words when measuring length */
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
/* types that prsd_headline will not accept as the last word of a headline */
#define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1595
/*
 * A window of headline words, passed as the opaque "checkval" argument to
 * checkcondition_HL() (hlCover() fills one in for each candidate cover).
 */
typedef struct
{
        HeadlineWordEntry *words;       /* first word of the window */
        int                     len;    /* number of words in the window */
} hlCheck;
1601
1602 static bool
1603 checkcondition_HL(void *checkval, QueryOperand *val)
1604 {
1605         int                     i;
1606
1607         for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1608         {
1609                 if (((hlCheck *) checkval)->words[i].item == val)
1610                         return true;
1611         }
1612         return false;
1613 }
1614
1615
1616 static bool
1617 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1618 {
1619         int                     i,
1620                                 j;
1621         QueryItem  *item = GETQUERY(query);
1622         int                     pos = *p;
1623
1624         *q = 0;
1625         *p = 0x7fffffff;
1626
1627         for (j = 0; j < query->size; j++)
1628         {
1629                 if (item->type != QI_VAL)
1630                 {
1631                         item++;
1632                         continue;
1633                 }
1634                 for (i = pos; i < prs->curwords; i++)
1635                 {
1636                         if (prs->words[i].item == &item->operand)
1637                         {
1638                                 if (i > *q)
1639                                         *q = i;
1640                                 break;
1641                         }
1642                 }
1643                 item++;
1644         }
1645
1646         if (*q == 0)
1647                 return false;
1648
1649         item = GETQUERY(query);
1650         for (j = 0; j < query->size; j++)
1651         {
1652                 if (item->type != QI_VAL)
1653                 {
1654                         item++;
1655                         continue;
1656                 }
1657                 for (i = *q; i >= pos; i--)
1658                 {
1659                         if (prs->words[i].item == &item->operand)
1660                         {
1661                                 if (i < *p)
1662                                         *p = i;
1663                                 break;
1664                         }
1665                 }
1666                 item++;
1667         }
1668
1669         if (*p <= *q)
1670         {
1671                 hlCheck         ch;
1672
1673                 ch.words = &(prs->words[*p]);
1674                 ch.len = *q - *p + 1;
1675                 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1676                         return true;
1677                 else
1678                 {
1679                         (*p)++;
1680                         return hlCover(prs, query, p, q);
1681                 }
1682         }
1683
1684         return false;
1685 }
1686
1687 Datum
1688 prsd_headline(PG_FUNCTION_ARGS)
1689 {
1690         HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1691         List       *prsoptions = (List *) PG_GETARG_POINTER(1);
1692         TSQuery         query = PG_GETARG_TSQUERY(2);
1693
1694         /* from opt + start and and tag */
1695         int                     min_words = 15;
1696         int                     max_words = 35;
1697         int                     shortword = 3;
1698
1699         int                     p = 0,
1700                                 q = 0;
1701         int                     bestb = -1,
1702                                 beste = -1;
1703         int                     bestlen = -1;
1704         int                     pose = 0,
1705                                 posb,
1706                                 poslen,
1707                                 curlen;
1708
1709         int                     i;
1710         int                     highlight = 0;
1711         ListCell   *l;
1712
1713         /* config */
1714         prs->startsel = NULL;
1715         prs->stopsel = NULL;
1716         foreach(l, prsoptions)
1717         {
1718                 DefElem    *defel = (DefElem *) lfirst(l);
1719                 char       *val = defGetString(defel);
1720
1721                 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1722                         max_words = pg_atoi(val, sizeof(int32), 0);
1723                 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1724                         min_words = pg_atoi(val, sizeof(int32), 0);
1725                 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1726                         shortword = pg_atoi(val, sizeof(int32), 0);
1727                 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1728                         prs->startsel = pstrdup(val);
1729                 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1730                         prs->stopsel = pstrdup(val);
1731                 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1732                         highlight = (pg_strcasecmp(val, "1") == 0 ||
1733                                                  pg_strcasecmp(val, "on") == 0 ||
1734                                                  pg_strcasecmp(val, "true") == 0 ||
1735                                                  pg_strcasecmp(val, "t") == 0 ||
1736                                                  pg_strcasecmp(val, "y") == 0 ||
1737                                                  pg_strcasecmp(val, "yes") == 0);
1738                 else
1739                         ereport(ERROR,
1740                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1741                                          errmsg("unrecognized headline parameter: \"%s\"",
1742                                                         defel->defname)));
1743         }
1744
1745         if (highlight == 0)
1746         {
1747                 if (min_words >= max_words)
1748                         ereport(ERROR,
1749                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1750                                          errmsg("MinWords should be less than MaxWords")));
1751                 if (min_words <= 0)
1752                         ereport(ERROR,
1753                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1754                                          errmsg("MinWords should be positive")));
1755                 if (shortword < 0)
1756                         ereport(ERROR,
1757                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1758                                          errmsg("ShortWord should be >= 0")));
1759
1760                 while (hlCover(prs, query, &p, &q))
1761                 {
1762                         /* find cover len in words */
1763                         curlen = 0;
1764                         poslen = 0;
1765                         for (i = p; i <= q && curlen < max_words; i++)
1766                         {
1767                                 if (!NONWORDTOKEN(prs->words[i].type))
1768                                         curlen++;
1769                                 if (prs->words[i].item && !prs->words[i].repeated)
1770                                         poslen++;
1771                                 pose = i;
1772                         }
1773
1774                         if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1775                         {
1776                                 /* best already finded, so try one more cover */
1777                                 p++;
1778                                 continue;
1779                         }
1780
1781                         posb = p;
1782                         if (curlen < max_words)
1783                         {                                       /* find good end */
1784                                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1785                                 {
1786                                         if (i != q)
1787                                         {
1788                                                 if (!NONWORDTOKEN(prs->words[i].type))
1789                                                         curlen++;
1790                                                 if (prs->words[i].item && !prs->words[i].repeated)
1791                                                         poslen++;
1792                                         }
1793                                         pose = i;
1794                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1795                                                 continue;
1796                                         if (curlen >= min_words)
1797                                                 break;
1798                                 }
1799                                 if (curlen < min_words && i >= prs->curwords)
1800                                 {                               /* got end of text and our cover is shoter
1801                                                                  * than min_words */
1802                                         for (i = p; i >= 0; i--)
1803                                         {
1804                                                 if (!NONWORDTOKEN(prs->words[i].type))
1805                                                         curlen++;
1806                                                 if (prs->words[i].item && !prs->words[i].repeated)
1807                                                         poslen++;
1808                                                 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1809                                                         continue;
1810                                                 if (curlen >= min_words)
1811                                                         break;
1812                                         }
1813                                         posb = (i >= 0) ? i : 0;
1814                                 }
1815                         }
1816                         else
1817                         {                                       /* shorter cover :((( */
1818                                 for (; curlen > min_words; i--)
1819                                 {
1820                                         if (!NONWORDTOKEN(prs->words[i].type))
1821                                                 curlen--;
1822                                         if (prs->words[i].item && !prs->words[i].repeated)
1823                                                 poslen--;
1824                                         pose = i;
1825                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1826                                                 continue;
1827                                         break;
1828                                 }
1829                         }
1830
1831                         if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1832                                 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1833                                  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
1834                         {
1835                                 bestb = posb;
1836                                 beste = pose;
1837                                 bestlen = poslen;
1838                         }
1839
1840                         p++;
1841                 }
1842
1843                 if (bestlen < 0)
1844                 {
1845                         curlen = 0;
1846                         for (i = 0; i < prs->curwords && curlen < min_words; i++)
1847                         {
1848                                 if (!NONWORDTOKEN(prs->words[i].type))
1849                                         curlen++;
1850                                 pose = i;
1851                         }
1852                         bestb = 0;
1853                         beste = pose;
1854                 }
1855         }
1856         else
1857         {
1858                 bestb = 0;
1859                 beste = prs->curwords - 1;
1860         }
1861
1862         for (i = bestb; i <= beste; i++)
1863         {
1864                 if (prs->words[i].item)
1865                         prs->words[i].selected = 1;
1866                 if (highlight == 0)
1867                 {
1868                         if (HLIDIGNORE(prs->words[i].type))
1869                                 prs->words[i].replace = 1;
1870                 }
1871                 else
1872                 {
1873                         if (XMLHLIDIGNORE(prs->words[i].type))
1874                                 prs->words[i].replace = 1;
1875                 }
1876
1877                 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
1878         }
1879
1880         if (!prs->startsel)
1881                 prs->startsel = pstrdup("<b>");
1882         if (!prs->stopsel)
1883                 prs->stopsel = pstrdup("</b>");
1884         prs->startsellen = strlen(prs->startsel);
1885         prs->stopsellen = strlen(prs->stopsel);
1886
1887         PG_RETURN_POINTER(prs);
1888 }