]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/wparser_def.c
Fix XML tag namespace change inadvertently missed from previous fix. Add
[postgresql] / src / backend / tsearch / wparser_def.c
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  *              Default text search parser
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.12 2007/11/25 15:37:11 adunstan Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23
24
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
27
28
29 /*
 * Output token categories.
 *
 * The numbers start at 1 and run without gaps up to LASTNUM.  Their order
 * lines up with the entries of tok_alias[] and lex_descr[], whose slot 0
 * is an empty placeholder.
 */
30
31 #define ASCIIWORD               1
32 #define WORD_T                  2
33 #define NUMWORD                 3
34 #define EMAIL                   4
35 #define URL_T                   5
36 #define HOST                    6
37 #define SCIENTIFIC              7
38 #define VERSIONNUMBER   8
39 #define NUMPARTHWORD    9
40 #define PARTHWORD               10
41 #define ASCIIPARTHWORD  11
42 #define SPACE                   12
43 #define TAG_T                   13
44 #define PROTOCOL                14
45 #define NUMHWORD                15
46 #define ASCIIHWORD              16
47 #define HWORD                   17
48 #define URLPATH                 18
49 #define FILEPATH                19
50 #define DECIMAL                 20
51 #define SIGNEDINT               21
52 #define UNSIGNEDINT             22
53 #define XMLENTITY               23
54
/* Highest category number in use; currently the same value as XMLENTITY. */
55 #define LASTNUM                 23
56
/*
 * Short alias of every token category.  Entry i belongs to the category
 * whose #define value is i (entry 1 is ASCIIWORD, entry 23 is XMLENTITY);
 * entry 0 is an empty placeholder because category numbers start at 1.
 */
57 static const char *const tok_alias[] = {
58         "",
59         "asciiword",
60         "word",
61         "numword",
62         "email",
63         "url",
64         "host",
65         "sfloat",
66         "version",
67         "hword_numpart",
68         "hword_part",
69         "hword_asciipart",
70         "blank",
71         "tag",
72         "protocol",
73         "numhword",
74         "asciihword",
75         "hword",
76         "url_path",
77         "file",
78         "float",
79         "int",
80         "uint",
81         "entity"
82 };
83
/*
 * Human-readable description of every token category, in the same order
 * as the category #defines and tok_alias[].  Entry 0 is an empty
 * placeholder because category numbers start at 1.
 */
84 static const char *const lex_descr[] = {
85         "",
86         "Word, all ASCII",
87         "Word, all letters",
88         "Word, letters and digits",
89         "Email address",
90         "URL",
91         "Host",
92         "Scientific notation",
93         "Version number",
94         "Hyphenated word part, letters and digits",
95         "Hyphenated word part, all letters",
96         "Hyphenated word part, all ASCII",
97         "Space symbols",
98         "XML tag",
99         "Protocol head",
100         "Hyphenated word, letters and digits",
101         "Hyphenated word, all ASCII",
102         "Hyphenated word, all letters",
103         "URL path",
104         "File or path name",
105         "Decimal notation",
106         "Signed integer",
107         "Unsigned integer",
108         "XML entity"
109 };
110
111
112 /*
 * Parser states.
 *
 * The action tables in this file are named after these values
 * (actionTPS_Base, actionTPS_InNumWord, ...).  TPS_Base is explicitly 0
 * and is the state a freshly initialized parser starts in (see
 * TParserInit).  TPS_Null is a fake value that only marks the end of the
 * list; the action tables also use it as a "tostate" in rows that do not
 * name a real target state (presumably meaning "no state change" --
 * confirm in TParserGet).
 *
 * The enumerators are numbered implicitly, so inserting or reordering
 * entries changes their values.
 */
113
114 typedef enum
115 {
116         TPS_Base = 0,
117         TPS_InNumWord,
118         TPS_InAsciiWord,
119         TPS_InWord,
120         TPS_InUnsignedInt,
121         TPS_InSignedIntFirst,
122         TPS_InSignedInt,
123         TPS_InSpace,
124         TPS_InUDecimalFirst,
125         TPS_InUDecimal,
126         TPS_InDecimalFirst,
127         TPS_InDecimal,
128         TPS_InVerVersion,
129         TPS_InSVerVersion,
130         TPS_InVersionFirst,
131         TPS_InVersion,
132         TPS_InMantissaFirst,
133         TPS_InMantissaSign,
134         TPS_InMantissa,
135         TPS_InXMLEntityFirst,
136         TPS_InXMLEntity,
137         TPS_InXMLEntityNumFirst,
138         TPS_InXMLEntityNum,
139         TPS_InXMLEntityHexNumFirst,
140         TPS_InXMLEntityHexNum,
141         TPS_InXMLEntityEnd,
142         TPS_InTagFirst,
143         TPS_InXMLBegin,
144         TPS_InTagCloseFirst,
145         TPS_InTagName,
146         TPS_InTagBeginEnd,
147         TPS_InTag,
148         TPS_InTagEscapeK,
149         TPS_InTagEscapeKK,
150         TPS_InTagBackSleshed,
151         TPS_InTagEnd,
152         TPS_InCommentFirst,
153         TPS_InCommentLast,
154         TPS_InComment,
155         TPS_InCloseCommentFirst,
156         TPS_InCloseCommentLast,
157         TPS_InCommentEnd,
158         TPS_InHostFirstDomain,
159         TPS_InHostDomainSecond,
160         TPS_InHostDomain,
161         TPS_InPortFirst,
162         TPS_InPort,
163         TPS_InHostFirstAN,
164         TPS_InHost,
165         TPS_InEmail,
166         TPS_InFileFirst,
167         TPS_InFileTwiddle,
168         TPS_InPathFirst,
169         TPS_InPathFirstFirst,
170         TPS_InPathSecond,
171         TPS_InFile,
172         TPS_InFileNext,
173         TPS_InURLPathFirst,
174         TPS_InURLPathStart,
175         TPS_InURLPath,
176         TPS_InFURL,
177         TPS_InProtocolFirst,
178         TPS_InProtocolSecond,
179         TPS_InProtocolEnd,
180         TPS_InHyphenAsciiWordFirst,
181         TPS_InHyphenAsciiWord,
182         TPS_InHyphenWordFirst,
183         TPS_InHyphenWord,
184         TPS_InHyphenNumWordFirst,
185         TPS_InHyphenNumWord,
186         TPS_InHyphenDigitLookahead,
187         TPS_InParseHyphen,
188         TPS_InParseHyphenHyphen,
189         TPS_InHyphenWordPart,
190         TPS_InHyphenAsciiWordPart,
191         TPS_InHyphenNumWordPart,
192         TPS_InHyphenUnsignedInt,
193         TPS_Null                                        /* last state (fake value) */
194 } TParserState;
195
196 /* forward declaration */
197 struct TParser;
198
/*
 * TParserCharTest: a test on the parser, returning int (the p_is*
 * functions in this file return non-zero for "matches").  p_iseq does not
 * fit this signature because it takes the character to compare as a
 * second argument; p_iseqC/p_isneC wrap it for use in action tables.
 */
199 typedef int (*TParserCharTest) (struct TParser *);              /* any p_is* functions
200                                                                                                                  * except p_iseq */
/*
 * TParserSpecial: an extra handler attached to an action-table row (for
 * example SpecialTags, SpecialFURL); it receives the parser and returns
 * nothing.
 */
201 typedef void (*TParserSpecial) (struct TParser *);              /* special handler for
202                                                                                                                  * special cases... */
203
/*
 * One row of a state's action table.  The tables below initialize rows
 * positionally as {isclass, c, flags, tostate, type, special}, so the
 * field order here must not change.
 */
204 typedef struct
205 {
206         TParserCharTest isclass;        /* test for this row; NULL in the final catch-all row of the tables */
207         char            c;              /* character used with p_iseqC/p_isneC, which compare against prs->c (presumably copied there by TParserGet -- confirm); 0 otherwise */
208         uint16          flags;          /* OR of the A_* bits defined below */
209         TParserState tostate;           /* target state; TPS_Null where the row names none */
210         int                     type;   /* token category (ASCIIWORD ... XMLENTITY), set on A_BINGO rows; 0 otherwise */
211         TParserSpecial special;         /* optional extra handler, or NULL */
212 } TParserStateActionItem;
213
214 /*
 * Flag bits in TParserStateActionItem.flags
 *
 * A_NEXT is 0, i.e. it stands for "no flag set"; the others are distinct
 * bits that the tables combine with '|' (e.g. A_NEXT | A_CLEAR,
 * A_BINGO | A_CLRALL).  The code that interprets the bits is TParserGet,
 * which is not part of this section; the one-line meanings below are
 * inferred from the names and from how the tables use them -- confirm
 * against TParserGet.
 */
215 #define A_NEXT          0x0000      /* plain transition, nothing special */
216 #define A_BINGO         0x0001      /* rows with this flag carry a token type: a token is complete */
217 #define A_POP           0x0002      /* presumably: abandon the attempt and return to the pushed position */
218 #define A_PUSH          0x0004      /* presumably: save the current position before trying tostate */
219 #define A_RERUN         0x0008      /* presumably: re-examine the current character in the new state */
220 #define A_CLEAR         0x0010      /* presumably: drop the saved position below the current one */
221 #define A_MERGE         0x0020      /* presumably: merge the current position into the saved one */
222 #define A_CLRALL        0x0040      /* presumably: drop all saved positions */
223
/*
 * One entry of the parser's stack of positions.  Entries are chained
 * through "prev": newTParserPosition() creates an entry on top of an
 * existing one (copying it), and TParserClose() walks the chain to free it.
 */
224 typedef struct TParserPosition
225 {
226         int                     posbyte;                /* position of parser in bytes */
227         int                     poschar;                /* position of parser in characters */
228         int                     charlen;                /* length of current char */
229         int                     lenbytetoken;   /* length of token-so-far in bytes */
230         int                     lenchartoken;   /* and in chars */
231         TParserState state;                     /* parser state at this position */
232         struct TParserPosition *prev;           /* entry below this one, or NULL */
233         const TParserStateActionItem *pushedAtAction;   /* reset to NULL by newTParserPosition(); presumably the table row that pushed this entry -- confirm in TParserGet */
234 } TParserPosition;
235
/*
 * Complete state of one parser instance.  Created by TParserInit() and
 * released by TParserClose().
 */
236 typedef struct TParser
237 {
238         /* string and position information */
239         char       *str;                        /* multibyte string */
240         int                     lenstr;                 /* length of mbstring */
241 #ifdef TS_USE_WIDE
242         wchar_t    *wstr;                       /* wide character string */
243         int                     lenwstr;                /* length of wstring */
244 #endif
245
246         /* State of parse */
247         int                     charmaxlen;     /* pg_database_encoding_max_length(), see TParserInit */
248         bool            usewide;                /* true if wstr is used for character tests */
249         TParserPosition *state;                 /* top of the chain of positions */
250         bool            ignore;                 /* switched on/off by SpecialTags for <script>/<style>; read by p_isignore */
251         bool            wanthost;               /* set by SpecialFURL; tested and cleared by p_isstophost */
252
253         /* silly char */
254         char            c;                      /* character that p_iseqC/p_isneC compare against */
255
256         /* out */
257         char       *token;                      /* current token text (read by SpecialTags) */
258         int                     lenbytetoken;   /* token length in bytes */
259         int                     lenchartoken;   /* token length in characters */
260         int                     type;           /* token category, e.g. HOST or URLPATH */
261 } TParser;
262
263
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
266
267
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
270 {
271         TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
272
273         if (prev)
274                 memcpy(res, prev, sizeof(TParserPosition));
275         else
276                 memset(res, 0, sizeof(TParserPosition));
277
278         res->prev = prev;
279
280         res->pushedAtAction = NULL;
281
282         return res;
283 }
284
285 static TParser *
286 TParserInit(char *str, int len)
287 {
288         TParser    *prs = (TParser *) palloc0(sizeof(TParser));
289
290         prs->charmaxlen = pg_database_encoding_max_length();
291         prs->str = str;
292         prs->lenstr = len;
293
294 #ifdef TS_USE_WIDE
295
296         /*
297          * Use wide char code only when max encoding length > 1.
298          */
299         if (prs->charmaxlen > 1)
300         {
301                 prs->usewide = true;
302                 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303                 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304                                                                   prs->str, prs->lenstr);
305         }
306         else
307 #endif
308                 prs->usewide = false;
309
310         prs->state = newTParserPosition(NULL);
311         prs->state->state = TPS_Base;
312
313 #ifdef WPARSER_TRACE
314         fprintf(stderr, "parsing \"%.*s\"\n", len, str);
315 #endif
316
317         return prs;
318 }
319
320 static void
321 TParserClose(TParser *prs)
322 {
323         while (prs->state)
324         {
325                 TParserPosition *ptr = prs->state->prev;
326
327                 pfree(prs->state);
328                 prs->state = ptr;
329         }
330
331 #ifdef TS_USE_WIDE
332         if (prs->wstr)
333                 pfree(prs->wstr);
334 #endif
335
336         pfree(prs);
337 }
338
339 /*
340  * Character-type support functions, equivalent to the is* macros but
341  * working with any possible encoding and locale. Note that with a
342  * multibyte encoding and the C locale, the isw* functions may fail
343  * or give wrong results. Note 2: a multibyte encoding combined with
344  * the C locale is often used for Asian languages.
345  */
346
347 #ifdef TS_USE_WIDE
348
/*
 * p_iswhat(type) defines two character-class tests on the parser's current
 * character:
 *
 *	p_is<type>(prs)		non-zero if the character is in class <type>
 *	p_isnot<type>(prs)	logical negation of the above
 *
 * When prs->usewide is set, the character is taken from prs->wstr at the
 * current character position: in the C locale its low byte is handed to
 * is<type>(), otherwise the character is handed to isw<type>().  Without
 * usewide, the byte at the current byte position of prs->str is handed to
 * is<type>().
 *
 * The wide character is converted to wint_t by value.  (Reinterpreting the
 * wchar_t pointer as a wint_t pointer would read the wrong bytes on
 * platforms where the two types differ in size; the hand-written
 * p_isalnum/p_isalpha convert by value as well.)
 */
#define p_iswhat(type)	\
static int	\
p_is##type(TParser *prs) {	\
	Assert( prs->state );	\
	if ( prs->usewide )	\
	{	\
		if ( lc_ctype_is_c() )	\
			return is##type( 0xff & *( prs->wstr + prs->state->poschar) );	\
	\
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) );	\
	}	\
	\
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );	\
}	\
	\
static int	\
p_isnot##type(TParser *prs) {	\
	return !p_is##type(prs);	\
}
368
369 static int
370 p_isalnum(TParser *prs)
371 {
372         Assert(prs->state);
373
374         if (prs->usewide)
375         {
376                 if (lc_ctype_is_c())
377                 {
378                         unsigned int c = *(prs->wstr + prs->state->poschar);
379
380                         /*
381                          * any non-ascii symbol with multibyte encoding with C-locale is
382                          * an alpha character
383                          */
384                         if (c > 0x7f)
385                                 return 1;
386
387                         return isalnum(0xff & c);
388                 }
389
390                 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
391         }
392
393         return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
394 }
395 static int
396 p_isnotalnum(TParser *prs)
397 {
398         return !p_isalnum(prs);
399 }
400
401 static int
402 p_isalpha(TParser *prs)
403 {
404         Assert(prs->state);
405
406         if (prs->usewide)
407         {
408                 if (lc_ctype_is_c())
409                 {
410                         unsigned int c = *(prs->wstr + prs->state->poschar);
411
412                         /*
413                          * any non-ascii symbol with multibyte encoding with C-locale is
414                          * an alpha character
415                          */
416                         if (c > 0x7f)
417                                 return 1;
418
419                         return isalpha(0xff & c);
420                 }
421
422                 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
423         }
424
425         return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
426 }
427
428 static int
429 p_isnotalpha(TParser *prs)
430 {
431         return !p_isalpha(prs);
432 }
433
434 /* p_iseq should be used only for ascii symbols */
435
436 static int
437 p_iseq(TParser *prs, char c)
438 {
439         Assert(prs->state);
440         return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
441 }
442 #else                                                   /* TS_USE_WIDE */
443
/*
 * Byte-wise variant of p_iswhat(type), used when TS_USE_WIDE is not
 * defined.  It defines
 *
 *	p_is<type>(prs)		is<type>() applied to the current byte of prs->str
 *	p_isnot<type>(prs)	1 if p_is<type>() returned 0, otherwise 0
 */
#define p_iswhat(type)	\
static int	\
p_is##type(TParser *prs)	\
{	\
	Assert(prs->state);	\
	return is##type((unsigned char) prs->str[prs->state->posbyte]);	\
}	\
	\
static int	\
p_isnot##type(TParser *prs)	\
{	\
	return p_is##type(prs) ? 0 : 1;	\
}
455
456
457 static int
458 p_iseq(TParser *prs, char c)
459 {
460         Assert(prs->state);
461         return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
462 }
463
/*
 * Without TS_USE_WIDE there are no hand-written p_isalnum/p_isalpha, so
 * generate them (and p_isnotalnum/p_isnotalpha) from the byte-wise
 * p_iswhat template.
 */
464 p_iswhat(alnum)
465 p_iswhat(alpha)
466 #endif   /* TS_USE_WIDE */
467
/*
 * Generate p_is<class>/p_isnot<class> for the remaining character classes,
 * using whichever p_iswhat variant was selected by TS_USE_WIDE.
 */
468 p_iswhat(digit)
469 p_iswhat(lower)
470 p_iswhat(print)
471 p_iswhat(punct)
472 p_iswhat(space)
473 p_iswhat(upper)
474 p_iswhat(xdigit)
475
476 static int
477 p_isEOF(TParser *prs)
478 {
479         Assert(prs->state);
480         return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
481 }
482
483 static int
484 p_iseqC(TParser *prs)
485 {
486         return p_iseq(prs, prs->c);
487 }
488
489 static int
490 p_isneC(TParser *prs)
491 {
492         return !p_iseq(prs, prs->c);
493 }
494
495 static int
496 p_isascii(TParser *prs)
497 {
498         return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
499 }
500
501 static int
502 p_isasclet(TParser *prs)
503 {
504         return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
505 }
506
507
/*
 * Deliberately suppress unused-function complaints for the p_is* functions
 * above: this function exists only so that each of them has at least one
 * reference.  It must never actually be called -- every call passes a NULL
 * parser.
 */
void		_make_compiler_happy(void);

void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
535
536
537 static void
538 SpecialTags(TParser *prs)
539 {
540         switch (prs->state->lenchartoken)
541         {
542                 case 8:                 /* </script */
543                         if (pg_strncasecmp(prs->token, "</script", 8) == 0)
544                                 prs->ignore = false;
545                         break;
546                 case 7:                 /* <script || </style */
547                         if (pg_strncasecmp(prs->token, "</style", 7) == 0)
548                                 prs->ignore = false;
549                         else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
550                                 prs->ignore = true;
551                         break;
552                 case 6:                 /* <style */
553                         if (pg_strncasecmp(prs->token, "<style", 6) == 0)
554                                 prs->ignore = true;
555                         break;
556                 default:
557                         break;
558         }
559 }
560
561 static void
562 SpecialFURL(TParser *prs)
563 {
564         prs->wanthost = true;
565         prs->state->posbyte -= prs->state->lenbytetoken;
566         prs->state->poschar -= prs->state->lenchartoken;
567 }
568
569 static void
570 SpecialHyphen(TParser *prs)
571 {
572         prs->state->posbyte -= prs->state->lenbytetoken;
573         prs->state->poschar -= prs->state->lenchartoken;
574 }
575
576 static void
577 SpecialVerVersion(TParser *prs)
578 {
579         prs->state->posbyte -= prs->state->lenbytetoken;
580         prs->state->poschar -= prs->state->lenchartoken;
581         prs->state->lenbytetoken = 0;
582         prs->state->lenchartoken = 0;
583 }
584
585 static int
586 p_isstophost(TParser *prs)
587 {
588         if (prs->wanthost)
589         {
590                 prs->wanthost = false;
591                 return 1;
592         }
593         return 0;
594 }
595
596 static int
597 p_isignore(TParser *prs)
598 {
599         return (prs->ignore) ? 1 : 0;
600 }
601
602 static int
603 p_ishost(TParser *prs)
604 {
605         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
606         int                     res = 0;
607
608         if (TParserGet(tmpprs) && tmpprs->type == HOST)
609         {
610                 prs->state->posbyte += tmpprs->lenbytetoken;
611                 prs->state->poschar += tmpprs->lenchartoken;
612                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613                 prs->state->lenchartoken += tmpprs->lenchartoken;
614                 prs->state->charlen = tmpprs->state->charlen;
615                 res = 1;
616         }
617         TParserClose(tmpprs);
618
619         return res;
620 }
621
622 static int
623 p_isURLPath(TParser *prs)
624 {
625         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
626         int                     res = 0;
627
628         tmpprs->state = newTParserPosition(tmpprs->state);
629         tmpprs->state->state = TPS_InFileFirst;
630
631         if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
632         {
633                 prs->state->posbyte += tmpprs->lenbytetoken;
634                 prs->state->poschar += tmpprs->lenchartoken;
635                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636                 prs->state->lenchartoken += tmpprs->lenchartoken;
637                 prs->state->charlen = tmpprs->state->charlen;
638                 res = 1;
639         }
640         TParserClose(tmpprs);
641
642         return res;
643 }
644
645 /*
 * Table of state/action of parser
 *
 * There is one array per parser state.  Each row is a
 * TParserStateActionItem written positionally as
 *		{isclass, c, flags, tostate, type, special}
 * Every table ends with a row whose isclass is NULL (the catch-all).
 * The rows are ordered deliberately; presumably the first row whose test
 * succeeds is the one applied -- the matching loop is in TParserGet.
 */
648
/*
 * TPS_Base: start of a token.  Dispatches on the first character:
 * '<' tries a tag, ASCII letters / other letters / digits start the
 * corresponding word or number states, '-' and '+' try a signed number,
 * '&' tries an XML entity, '~', '/' and '.' try file/path states, and
 * everything else (or anything while the ignore flag is set) is treated
 * as space.
 */
649 static const TParserStateActionItem actionTPS_Base[] = {
650         {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651         {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652         {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653         {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654         {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655         {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656         {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657         {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658         {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659         {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661         {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662         {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
663 };
664
665
/*
 * TPS_InNumWord: inside a word of letters and digits.  Alphanumerics keep
 * the state; '@', '/', '.' and '-' push the email, file, file-name and
 * hyphenated-word states; EOF or any other character reports NUMWORD.
 */
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667         {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668         {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672         {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673         {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
674 };
675
/*
 * TPS_InAsciiWord: inside an all-ASCII word; EOF or an unmatched
 * character reports ASCIIWORD.  Note the pairs of rows testing the same
 * thing ('.', '-', digit): the first row of each pair pushes one
 * interpretation (host domain, host, host) and the second row is the
 * alternative (file name, hyphenated word, number-word) -- presumably
 * tried when the pushed attempt is popped; confirm in TParserGet.
 */
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677         {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682         {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684         {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687         {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688         {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689         {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
690 };
691
/*
 * TPS_InWord: inside a word of letters that is not all ASCII.  A digit
 * turns it into a number-word, '-' pushes the hyphenated-word state, and
 * EOF or any other non-letter reports WORD_T.
 */
692 static const TParserStateActionItem actionTPS_InWord[] = {
693         {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694         {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695         {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696         {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697         {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
698 };
699
/*
 * TPS_InUnsignedInt: inside a run of digits; EOF or an unmatched character
 * reports UNSIGNEDINT.  '.' is listed twice (host domain first, decimal
 * fraction second), 'e'/'E' push the mantissa state, an ASCII letter
 * pushes the host state, any letter turns the token into a number-word,
 * and '/' pushes the file state.
 */
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701         {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704         {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707         {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708         {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710         {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
711 };
712
/*
 * TPS_InSignedIntFirst: entered from TPS_Base on '-' or '+'.  Only a digit
 * continues (to TPS_InSignedInt, with A_CLEAR); EOF or anything else
 * takes the A_POP row.
 */
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715         {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716         {NULL, 0, A_POP, TPS_Null, 0, NULL}
717 };
718
/*
 * TPS_InSignedInt: digits after a sign.  '.' pushes the decimal-fraction
 * state, 'e'/'E' push the mantissa state; EOF or any other character
 * reports SIGNEDINT.
 */
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720         {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722         {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725         {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
726 };
727
/*
 * TPS_InSpace: inside a run of "space" (anything that is not part of
 * another token).  The run is reported as SPACE at EOF and in front of
 * '<', '-', '+', '&' and '/' -- the characters for which TPS_Base has its
 * own rows -- and in front of any alphanumeric character.  While the
 * ignore flag is set (see SpecialTags) every character except '<'
 * continues the run; otherwise any non-alphanumeric character does.
 */
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729         {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730         {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731         {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732         {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733         {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734         {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735         {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736         {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737         {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
738 };
739
/*
 * TPS_InUDecimalFirst: just after the '.' of an unsigned number.  A digit
 * continues to TPS_InUDecimal with A_CLEAR; EOF or anything else takes
 * the A_POP row.
 */
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742         {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743         {NULL, 0, A_POP, TPS_Null, 0, NULL}
744 };
745
/*
 * TPS_InUDecimal: fraction digits of an unsigned number.  A further '.'
 * pushes the version-number state, 'e'/'E' push the mantissa state; EOF
 * or any other character reports DECIMAL.
 */
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747         {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748         {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749         {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752         {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
753 };
754
/*
 * TPS_InDecimalFirst: just after the '.' of a signed number.  A digit
 * continues to TPS_InDecimal with A_CLEAR; EOF or anything else takes the
 * A_POP row.
 */
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757         {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758         {NULL, 0, A_POP, TPS_Null, 0, NULL}
759 };
760
/*
 * TPS_InDecimal: fraction digits of a signed number.  Same shape as
 * TPS_InUDecimal, except that a further '.' pushes TPS_InVerVersion
 * instead of TPS_InVersionFirst.
 */
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762         {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763         {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764         {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765         {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766         {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767         {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
768 };
769
/*
 * TPS_InVerVersion: reached from TPS_InDecimal on a second '.', i.e. a
 * signed number that turns out to look like a version number.  On a digit
 * the row calls SpecialVerVersion (which moves the position back to the
 * start of the token and zeroes the token lengths) and re-runs in
 * TPS_InSVerVersion; EOF or anything else takes the A_POP row.
 */
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772         {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773         {NULL, 0, A_POP, TPS_Null, 0, NULL}
774 };
775
/*
 * TPS_InSVerVersion: continuation of the above after the rewind.  On a
 * digit it reports what was consumed as SPACE (with A_CLRALL) and
 * continues in TPS_InUnsignedInt; a non-digit is consumed with A_NEXT.
 * Presumably this emits the leading sign as SPACE so the rest can be
 * re-read as an unsigned number/version -- confirm in TParserGet.
 */
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778         {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779         {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
780 };
781
782
/*
 * TPS_InVersionFirst: just after a '.' inside a version number.  A digit
 * continues to TPS_InVersion with A_CLEAR; EOF or anything else takes the
 * A_POP row.
 */
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785         {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786         {NULL, 0, A_POP, TPS_Null, 0, NULL}
787 };
788
/*
 * TPS_InVersion: digits of a version-number component.  Another '.'
 * pushes TPS_InVersionFirst again; EOF or any other character reports
 * VERSIONNUMBER.
 */
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790         {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791         {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792         {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793         {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
794 };
795
/*
 * TPS_InMantissaFirst: just after the 'e'/'E' of a number (the state
 * names say "mantissa" for the digits that follow the 'e').  A digit
 * continues to TPS_InMantissa with A_CLEAR, a sign goes to
 * TPS_InMantissaSign; EOF or anything else takes the A_POP row.
 */
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798         {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799         {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800         {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801         {NULL, 0, A_POP, TPS_Null, 0, NULL}
802 };
803
/*
 * TPS_InMantissaSign: just after the sign that follows 'e'/'E'.  Only a
 * digit continues (to TPS_InMantissa, with A_CLEAR).
 */
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806         {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807         {NULL, 0, A_POP, TPS_Null, 0, NULL}
808 };
809
/*
 * TPS_InMantissa: digits after 'e'/'E' and the optional sign.  EOF or any
 * non-digit reports SCIENTIFIC.
 */
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811         {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812         {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813         {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
814 };
815
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818         {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819         {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821         {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822         {NULL, 0, A_POP, TPS_Null, 0, NULL}
823 };
824
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827         {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829         {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830         {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
831         {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
832         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
833         {NULL, 0, A_POP, TPS_Null, 0, NULL}
834 };
835
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
837         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
838         {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
839         {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
840         {NULL, 0, A_POP, TPS_Null, 0, NULL}
841 };
842
843 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
844         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
845         {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
846         {NULL, 0, A_POP, TPS_Null, 0, NULL}
847 };
848
/*
 * Decimal numeric entity: p_isdigit stays here, ';' moves to
 * TPS_InXMLEntityEnd, EOF or anything else pops.
 */
849 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
850         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
851         {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
852         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
853         {NULL, 0, A_POP, TPS_Null, 0, NULL}
854 };
855
/*
 * Hexadecimal numeric entity: p_isxdigit stays here, ';' moves to
 * TPS_InXMLEntityEnd, EOF or anything else pops.
 */
856 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
857         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
858         {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
859         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
860         {NULL, 0, A_POP, TPS_Null, 0, NULL}
861 };
862
/*
 * Reached after the terminating ';'.  The single unconditional rule emits an
 * XMLENTITY token, discards one saved state (A_CLEAR) and returns to
 * TPS_Base.
 */
863 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
864         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, XMLENTITY, NULL}
865 };
866
/*
 * First character of a tag candidate.  '/' pushes the closing-tag state, '!'
 * the comment/DOCTYPE state, '?' the XML-declaration state; p_isasclet, ':'
 * or '_' pushes TPS_InTagName.  Every accepting rule uses A_PUSH, so when a
 * sub-parse fails and pops, TParserGet resumes here at the rule after the one
 * that pushed.  EOF or anything else pops.
 */
867 static const TParserStateActionItem actionTPS_InTagFirst[] = {
868         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
869         {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
870         {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
871         {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
872         {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
873         {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
874         {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
875         {NULL, 0, A_POP, TPS_Null, 0, NULL}
876 };
877
/*
 * Pushed from TPS_InTagFirst on '?'.  Only 'x' is accepted; parsing then
 * continues as a generic tag body (TPS_InTag).  Anything else pops.
 */
878 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
879         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
880         /* <?xml ... */
881     /* XXX do we want states for the m and l ?  Right now this accepts <?xZ */
882         {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
883         {NULL, 0, A_POP, TPS_Null, 0, NULL}
884 };
885
/*
 * Pushed from TPS_InTagFirst on '/'.  The name of a closing tag must begin
 * with a p_isasclet character; anything else pops.  Unlike TPS_InTagFirst,
 * ':' and '_' are not accepted as a first name character here.
 */
886 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
887         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
888         {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
889         {NULL, 0, A_POP, TPS_Null, 0, NULL}
890 };
891
/*
 * Characters of a tag name.  '/' moves to TPS_InTagBeginEnd, '>' to
 * TPS_InTagEnd and p_isspace to TPS_InTag; the latter two also invoke the
 * SpecialTags handler.  p_isalnum, ':', '_', '.' and '-' are consumed without
 * changing state (tostate TPS_Null means "keep the current state").  EOF or
 * anything else pops.
 */
892 static const TParserStateActionItem actionTPS_InTagName[] = {
893         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
894         /* <br/> case */
895         {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
896         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
897         {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
898         {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
899         {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
900         {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
901         {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
902         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
903         {NULL, 0, A_POP, TPS_Null, 0, NULL}
904 };
905
/*
 * Reached from TPS_InTagName on '/': the next character must be '>' (moving
 * to TPS_InTagEnd); anything else pops.
 */
906 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
907         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
908         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
909         {NULL, 0, A_POP, TPS_Null, 0, NULL}
910 };
911
/*
 * Body of a tag after its name.  '>' moves to TPS_InTagEnd; a single or
 * double quote enters the matching quoted-span state; the listed characters
 * and p_isspace are consumed without changing state.  SpecialTags is invoked
 * on '>' and on p_isspace.  EOF or any character not listed pops.
 */
912 static const TParserStateActionItem actionTPS_InTag[] = {
913         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
914         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
915         {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
916         {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
917         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
918         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
919         {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
920         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
921         {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
922         {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
923         {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
924         {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
925         {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
926         {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
927         {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
928         {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
929         {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
930         {NULL, 0, A_POP, TPS_Null, 0, NULL}
931 };
932
/*
 * Inside a single-quoted span within a tag.  A backslash pushes
 * TPS_InTagBackSleshed, the closing single quote returns to TPS_InTag, and
 * any other character stays here.  EOF pops.
 */
933 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
934         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
935         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
936         {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
937         {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
938 };
939
/*
 * Inside a double-quoted span within a tag.  A backslash pushes
 * TPS_InTagBackSleshed, the closing double quote returns to TPS_InTag, and
 * any other character stays here.  EOF pops.
 */
940 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
941         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
942         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
943         {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
944         {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
945 };
946
/*
 * Character following a backslash inside a quoted span (pushed from
 * TPS_InTagEscapeK / TPS_InTagEscapeKK).  Any character is accepted with
 * A_MERGE: TParserGet copies the current position into the pushed state and
 * discards the current one, so the quoted-span state continues from here.
 * EOF pops.
 */
947 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
948         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
949         {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
950 };
951
/*
 * Reached after the closing '>' of a tag.  The single unconditional rule
 * emits a TAG_T token, discards every saved state (A_CLRALL) and returns to
 * TPS_Base.
 */
952 static const TParserStateActionItem actionTPS_InTagEnd[] = {
953         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
954 };
955
/*
 * Pushed from TPS_InTagFirst on '!'.  '-' begins a comment opener
 * (TPS_InCommentLast); 'D' or 'd' is treated as the start of a DOCTYPE
 * declaration and parsed as a generic tag body.  Anything else pops.
 */
956 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
957         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
958         {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
959         /* <!DOCTYPE ...> */
960         {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
961         {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
962         {NULL, 0, A_POP, TPS_Null, 0, NULL}
963 };
964
/*
 * Reached from TPS_InCommentFirst on '-': a second '-' is required to enter
 * the comment body (TPS_InComment); anything else pops.
 */
965 static const TParserStateActionItem actionTPS_InCommentLast[] = {
966         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
967         {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
968         {NULL, 0, A_POP, TPS_Null, 0, NULL}
969 };
970
/*
 * Comment body.  '-' may start the closing sequence
 * (TPS_InCloseCommentFirst); any other character is consumed without a state
 * change.  EOF pops.
 */
971 static const TParserStateActionItem actionTPS_InComment[] = {
972         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
973         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
974         {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
975 };
976
/*
 * One '-' has been seen inside a comment.  A second '-' moves to
 * TPS_InCloseCommentLast; any other character returns to the comment body.
 * EOF pops.
 */
977 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
978         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
979         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
980         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
981 };
982
/*
 * Two '-' have been seen inside a comment.  Further '-' characters stay
 * here, '>' completes the comment (TPS_InCommentEnd), and any other
 * character returns to the comment body.  EOF pops.
 */
983 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
984         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
985         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
986         {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
987         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
988 };
989
/*
 * Reached after the closing '>' of a comment.  The whole comment is emitted
 * as a TAG_T token; all saved states are discarded (A_CLRALL) and the parser
 * returns to TPS_Base.
 */
990 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
991         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
992 };
993
/*
 * Pushed on '.' from the host states (TPS_InHost, TPS_InHostDomain,
 * TPS_InHostDomainSecond): first character of the next label.  p_isasclet
 * moves to TPS_InHostDomainSecond, p_isdigit to TPS_InHost; anything else
 * pops.
 */
994 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
995         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
996         {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
997         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
998         {NULL, 0, A_POP, TPS_Null, 0, NULL}
999 };
1000
/*
 * Second character of a label that began with a p_isasclet character.
 * Another p_isasclet moves to TPS_InHostDomain; p_isdigit, '-', '.' and '@'
 * push the corresponding host/email states.  No rule here emits a token: EOF
 * or any other character pops.
 */
1001 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1002         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1003         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1004         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1006         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1007         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1008         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1009 };
1010
/*
 * Inside a label that already has two p_isasclet characters; a HOST token
 * can end here.  EOF and the default rule emit HOST and return to TPS_Base.
 * p_isstophost emits HOST and continues in TPS_InURLPathStart; '/' pushes
 * TPS_InFURL to try a full URL.
 *
 * The second p_isdigit rule (A_POP) cannot match on a fresh scan because the
 * earlier p_isdigit rule wins.  It is reached only when TParserGet resumes
 * after a failed push, restarting at the rule following the one that pushed.
 */
1011 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1012         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1013         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1014         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1015         {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1016         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1017         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1018         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1019         {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1020         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1021         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1022         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1023 };
1024
/*
 * Pushed from TPS_InHostDomain on ':'.  The port must start with a p_isdigit
 * character (moving to TPS_InPort); anything else pops.
 */
1025 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1026         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1027         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1028         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1029 };
1030
/*
 * Port digits after "host:".  p_isdigit stays here.  EOF and the default
 * rule emit HOST (including the port) and return to TPS_Base; p_isstophost
 * emits HOST and continues in TPS_InURLPathStart; '/' pushes TPS_InFURL.
 */
1031 static const TParserStateActionItem actionTPS_InPort[] = {
1032         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1033         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1034         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1035         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1036         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1037 };
1038
/*
 * Pushed on '-' from the host states.  The character after the hyphen must
 * satisfy p_isdigit or p_isasclet (moving to TPS_InHost); anything else
 * pops.
 */
1039 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1040         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1041         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1042         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1043         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1044 };
1045
/*
 * Generic host-name characters.  p_isdigit and p_isasclet stay here; '@',
 * '.' and '-' push the email, next-label and after-hyphen states.  No rule
 * here emits a token: EOF or any other character pops.
 */
1046 static const TParserStateActionItem actionTPS_InHost[] = {
1047         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1048         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1049         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1050         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1051         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1052         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1053         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1054 };
1055
/*
 * Pushed on '@' from the host states.  If p_ishost succeeds, the token is
 * emitted as EMAIL, all saved states are discarded and the parser returns to
 * TPS_Base; otherwise it pops.  There is no separate EOF rule; p_ishost is
 * tried first in every case.
 */
1056 static const TParserStateActionItem actionTPS_InEmail[] = {
1057         {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1058         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1059 };
1060
/*
 * First character of a file-path component.  p_isasclet, p_isdigit and '_'
 * move to TPS_InFile; '.' moves to TPS_InPathFirst; '?' pushes
 * TPS_InURLPathFirst; '~' pushes TPS_InFileTwiddle.  Anything else pops.
 */
1061 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1062         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1063         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1064         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1065         {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1066         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1067         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1068         {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1069         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1070 };
1071
/*
 * Pushed from TPS_InFileFirst on '~'.  p_isasclet, p_isdigit and '_' move to
 * TPS_InFile; '/' returns to TPS_InFileFirst; anything else pops.
 */
1072 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1073         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1074         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1075         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1076         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1077         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1078         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1079 };
1080
/*
 * Reached from TPS_InFileFirst on '.'.  p_isasclet, p_isdigit and '_' move
 * to TPS_InFile; a second '.' moves to TPS_InPathSecond; '/' returns to
 * TPS_InFileFirst.  Anything else pops.
 */
1081 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1082         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1083         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1084         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1085         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1086         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1087         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1088         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1089 };
1090
/*
 * Restricted variant of TPS_InPathFirst: only '.' (to TPS_InPathSecond) and
 * '/' (to TPS_InFileFirst) are accepted; anything else pops.
 * NOTE(review): the transition into this state is not in this part of the
 * file.
 */
1091 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1092         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1093         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1094         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1095         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1096 };
1097
/*
 * Reached after a second consecutive '.'.  EOF and p_isspace emit FILEPATH.
 * '/' is listed twice on purpose: the first rule pushes TPS_InFileFirst to
 * try a longer path, and if that sub-parse pops back, TParserGet resumes at
 * the second '/' rule, which emits FILEPATH.  Each emitting rule also
 * discards one saved state (A_CLEAR).  Anything else pops.
 */
1098 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1099         {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1100         {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1101         {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1102         {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1103         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1104 };
1105
/*
 * Inside a file-path component.  p_isasclet, p_isdigit, '_' and '-' stay
 * here.  '.', '/' and '?' push TPS_InFileNext, TPS_InFileFirst and
 * TPS_InURLPathFirst, so a failed continuation pops back here.  EOF and the
 * default rule emit FILEPATH and return to TPS_Base.
 */
1106 static const TParserStateActionItem actionTPS_InFile[] = {
1107         {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1108         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1109         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1110         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1111         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1112         {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1113         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1114         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1115         {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1116 };
1117
/*
 * Pushed from TPS_InFile on '.'.  If p_isasclet, p_isdigit or '_' follows,
 * the saved state is discarded (A_CLEAR), which commits the '.', and parsing
 * continues in TPS_InFile.  Anything else pops back to before the '.'.
 */
1118 static const TParserStateActionItem actionTPS_InFileNext[] = {
1119         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1120         {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1121         {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1122         {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1123         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1124 };
1125
/*
 * Pushed on '?' from TPS_InFileFirst / TPS_InFile.  EOF, a double quote or a
 * single quote pops.  A p_isnotspace character discards the saved state
 * (A_CLEAR) and continues in TPS_InURLPath.  Anything else pops.
 */
1126 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1127         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1128         {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1129         {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1130         {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1131         {NULL, 0, A_POP, TPS_Null, 0, NULL},
1132 };
1133
/*
 * Entered after a HOST token is emitted by a p_isstophost rule
 * (TPS_InHostDomain / TPS_InPort).  The single unconditional rule switches
 * to TPS_InURLPath.
 */
1134 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1135         {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1136 };
1137
/*
 * Body of a URL path.  p_isnotspace characters stay here.  EOF, a double
 * quote, a single quote, or any character failing p_isnotspace ends the
 * token as URLPATH and returns to TPS_Base.
 */
1138 static const TParserStateActionItem actionTPS_InURLPath[] = {
1139         {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1140         {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1141         {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1142         {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1143         {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1144 };
1145
/*
 * Pushed on '/' after a host (TPS_InHostDomain / TPS_InPort).  If
 * p_isURLPath succeeds, a URL_T token is emitted, all saved states are
 * discarded, the SpecialFURL handler is invoked and the parser returns to
 * TPS_Base.  Otherwise it pops back to the host state.
 */
1146 static const TParserStateActionItem actionTPS_InFURL[] = {
1147         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1148         {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1149         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1150 };
1151
/*
 * First of the three protocol-suffix states: requires '/' (moving to
 * TPS_InProtocolSecond); anything else pops.
 * NOTE(review): the transition into this state is not in this part of the
 * file.
 */
1152 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1153         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1154         {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1155         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1156 };
1157
/*
 * Reached from TPS_InProtocolFirst on '/': a second '/' is required (moving
 * to TPS_InProtocolEnd); anything else pops.
 */
1158 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1159         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1160         {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1161         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1162 };
1163
/*
 * Reached after the second '/'.  The single unconditional rule emits a
 * PROTOCOL token, discards all saved states (A_CLRALL) and returns to
 * TPS_Base.
 */
1164 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1165         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1166 };
1167
/*
 * Pushed on '-' from TPS_InHyphenAsciiWord: first character of the next part
 * of a hyphenated word.  Rule order matters because the first match wins:
 * p_isasclet keeps the word in the ASCII family, any other p_isalpha
 * character switches to TPS_InHyphenWord, and p_isdigit goes to the digit
 * lookahead.  Anything else pops, so the hyphen is not part of the word.
 */
1168 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1169         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1171         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1172         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1173         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1174 };
1175
/*
 * Inside a hyphenated word whose characters so far all matched p_isasclet.
 * p_isasclet stays here, another p_isalpha character moves to
 * TPS_InHyphenWord, and p_isdigit moves to TPS_InHyphenNumWord.  '-' pushes
 * TPS_InHyphenAsciiWordFirst.  EOF and the default rule emit the whole word
 * as ASCIIHWORD, discard all saved states, invoke SpecialHyphen and continue
 * in TPS_InParseHyphen.
 */
1176 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1177         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1178         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1179         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1180         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1181         {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1182         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1183 };
1184
/*
 * Pushed on '-' from TPS_InHyphenWord: first character of the next part.
 * p_isalpha continues the word (TPS_InHyphenWord); p_isdigit goes to the
 * digit lookahead; anything else pops.
 */
1185 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1186         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1187         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1188         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1189         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1190 };
1191
/*
 * Inside a hyphenated word made of p_isalpha characters.  p_isalpha stays
 * here, p_isdigit moves to TPS_InHyphenNumWord, and '-' pushes
 * TPS_InHyphenWordFirst.  EOF and the default rule emit the whole word as
 * HWORD, discard all saved states, invoke SpecialHyphen and continue in
 * TPS_InParseHyphen.
 */
1192 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1193         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1194         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1195         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1196         {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1197         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1198 };
1199
/*
 * Pushed on '-' from TPS_InHyphenNumWord: first character of the next part.
 * p_isalpha continues in TPS_InHyphenNumWord; p_isdigit goes to the digit
 * lookahead; anything else pops.
 */
1200 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1201         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1202         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1203         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1204         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1205 };
1206
/*
 * Inside a hyphenated word that mixes letters and digits.  p_isalnum stays
 * here and '-' pushes TPS_InHyphenNumWordFirst.  EOF and the default rule
 * emit the whole word as NUMHWORD, discard all saved states, invoke
 * SpecialHyphen and continue in TPS_InParseHyphen.
 */
1207 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1208         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1209         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1210         {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1211         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1212 };
1213
/*
 * A part of a hyphenated word that began with a digit.  Further p_isdigit
 * characters stay here.  The part is accepted only once a p_isalpha
 * character appears (moving to TPS_InHyphenNumWord).  EOF or anything else
 * pops, so a part consisting only of digits does not extend the word.
 */
1214 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1215         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1216         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1217         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1218         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1219 };
1220
/*
 * Entered after a whole hyphenated word has been emitted (the tostate of the
 * ASCIIHWORD / HWORD / NUMHWORD rules); this state dispatches on the start
 * of each part.  p_isasclet and p_isalpha start an ASCII or general word
 * part, p_isdigit pushes TPS_InHyphenUnsignedInt, and '-' pushes
 * TPS_InParseHyphenHyphen.  EOF and the default rule use A_RERUN: TParserGet
 * switches to TPS_Base and loops again without advancing.
 */
1221 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1222         {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1223         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1224         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1225         {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1226         {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1227         {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1228 };
1229
/*
 * Pushed from TPS_InParseHyphen on '-'.  If the next character satisfies
 * p_isalnum, what has been scanned so far is emitted as a SPACE token, one
 * saved state is discarded and the parser returns to TPS_InParseHyphen.
 * TParserGet stops on A_BINGO before advancing, so the p_isalnum character
 * itself is not consumed.  Anything else pops.
 */
1230 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1231         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1232         {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1233         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1234 };
1235
/*
 * One part of a hyphenated word, made of p_isalpha characters.  p_isalpha
 * stays here; p_isdigit switches to TPS_InHyphenNumWordPart.  EOF emits
 * PARTHWORD and returns to TPS_Base; any other character emits PARTHWORD and
 * returns to TPS_InParseHyphen for the next part.
 */
1236 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1237         {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1238         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1239         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1240         {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1241 };
1242
/*
 * One part of a hyphenated word whose characters so far all matched
 * p_isasclet.  p_isasclet stays here, another p_isalpha character switches
 * to TPS_InHyphenWordPart, and p_isdigit switches to
 * TPS_InHyphenNumWordPart.  EOF emits ASCIIPARTHWORD and returns to
 * TPS_Base; any other character emits ASCIIPARTHWORD and returns to
 * TPS_InParseHyphen.
 */
1243 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1244         {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1245         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1246         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1247         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1248         {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1249 };
1250
/*
 * One part of a hyphenated word that mixes letters and digits.  p_isalnum
 * stays here.  EOF emits NUMPARTHWORD and returns to TPS_Base; any other
 * character emits NUMPARTHWORD and returns to TPS_InParseHyphen.
 */
1251 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1252         {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1253         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1254         {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1255 };
1256
/*
 * Pushed from TPS_InParseHyphen on p_isdigit.  Further digits are consumed
 * without a state change.  If a p_isalpha character follows, the saved state
 * is discarded (A_CLEAR) and the part continues as TPS_InHyphenNumWordPart.
 * EOF or anything else pops back to TPS_InParseHyphen.
 */
1257 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1258         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1259         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1260         {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1261         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1262 };
1263
1264
1265 /*
1266  * main table of per-state parser actions
1267  *
1268  * Each TParserStateAction ties one parser state to its rule list (an array
1269  * of TParserStateActionItem).  The state value is stored alongside so that
1270  * TParserGet can Assert that the Actions[] entry it indexed belongs to the
1271  * current state; the printable state name exists only in WPARSER_TRACE
1272  * builds.
1273  */
1268 typedef struct
1269 {
1270         const TParserStateActionItem *action;           /* the actual state info */
1271         TParserState state;                     /* only for Assert crosscheck */
1272 #ifdef WPARSER_TRACE
1273         const char *state_name;         /* only for debug printout */
1274 #endif
1275 } TParserStateAction;
1276
/*
 * Build one Actions[] initializer from a state name.  CppConcat(action,state)
 * forms the identifier of that state's rule table (for example
 * actionTPS_InTag for TPS_InTag), so every state needs a table named after
 * it.  The trace build additionally stores the state name as a string.
 */
1277 #ifdef WPARSER_TRACE
1278 #define TPARSERSTATEACTION(state) \
1279         { CppConcat(action,state), state, CppAsString(state) }
1280 #else
1281 #define TPARSERSTATEACTION(state) \
1282         { CppConcat(action,state), state }
1283 #endif
1284
1285 /*
1286  * order must be the same as in typedef enum {} TParserState!!
1287  *
1288  * TParserGet indexes this array directly with prs->state->state and
1289  * Asserts that the entry's stored state equals the index, so an entry that
1290  * is out of order is caught only in assert-enabled builds.
1291  */
1288
1289 static const TParserStateAction Actions[] = {
1290         TPARSERSTATEACTION(TPS_Base),
1291         TPARSERSTATEACTION(TPS_InNumWord),
1292         TPARSERSTATEACTION(TPS_InAsciiWord),
1293         TPARSERSTATEACTION(TPS_InWord),
1294         TPARSERSTATEACTION(TPS_InUnsignedInt),
1295         TPARSERSTATEACTION(TPS_InSignedIntFirst),
1296         TPARSERSTATEACTION(TPS_InSignedInt),
1297         TPARSERSTATEACTION(TPS_InSpace),
1298         TPARSERSTATEACTION(TPS_InUDecimalFirst),
1299         TPARSERSTATEACTION(TPS_InUDecimal),
1300         TPARSERSTATEACTION(TPS_InDecimalFirst),
1301         TPARSERSTATEACTION(TPS_InDecimal),
1302         TPARSERSTATEACTION(TPS_InVerVersion),
1303         TPARSERSTATEACTION(TPS_InSVerVersion),
1304         TPARSERSTATEACTION(TPS_InVersionFirst),
1305         TPARSERSTATEACTION(TPS_InVersion),
1306         TPARSERSTATEACTION(TPS_InMantissaFirst),
1307         TPARSERSTATEACTION(TPS_InMantissaSign),
1308         TPARSERSTATEACTION(TPS_InMantissa),
1309         TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1310         TPARSERSTATEACTION(TPS_InXMLEntity),
1311         TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1312         TPARSERSTATEACTION(TPS_InXMLEntityNum),
1313         TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1314         TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1315         TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1316         TPARSERSTATEACTION(TPS_InTagFirst),
1317         TPARSERSTATEACTION(TPS_InXMLBegin),
1318         TPARSERSTATEACTION(TPS_InTagCloseFirst),
1319         TPARSERSTATEACTION(TPS_InTagName),
1320         TPARSERSTATEACTION(TPS_InTagBeginEnd),
1321         TPARSERSTATEACTION(TPS_InTag),
1322         TPARSERSTATEACTION(TPS_InTagEscapeK),
1323         TPARSERSTATEACTION(TPS_InTagEscapeKK),
1324         TPARSERSTATEACTION(TPS_InTagBackSleshed),
1325         TPARSERSTATEACTION(TPS_InTagEnd),
1326         TPARSERSTATEACTION(TPS_InCommentFirst),
1327         TPARSERSTATEACTION(TPS_InCommentLast),
1328         TPARSERSTATEACTION(TPS_InComment),
1329         TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1330         TPARSERSTATEACTION(TPS_InCloseCommentLast),
1331         TPARSERSTATEACTION(TPS_InCommentEnd),
1332         TPARSERSTATEACTION(TPS_InHostFirstDomain),
1333         TPARSERSTATEACTION(TPS_InHostDomainSecond),
1334         TPARSERSTATEACTION(TPS_InHostDomain),
1335         TPARSERSTATEACTION(TPS_InPortFirst),
1336         TPARSERSTATEACTION(TPS_InPort),
1337         TPARSERSTATEACTION(TPS_InHostFirstAN),
1338         TPARSERSTATEACTION(TPS_InHost),
1339         TPARSERSTATEACTION(TPS_InEmail),
1340         TPARSERSTATEACTION(TPS_InFileFirst),
1341         TPARSERSTATEACTION(TPS_InFileTwiddle),
1342         TPARSERSTATEACTION(TPS_InPathFirst),
1343         TPARSERSTATEACTION(TPS_InPathFirstFirst),
1344         TPARSERSTATEACTION(TPS_InPathSecond),
1345         TPARSERSTATEACTION(TPS_InFile),
1346         TPARSERSTATEACTION(TPS_InFileNext),
1347         TPARSERSTATEACTION(TPS_InURLPathFirst),
1348         TPARSERSTATEACTION(TPS_InURLPathStart),
1349         TPARSERSTATEACTION(TPS_InURLPath),
1350         TPARSERSTATEACTION(TPS_InFURL),
1351         TPARSERSTATEACTION(TPS_InProtocolFirst),
1352         TPARSERSTATEACTION(TPS_InProtocolSecond),
1353         TPARSERSTATEACTION(TPS_InProtocolEnd),
1354         TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1355         TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1356         TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1357         TPARSERSTATEACTION(TPS_InHyphenWord),
1358         TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1359         TPARSERSTATEACTION(TPS_InHyphenNumWord),
1360         TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1361         TPARSERSTATEACTION(TPS_InParseHyphen),
1362         TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1363         TPARSERSTATEACTION(TPS_InHyphenWordPart),
1364         TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1365         TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1366         TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1367 };
1368
1369
/*
 * TParserGet: scan the next token from the parser's input.
 *
 * Starting at the current byte position, this runs the state machine
 * described by Actions[].  For each character it looks up the rule list of
 * the current state and takes the first rule whose isclass test returns
 * nonzero, or the terminating rule whose isclass is NULL.  The rule's flags
 * then drive the stack of saved positions (A_PUSH / A_POP / A_CLEAR /
 * A_CLRALL / A_MERGE), token completion (A_BINGO) and re-examination of the
 * same character (A_RERUN).
 *
 * Returns true when an A_BINGO rule fires: prs->token points at the start of
 * the token, prs->lenbytetoken and prs->lenchartoken hold its length, and
 * prs->type holds the token type from the rule.  Returns false if the
 * position is already at or past prs->lenstr on entry, or if the loop ends
 * without an A_BINGO rule.
 */
1370 static bool
1371 TParserGet(TParser *prs)
1372 {
1373         const TParserStateActionItem *item = NULL;
1374
1375         Assert(prs->state);
1376
1377         if (prs->state->posbyte >= prs->lenstr)
1378                 return false;
1379
1380         prs->token = prs->str + prs->state->posbyte;
1381         prs->state->pushedAtAction = NULL;
1382
1383         /* look at string */
             /* "<=" allows one extra pass at end of input, with charlen 0 */
1384         while (prs->state->posbyte <= prs->lenstr)
1385         {
1386                 if (prs->state->posbyte == prs->lenstr)
1387                         prs->state->charlen = 0;
1388                 else
1389                         prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1390                                 pg_mblen(prs->str + prs->state->posbyte);
1391
1392                 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1393                 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1394                 Assert(Actions[prs->state->state].state == prs->state->state);
1395
1396                 if (prs->state->pushedAtAction)
1397                 {
1398                         /* After a POP, pick up at the next test */
1399                         item = prs->state->pushedAtAction + 1;
1400                         prs->state->pushedAtAction = NULL;
1401                 }
1402                 else
1403                 {
1404                         item = Actions[prs->state->state].action;
1405                         Assert(item != NULL);
1406                 }
1407
1408                 /* find action by character class */
                     /* a rule with isclass == NULL ends the scan and acts as the default */
1409                 while (item->isclass)
1410                 {
                             /* hand the rule's comparison character to the test function */
1411                         prs->c = item->c;
1412                         if (item->isclass(prs) != 0)
1413                                 break;
1414                         item++;
1415                 }
1416
1417 #ifdef WPARSER_TRACE
1418                 {
1419                         TParserPosition *ptr;
1420
1421                         fprintf(stderr, "state ");
1422                         /* indent according to stack depth */
1423                         for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1424                                 fprintf(stderr, "  ");
1425                         fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1426                         if (prs->state->posbyte < prs->lenstr)
1427                                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1428                         else
1429                                 fprintf(stderr, "at EOF");
1430                         fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1431                                         (int) (item - Actions[prs->state->state].action),
1432                                         (item->flags & A_BINGO) ? " BINGO" : "",
1433                                         (item->flags & A_POP) ? " POP" : "",
1434                                         (item->flags & A_PUSH) ? " PUSH" : "",
1435                                         (item->flags & A_RERUN) ? " RERUN" : "",
1436                                         (item->flags & A_CLEAR) ? " CLEAR" : "",
1437                                         (item->flags & A_MERGE) ? " MERGE" : "",
1438                                         (item->flags & A_CLRALL) ? " CLRALL" : "",
1439                                         (item->tostate != TPS_Null) ? " tostate " : "",
1440                                         (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1441                                         (item->type > 0) ? " type " : "",
1442                                         tok_alias[item->type]);
1443                 }
1444 #endif
1445
1446                 /* call special handler if exists */
1447                 if (item->special)
1448                         item->special(prs);
1449
1450                 /* BINGO, token is found */
1451                 if (item->flags & A_BINGO)
1452                 {
1453                         Assert(item->type > 0);
1454                         prs->lenbytetoken = prs->state->lenbytetoken;
1455                         prs->lenchartoken = prs->state->lenchartoken;
1456                         prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1457                         prs->type = item->type;
1458                 }
1459
1460                 /* do various actions by flags */
                     /*
                      * The stack flags are tested as an if/else-if chain, so at most
                      * one of them takes effect per rule, in the priority order
                      * POP, PUSH, CLEAR, CLRALL, MERGE.
                      */
1461                 if (item->flags & A_POP)
1462                 {                                               /* pop stored state in stack */
1463                         TParserPosition *ptr = prs->state->prev;
1464
1465                         pfree(prs->state);
1466                         prs->state = ptr;
1467                         Assert(prs->state);
1468                 }
1469                 else if (item->flags & A_PUSH)
1470                 {                                               /* push (store) state in stack */
1471                         prs->state->pushedAtAction = item;      /* remember where we push */
1472                         prs->state = newTParserPosition(prs->state);
1473                 }
1474                 else if (item->flags & A_CLEAR)
1475                 {                                               /* clear previous pushed state */
1476                         TParserPosition *ptr;
1477
1478                         Assert(prs->state->prev);
1479                         ptr = prs->state->prev->prev;
1480                         pfree(prs->state->prev);
1481                         prs->state->prev = ptr;
1482                 }
1483                 else if (item->flags & A_CLRALL)
1484                 {                                               /* clear all previous pushed state */
1485                         TParserPosition *ptr;
1486
1487                         while (prs->state->prev)
1488                         {
1489                                 ptr = prs->state->prev->prev;
1490                                 pfree(prs->state->prev);
1491                                 prs->state->prev = ptr;
1492                         }
1493                 }
1494                 else if (item->flags & A_MERGE)
1495                 {                                               /* merge posinfo with current and pushed state */
1496                         TParserPosition *ptr = prs->state;
1497
1498                         Assert(prs->state->prev);
1499                         prs->state = prs->state->prev;
1500
1501                         prs->state->posbyte = ptr->posbyte;
1502                         prs->state->poschar = ptr->poschar;
1503                         prs->state->charlen = ptr->charlen;
1504                         prs->state->lenbytetoken = ptr->lenbytetoken;
1505                         prs->state->lenchartoken = ptr->lenchartoken;
1506                         pfree(ptr);
1507                 }
1508
1509                 /* set new state if pointed */
                     /* applied after the stack action, i.e. to whichever position is now current */
1510                 if (item->tostate != TPS_Null)
1511                         prs->state->state = item->tostate;
1512
1513                 /* check for go away */
                     /* stop on a finished token, or at end of input unless the rule asks to rerun */
1514                 if ((item->flags & A_BINGO) ||
1515                         (prs->state->posbyte >= prs->lenstr &&
1516                          (item->flags & A_RERUN) == 0))
1517                         break;
1518
1519                 /* go to beginning of loop if we should rerun or we just restore state */
1520                 if (item->flags & (A_RERUN | A_POP))
1521                         continue;
1522
1523                 /* move forward */
1524                 if (prs->state->charlen)
1525                 {
1526                         prs->state->posbyte += prs->state->charlen;
1527                         prs->state->lenbytetoken += prs->state->charlen;
1528                         prs->state->poschar++;
1529                         prs->state->lenchartoken++;
1530                 }
1531         }
1532
1533         return (item && (item->flags & A_BINGO)) ? true : false;
1534 }
1535
1536 Datum
1537 prsd_lextype(PG_FUNCTION_ARGS)
1538 {
1539         LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1540         int                     i;
1541
1542         for (i = 1; i <= LASTNUM; i++)
1543         {
1544                 descr[i - 1].lexid = i;
1545                 descr[i - 1].alias = pstrdup(tok_alias[i]);
1546                 descr[i - 1].descr = pstrdup(lex_descr[i]);
1547         }
1548
1549         descr[LASTNUM].lexid = 0;
1550
1551         PG_RETURN_POINTER(descr);
1552 }
1553
1554 Datum
1555 prsd_start(PG_FUNCTION_ARGS)
1556 {
1557         PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1558 }
1559
1560 Datum
1561 prsd_nexttoken(PG_FUNCTION_ARGS)
1562 {
1563         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1564         char      **t = (char **) PG_GETARG_POINTER(1);
1565         int                *tlen = (int *) PG_GETARG_POINTER(2);
1566
1567         if (!TParserGet(p))
1568                 PG_RETURN_INT32(0);
1569
1570         *t = p->token;
1571         *tlen = p->lenbytetoken;
1572
1573         PG_RETURN_INT32(p->type);
1574 }
1575
1576 Datum
1577 prsd_end(PG_FUNCTION_ARGS)
1578 {
1579         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1580
1581         TParserClose(p);
1582         PG_RETURN_VOID();
1583 }
1584
/*
 * Token-type classification predicates.
 *
 * Every macro takes a token type code and yields nonzero when the code
 * belongs to the listed set.  The argument is evaluated several times, so
 * pass only expressions without side effects.
 */
#define LEAVETOKEN(x) \
	( (x) == SPACE )

#define COMPLEXTOKEN(x) \
	( (x) == URL_T || \
	  (x) == NUMHWORD || \
	  (x) == ASCIIHWORD || \
	  (x) == HWORD )

#define ENDPUNCTOKEN(x) \
	( (x) == SPACE )

#define TS_IDIGNORE(x) \
	( (x) == TAG_T || \
	  (x) == PROTOCOL || \
	  (x) == SPACE || \
	  (x) == XMLENTITY )

/* types whose words get "replace" set in a normal headline */
#define HLIDIGNORE(x) \
	( (x) == URL_T || \
	  (x) == TAG_T || \
	  (x) == NUMHWORD || \
	  (x) == ASCIIHWORD || \
	  (x) == HWORD )

/* same as HLIDIGNORE but without TAG_T; used in HighlightAll mode */
#define XMLHLIDIGNORE(x) \
	( (x) == URL_T || \
	  (x) == NUMHWORD || \
	  (x) == ASCIIHWORD || \
	  (x) == HWORD )

/* types that do not count toward the headline's word total */
#define NONWORDTOKEN(x) \
	( (x) == SPACE || HLIDIGNORE(x) )

/* types on which a headline fragment should not end */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || \
	  (x) == SCIENTIFIC || \
	  (x) == VERSIONNUMBER || \
	  (x) == DECIMAL || \
	  (x) == SIGNEDINT || \
	  (x) == UNSIGNEDINT || \
	  TS_IDIGNORE(x) )
1594
1595 typedef struct
1596 {
1597         HeadlineWordEntry *words;
1598         int                     len;
1599 } hlCheck;
1600
1601 static bool
1602 checkcondition_HL(void *checkval, QueryOperand *val)
1603 {
1604         int                     i;
1605
1606         for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1607         {
1608                 if (((hlCheck *) checkval)->words[i].item == val)
1609                         return true;
1610         }
1611         return false;
1612 }
1613
1614
1615 static bool
1616 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1617 {
1618         int                     i,
1619                                 j;
1620         QueryItem  *item = GETQUERY(query);
1621         int                     pos = *p;
1622
1623         *q = 0;
1624         *p = 0x7fffffff;
1625
1626         for (j = 0; j < query->size; j++)
1627         {
1628                 if (item->type != QI_VAL)
1629                 {
1630                         item++;
1631                         continue;
1632                 }
1633                 for (i = pos; i < prs->curwords; i++)
1634                 {
1635                         if (prs->words[i].item == &item->operand)
1636                         {
1637                                 if (i > *q)
1638                                         *q = i;
1639                                 break;
1640                         }
1641                 }
1642                 item++;
1643         }
1644
1645         if (*q == 0)
1646                 return false;
1647
1648         item = GETQUERY(query);
1649         for (j = 0; j < query->size; j++)
1650         {
1651                 if (item->type != QI_VAL)
1652                 {
1653                         item++;
1654                         continue;
1655                 }
1656                 for (i = *q; i >= pos; i--)
1657                 {
1658                         if (prs->words[i].item == &item->operand)
1659                         {
1660                                 if (i < *p)
1661                                         *p = i;
1662                                 break;
1663                         }
1664                 }
1665                 item++;
1666         }
1667
1668         if (*p <= *q)
1669         {
1670                 hlCheck         ch;
1671
1672                 ch.words = &(prs->words[*p]);
1673                 ch.len = *q - *p + 1;
1674                 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1675                         return true;
1676                 else
1677                 {
1678                         (*p)++;
1679                         return hlCover(prs, query, p, q);
1680                 }
1681         }
1682
1683         return false;
1684 }
1685
1686 Datum
1687 prsd_headline(PG_FUNCTION_ARGS)
1688 {
1689         HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1690         List       *prsoptions = (List *) PG_GETARG_POINTER(1);
1691         TSQuery         query = PG_GETARG_TSQUERY(2);
1692
1693         /* from opt + start and and tag */
1694         int                     min_words = 15;
1695         int                     max_words = 35;
1696         int                     shortword = 3;
1697
1698         int                     p = 0,
1699                                 q = 0;
1700         int                     bestb = -1,
1701                                 beste = -1;
1702         int                     bestlen = -1;
1703         int                     pose = 0,
1704                                 posb,
1705                                 poslen,
1706                                 curlen;
1707
1708         int                     i;
1709         int                     highlight = 0;
1710         ListCell   *l;
1711
1712         /* config */
1713         prs->startsel = NULL;
1714         prs->stopsel = NULL;
1715         foreach(l, prsoptions)
1716         {
1717                 DefElem    *defel = (DefElem *) lfirst(l);
1718                 char       *val = defGetString(defel);
1719
1720                 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1721                         max_words = pg_atoi(val, sizeof(int32), 0);
1722                 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1723                         min_words = pg_atoi(val, sizeof(int32), 0);
1724                 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1725                         shortword = pg_atoi(val, sizeof(int32), 0);
1726                 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1727                         prs->startsel = pstrdup(val);
1728                 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1729                         prs->stopsel = pstrdup(val);
1730                 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1731                         highlight = (pg_strcasecmp(val, "1") == 0 ||
1732                                                  pg_strcasecmp(val, "on") == 0 ||
1733                                                  pg_strcasecmp(val, "true") == 0 ||
1734                                                  pg_strcasecmp(val, "t") == 0 ||
1735                                                  pg_strcasecmp(val, "y") == 0 ||
1736                                                  pg_strcasecmp(val, "yes") == 0);
1737                 else
1738                         ereport(ERROR,
1739                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1740                                          errmsg("unrecognized headline parameter: \"%s\"",
1741                                                         defel->defname)));
1742         }
1743
1744         if (highlight == 0)
1745         {
1746                 if (min_words >= max_words)
1747                         ereport(ERROR,
1748                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1749                                          errmsg("MinWords should be less than MaxWords")));
1750                 if (min_words <= 0)
1751                         ereport(ERROR,
1752                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1753                                          errmsg("MinWords should be positive")));
1754                 if (shortword < 0)
1755                         ereport(ERROR,
1756                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1757                                          errmsg("ShortWord should be >= 0")));
1758
1759                 while (hlCover(prs, query, &p, &q))
1760                 {
1761                         /* find cover len in words */
1762                         curlen = 0;
1763                         poslen = 0;
1764                         for (i = p; i <= q && curlen < max_words; i++)
1765                         {
1766                                 if (!NONWORDTOKEN(prs->words[i].type))
1767                                         curlen++;
1768                                 if (prs->words[i].item && !prs->words[i].repeated)
1769                                         poslen++;
1770                                 pose = i;
1771                         }
1772
1773                         if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1774                         {
1775                                 /* best already finded, so try one more cover */
1776                                 p++;
1777                                 continue;
1778                         }
1779
1780                         posb = p;
1781                         if (curlen < max_words)
1782                         {                                       /* find good end */
1783                                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1784                                 {
1785                                         if (i != q)
1786                                         {
1787                                                 if (!NONWORDTOKEN(prs->words[i].type))
1788                                                         curlen++;
1789                                                 if (prs->words[i].item && !prs->words[i].repeated)
1790                                                         poslen++;
1791                                         }
1792                                         pose = i;
1793                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1794                                                 continue;
1795                                         if (curlen >= min_words)
1796                                                 break;
1797                                 }
1798                                 if (curlen < min_words && i >= prs->curwords)
1799                                 {                               /* got end of text and our cover is shoter
1800                                                                  * than min_words */
1801                                         for (i = p; i >= 0; i--)
1802                                         {
1803                                                 if (!NONWORDTOKEN(prs->words[i].type))
1804                                                         curlen++;
1805                                                 if (prs->words[i].item && !prs->words[i].repeated)
1806                                                         poslen++;
1807                                                 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1808                                                         continue;
1809                                                 if (curlen >= min_words)
1810                                                         break;
1811                                         }
1812                                         posb = (i >= 0) ? i : 0;
1813                                 }
1814                         }
1815                         else
1816                         {                                       /* shorter cover :((( */
1817                                 for (; curlen > min_words; i--)
1818                                 {
1819                                         if (!NONWORDTOKEN(prs->words[i].type))
1820                                                 curlen--;
1821                                         if (prs->words[i].item && !prs->words[i].repeated)
1822                                                 poslen--;
1823                                         pose = i;
1824                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1825                                                 continue;
1826                                         break;
1827                                 }
1828                         }
1829
1830                         if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1831                                 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1832                                  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
1833                         {
1834                                 bestb = posb;
1835                                 beste = pose;
1836                                 bestlen = poslen;
1837                         }
1838
1839                         p++;
1840                 }
1841
1842                 if (bestlen < 0)
1843                 {
1844                         curlen = 0;
1845                         for (i = 0; i < prs->curwords && curlen < min_words; i++)
1846                         {
1847                                 if (!NONWORDTOKEN(prs->words[i].type))
1848                                         curlen++;
1849                                 pose = i;
1850                         }
1851                         bestb = 0;
1852                         beste = pose;
1853                 }
1854         }
1855         else
1856         {
1857                 bestb = 0;
1858                 beste = prs->curwords - 1;
1859         }
1860
1861         for (i = bestb; i <= beste; i++)
1862         {
1863                 if (prs->words[i].item)
1864                         prs->words[i].selected = 1;
1865                 if (highlight == 0)
1866                 {
1867                         if (HLIDIGNORE(prs->words[i].type))
1868                                 prs->words[i].replace = 1;
1869                 }
1870                 else
1871                 {
1872                         if (XMLHLIDIGNORE(prs->words[i].type))
1873                                 prs->words[i].replace = 1;
1874                 }
1875
1876                 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
1877         }
1878
1879         if (!prs->startsel)
1880                 prs->startsel = pstrdup("<b>");
1881         if (!prs->stopsel)
1882                 prs->stopsel = pstrdup("</b>");
1883         prs->startsellen = strlen(prs->startsel);
1884         prs->stopsellen = strlen(prs->stopsel);
1885
1886         PG_RETURN_POINTER(prs);
1887 }