]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/wparser_def.c
b80175456d2ee3ef0699eef2f3eeeac58c24bff0
[postgresql] / src / backend / tsearch / wparser_def.c
1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  *              Default text search parser
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23
24
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
27
28
/*
 * Output token categories.
 *
 * The numbers start at 1 and run without gaps up to LASTNUM.  The
 * tok_alias[] and lex_descr[] tables that follow list one string per
 * category in this same order, with an empty placeholder in slot 0.
 */

#define ASCIIWORD               1
#define WORD_T                  2
#define NUMWORD                 3
#define EMAIL                   4
#define URL_T                   5
#define HOST                    6
#define SCIENTIFIC              7
#define VERSIONNUMBER   8
#define NUMPARTHWORD    9
#define PARTHWORD               10
#define ASCIIPARTHWORD  11
#define SPACE                   12
#define TAG_T                   13
#define PROTOCOL                14
#define NUMHWORD                15
#define ASCIIHWORD              16
#define HWORD                   17
#define URLPATH                 18
#define FILEPATH                19
#define DECIMAL                 20
#define SIGNEDINT               21
#define UNSIGNEDINT             22
#define XMLENTITY               23

/* Highest category number in use (currently the same value as XMLENTITY). */
#define LASTNUM                 23
56
/*
 * Short alias for each token category, listed in category-number order.
 * Slot 0 is an empty placeholder because the category numbers start at 1,
 * so the table has LASTNUM + 1 entries.
 */
static const char *const tok_alias[] = {
        "",
        "asciiword",
        "word",
        "numword",
        "email",
        "url",
        "host",
        "sfloat",
        "version",
        "hword_numpart",
        "hword_part",
        "hword_asciipart",
        "blank",
        "tag",
        "protocol",
        "numhword",
        "asciihword",
        "hword",
        "url_path",
        "file",
        "float",
        "int",
        "uint",
        "entity"
};
83
/*
 * Human-readable description for each token category, listed in
 * category-number order (parallel to tok_alias[]).  Slot 0 is an empty
 * placeholder because the category numbers start at 1.
 */
static const char *const lex_descr[] = {
        "",
        "Word, all ASCII",
        "Word, all letters",
        "Word, letters and digits",
        "Email address",
        "URL",
        "Host",
        "Scientific notation",
        "Version number",
        "Hyphenated word part, letters and digits",
        "Hyphenated word part, all letters",
        "Hyphenated word part, all ASCII",
        "Space symbols",
        "XML tag",
        "Protocol head",
        "Hyphenated word, letters and digits",
        "Hyphenated word, all ASCII",
        "Hyphenated word, all letters",
        "URL path",
        "File or path name",
        "Decimal notation",
        "Signed integer",
        "Unsigned integer",
        "XML entity"
};
110
111
/*
 * Parser states.
 *
 * TPS_Base (0) is the state a freshly initialised parser starts in (see
 * TParserInit).  Each state has an actionTPS_<name>[] table further down.
 * TPS_Null is not a real state: it only marks the end of the list and is
 * used in the action tables as a "tostate" placeholder.
 *
 * NOTE(review): the enumerator order is presumably relied upon by a
 * per-state lookup table outside this excerpt -- confirm before reordering.
 * "BackSleshed" is a historical misspelling of "BackSlashed"; the identifier
 * is kept as is because other code refers to it.
 */

typedef enum
{
        TPS_Base = 0,
        TPS_InNumWord,
        TPS_InAsciiWord,
        TPS_InWord,
        TPS_InUnsignedInt,
        TPS_InSignedIntFirst,
        TPS_InSignedInt,
        TPS_InSpace,
        TPS_InUDecimalFirst,
        TPS_InUDecimal,
        TPS_InDecimalFirst,
        TPS_InDecimal,
        TPS_InVerVersion,
        TPS_InSVerVersion,
        TPS_InVersionFirst,
        TPS_InVersion,
        TPS_InMantissaFirst,
        TPS_InMantissaSign,
        TPS_InMantissa,
        TPS_InXMLEntityFirst,
        TPS_InXMLEntity,
        TPS_InXMLEntityNumFirst,
        TPS_InXMLEntityNum,
        TPS_InXMLEntityHexNumFirst,
        TPS_InXMLEntityHexNum,
        TPS_InXMLEntityEnd,
        TPS_InTagFirst,
        TPS_InXMLBegin,
        TPS_InTagCloseFirst,
        TPS_InTagName,
        TPS_InTagBeginEnd,
        TPS_InTag,
        TPS_InTagEscapeK,
        TPS_InTagEscapeKK,
        TPS_InTagBackSleshed,
        TPS_InTagEnd,
        TPS_InCommentFirst,
        TPS_InCommentLast,
        TPS_InComment,
        TPS_InCloseCommentFirst,
        TPS_InCloseCommentLast,
        TPS_InCommentEnd,
        TPS_InHostFirstDomain,
        TPS_InHostDomainSecond,
        TPS_InHostDomain,
        TPS_InPortFirst,
        TPS_InPort,
        TPS_InHostFirstAN,
        TPS_InHost,
        TPS_InEmail,
        TPS_InFileFirst,
        TPS_InFileTwiddle,
        TPS_InPathFirst,
        TPS_InPathFirstFirst,
        TPS_InPathSecond,
        TPS_InFile,
        TPS_InFileNext,
        TPS_InURLPathFirst,
        TPS_InURLPathStart,
        TPS_InURLPath,
        TPS_InFURL,
        TPS_InProtocolFirst,
        TPS_InProtocolSecond,
        TPS_InProtocolEnd,
        TPS_InHyphenAsciiWordFirst,
        TPS_InHyphenAsciiWord,
        TPS_InHyphenWordFirst,
        TPS_InHyphenWord,
        TPS_InHyphenNumWordFirst,
        TPS_InHyphenNumWord,
        TPS_InHyphenDigitLookahead,
        TPS_InParseHyphen,
        TPS_InParseHyphenHyphen,
        TPS_InHyphenWordPart,
        TPS_InHyphenAsciiWordPart,
        TPS_InHyphenNumWordPart,
        TPS_InHyphenUnsignedInt,
        TPS_Null                                        /* last state (fake value) */
} TParserState;
195
/* forward declaration, so the callback types below can take a TParser */
struct TParser;

/*
 * Character test callback: any of the p_is* functions except p_iseq (which
 * takes an extra char argument and therefore has a different signature).
 */
typedef int (*TParserCharTest) (struct TParser *);

/*
 * Optional handler attached to an action row for special cases (the
 * Special* functions in this file have this signature).
 */
typedef void (*TParserSpecial) (struct TParser *);
203
/*
 * One row of a state's action table.
 *
 * The tables below initialise rows positionally, so the field order here
 * must not change.
 */
typedef struct
{
        TParserCharTest isclass;        /* test to apply; NULL in catch-all rows */
        char            c;              /* char argument; p_iseqC/p_isneC compare
                                         * against TParser.c -- presumably copied
                                         * from here by TParserGet (not shown) */
        uint16          flags;          /* OR of the A_* bits defined below */
        TParserState tostate;           /* target state; TPS_Null in rows that
                                         * name no target */
        int                     type;   /* token category, 0 when none is given */
        TParserSpecial special;         /* optional special handler, or NULL */
} TParserStateActionItem;
213
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it stands for "no flag bit set"; the remaining values
 * are distinct bits and may be OR-ed together (the tables use combinations
 * such as A_NEXT | A_CLEAR and A_BINGO | A_CLRALL).  How each bit is acted
 * upon is decided by TParserGet(), which is not part of this excerpt.
 */
#define A_NEXT          0x0000
#define A_BINGO         0x0001
#define A_POP           0x0002
#define A_PUSH          0x0004
#define A_RERUN         0x0008
#define A_CLEAR         0x0010
#define A_MERGE         0x0020
#define A_CLRALL        0x0040
223
/*
 * One entry of the parser's position list.  Entries are chained through
 * "prev"; TParser.state points at the newest one and TParserClose() frees
 * the whole chain.
 */
typedef struct TParserPosition
{
        int                     posbyte;                /* position of parser in bytes */
        int                     poschar;                /* position of parser in characters */
        int                     charlen;                /* length of current char */
        int                     lenbytetoken;   /* length of token-so-far in bytes */
        int                     lenchartoken;   /* and in chars */
        TParserState state;                     /* automaton state at this position */
        struct TParserPosition *prev;   /* older entry, or NULL for the oldest */
        const TParserStateActionItem *pushedAtAction;   /* reset to NULL by
                                                         * newTParserPosition(); presumably
                                                         * the row that pushed this entry --
                                                         * confirm in TParserGet */
} TParserPosition;
235
/*
 * Complete parser state for one input string.
 */
typedef struct TParser
{
        /* string and position information */
        char       *str;                        /* multibyte string (not copied by
                                                 * TParserInit) */
        int                     lenstr;                 /* length of mbstring */
#ifdef TS_USE_WIDE
        wchar_t    *wstr;                       /* wide character string */
        int                     lenwstr;                /* length of wstring */
#endif

        /* State of parse */
        int                     charmaxlen;     /* pg_database_encoding_max_length() */
        bool            usewide;                /* true if wstr is used for char tests */
        TParserPosition *state;         /* newest position entry */
        bool            ignore;                 /* toggled by SpecialTags(), tested by
                                                 * p_isignore() */
        bool            wanthost;               /* set by SpecialFURL(), consumed by
                                                 * p_isstophost() */

        /* silly char */
        char            c;                      /* character p_iseqC/p_isneC compare with */

        /* out */
        char       *token;
        int                     lenbytetoken;
        int                     lenchartoken;
        int                     type;
} TParser;
262
263
/*
 * Forward declaration: p_ishost() and p_isURLPath() below call TParserGet()
 * on a temporary parser before its definition appears.
 */
static bool TParserGet(TParser *prs);
266
267
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
270 {
271         TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
272
273         if (prev)
274                 memcpy(res, prev, sizeof(TParserPosition));
275         else
276                 memset(res, 0, sizeof(TParserPosition));
277
278         res->prev = prev;
279
280         res->pushedAtAction = NULL;
281
282         return res;
283 }
284
285 static TParser *
286 TParserInit(char *str, int len)
287 {
288         TParser    *prs = (TParser *) palloc0(sizeof(TParser));
289
290         prs->charmaxlen = pg_database_encoding_max_length();
291         prs->str = str;
292         prs->lenstr = len;
293
294 #ifdef TS_USE_WIDE
295
296         /*
297          * Use wide char code only when max encoding length > 1.
298          */
299         if (prs->charmaxlen > 1)
300         {
301                 prs->usewide = true;
302                 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303                 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304                                                                   prs->str, prs->lenstr);
305         }
306         else
307 #endif
308                 prs->usewide = false;
309
310         prs->state = newTParserPosition(NULL);
311         prs->state->state = TPS_Base;
312
313 #ifdef WPARSER_TRACE
314         fprintf(stderr, "parsing \"%.*s\"\n", len, str);
315 #endif
316
317         return prs;
318 }
319
320 static void
321 TParserClose(TParser *prs)
322 {
323         while (prs->state)
324         {
325                 TParserPosition *ptr = prs->state->prev;
326
327                 pfree(prs->state);
328                 prs->state = ptr;
329         }
330
331 #ifdef TS_USE_WIDE
332         if (prs->wstr)
333                 pfree(prs->wstr);
334 #endif
335
336         pfree(prs);
337 }
338
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale.  Note that with a
 * multibyte encoding and the C locale, the isw* functions may fail or
 * give wrong results.  Note 2: the combination of a multibyte encoding
 * and the C locale is often used for Asian languages.
 */
346
347 #ifdef TS_USE_WIDE
348
/*
 * p_iswhat(type) defines p_is<type>() and its negation p_isnot<type>() for
 * the character at the parser's current position.
 *
 * Wide-character path:
 *   - The character is read as a wchar_t and converted by value to wint_t.
 *     (Reading it through a wint_t pointer is wrong whenever wint_t and
 *     wchar_t differ in size, and breaks aliasing rules in any case.)
 *   - In the C locale only ASCII is classified.  Anything above 0x7f yields
 *     0 instead of being masked down to its low byte, which used to make
 *     e.g. U+0130 look like the digit '0'.  For values 0x80..0xff the
 *     result is unchanged, because the C-locale is* functions reject them.
 *
 * Single-byte path: the byte at posbyte is classified with is<type>().
 *
 * In this build variant the macro is only instantiated for digit, lower,
 * print, punct, space, upper and xdigit; alnum and alpha are hand-written
 * because they treat non-ASCII characters as letters.
 */
#define p_iswhat(type)                                                                  \
static int                                                                              \
p_is##type(TParser *prs) {                                                              \
        Assert( prs->state );                                                           \
        if ( prs->usewide )                                                             \
        {                                                                               \
                wchar_t wc = *( prs->wstr + prs->state->poschar );                      \
                                                                                        \
                if ( lc_ctype_is_c() )                                                  \
                {                                                                       \
                        unsigned int c = wc;                                            \
                                                                                        \
                        if ( c > 0x7f )                                                 \
                                return 0;                                               \
                        return is##type( (int) c );                                     \
                }                                                                       \
                                                                                        \
                return isw##type( (wint_t) wc );                                        \
        }                                                                               \
                                                                                        \
        return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );         \
}                                                                                       \
                                                                                        \
static int                                                                              \
p_isnot##type(TParser *prs) {                                                           \
        return !p_is##type(prs);                                                        \
}
368
369 static int
370 p_isalnum(TParser *prs)
371 {
372         Assert(prs->state);
373
374         if (prs->usewide)
375         {
376                 if (lc_ctype_is_c())
377                 {
378                         unsigned int c = *(prs->wstr + prs->state->poschar);
379
380                         /*
381                          * any non-ascii symbol with multibyte encoding with C-locale is
382                          * an alpha character
383                          */
384                         if (c > 0x7f)
385                                 return 1;
386
387                         return isalnum(0xff & c);
388                 }
389
390                 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
391         }
392
393         return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
394 }
395 static int
396 p_isnotalnum(TParser *prs)
397 {
398         return !p_isalnum(prs);
399 }
400
401 static int
402 p_isalpha(TParser *prs)
403 {
404         Assert(prs->state);
405
406         if (prs->usewide)
407         {
408                 if (lc_ctype_is_c())
409                 {
410                         unsigned int c = *(prs->wstr + prs->state->poschar);
411
412                         /*
413                          * any non-ascii symbol with multibyte encoding with C-locale is
414                          * an alpha character
415                          */
416                         if (c > 0x7f)
417                                 return 1;
418
419                         return isalpha(0xff & c);
420                 }
421
422                 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
423         }
424
425         return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
426 }
427
428 static int
429 p_isnotalpha(TParser *prs)
430 {
431         return !p_isalpha(prs);
432 }
433
434 /* p_iseq should be used only for ascii symbols */
435
436 static int
437 p_iseq(TParser *prs, char c)
438 {
439         Assert(prs->state);
440         return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
441 }
442 #else                                                   /* TS_USE_WIDE */
443
/*
 * Build without wide-character support: p_iswhat(type) defines
 * p_is<type>(), which classifies the byte at the current position with
 * is<type>(), and p_isnot<type>(), its negation.
 */
#define p_iswhat(type)                                                          \
static int                                                                      \
p_is##type(TParser *prs) {                                                      \
        unsigned char curbyte;                                                  \
                                                                                \
        Assert( prs->state );                                                   \
        curbyte = (unsigned char) prs->str[prs->state->posbyte];                \
        return is##type( curbyte );                                             \
}                                                                               \
                                                                                \
static int                                                                      \
p_isnot##type(TParser *prs) {                                                   \
        return p_is##type(prs) ? 0 : 1;                                         \
}
455
456
457 static int
458 p_iseq(TParser *prs, char c)
459 {
460         Assert(prs->state);
461         return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
462 }
463
/*
 * Without wide-character support, alnum and alpha are generated by the
 * macro as well (the wide build has hand-written versions instead).
 */
p_iswhat(alnum)
p_iswhat(alpha)
466 #endif   /* TS_USE_WIDE */
467
/*
 * Classification tests generated in both build variants; each line defines
 * p_is<type>() and p_isnot<type>().
 */
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)
475
476 static int
477 p_isEOF(TParser *prs)
478 {
479         Assert(prs->state);
480         return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
481 }
482
483 static int
484 p_iseqC(TParser *prs)
485 {
486         return p_iseq(prs, prs->c);
487 }
488
489 static int
490 p_isneC(TParser *prs)
491 {
492         return !p_iseq(prs, prs->c);
493 }
494
495 static int
496 p_isascii(TParser *prs)
497 {
498         return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
499 }
500
501 static int
502 p_isasclet(TParser *prs)
503 {
504         return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
505 }
506
507
/*
 * Reference every generated p_is* / p_isnot* helper once so that compilers
 * do not warn about unused static functions.  The function exists only for
 * that purpose; the NULL arguments mean it must never actually be called.
 */
void            _make_compiler_happy(void);

void
_make_compiler_happy(void)
{
        (void) p_isalnum(NULL);
        (void) p_isnotalnum(NULL);
        (void) p_isalpha(NULL);
        (void) p_isnotalpha(NULL);
        (void) p_isdigit(NULL);
        (void) p_isnotdigit(NULL);
        (void) p_islower(NULL);
        (void) p_isnotlower(NULL);
        (void) p_isprint(NULL);
        (void) p_isnotprint(NULL);
        (void) p_ispunct(NULL);
        (void) p_isnotpunct(NULL);
        (void) p_isspace(NULL);
        (void) p_isnotspace(NULL);
        (void) p_isupper(NULL);
        (void) p_isnotupper(NULL);
        (void) p_isxdigit(NULL);
        (void) p_isnotxdigit(NULL);
        (void) p_isEOF(NULL);
        (void) p_iseqC(NULL);
        (void) p_isneC(NULL);
}
535
536
537 static void
538 SpecialTags(TParser *prs)
539 {
540         switch (prs->state->lenchartoken)
541         {
542                 case 8:                 /* </script */
543                         if (pg_strncasecmp(prs->token, "</script", 8) == 0)
544                                 prs->ignore = false;
545                         break;
546                 case 7:                 /* <script || </style */
547                         if (pg_strncasecmp(prs->token, "</style", 7) == 0)
548                                 prs->ignore = false;
549                         else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
550                                 prs->ignore = true;
551                         break;
552                 case 6:                 /* <style */
553                         if (pg_strncasecmp(prs->token, "<style", 6) == 0)
554                                 prs->ignore = true;
555                         break;
556                 default:
557                         break;
558         }
559 }
560
561 static void
562 SpecialFURL(TParser *prs)
563 {
564         prs->wanthost = true;
565         prs->state->posbyte -= prs->state->lenbytetoken;
566         prs->state->poschar -= prs->state->lenchartoken;
567 }
568
569 static void
570 SpecialHyphen(TParser *prs)
571 {
572         prs->state->posbyte -= prs->state->lenbytetoken;
573         prs->state->poschar -= prs->state->lenchartoken;
574 }
575
576 static void
577 SpecialVerVersion(TParser *prs)
578 {
579         prs->state->posbyte -= prs->state->lenbytetoken;
580         prs->state->poschar -= prs->state->lenchartoken;
581         prs->state->lenbytetoken = 0;
582         prs->state->lenchartoken = 0;
583 }
584
585 static int
586 p_isstophost(TParser *prs)
587 {
588         if (prs->wanthost)
589         {
590                 prs->wanthost = false;
591                 return 1;
592         }
593         return 0;
594 }
595
596 static int
597 p_isignore(TParser *prs)
598 {
599         return (prs->ignore) ? 1 : 0;
600 }
601
602 static int
603 p_ishost(TParser *prs)
604 {
605         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
606         int                     res = 0;
607
608         if (TParserGet(tmpprs) && tmpprs->type == HOST)
609         {
610                 prs->state->posbyte += tmpprs->lenbytetoken;
611                 prs->state->poschar += tmpprs->lenchartoken;
612                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613                 prs->state->lenchartoken += tmpprs->lenchartoken;
614                 prs->state->charlen = tmpprs->state->charlen;
615                 res = 1;
616         }
617         TParserClose(tmpprs);
618
619         return res;
620 }
621
622 static int
623 p_isURLPath(TParser *prs)
624 {
625         TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
626         int                     res = 0;
627
628         tmpprs->state = newTParserPosition(tmpprs->state);
629         tmpprs->state->state = TPS_InFileFirst;
630
631         if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
632         {
633                 prs->state->posbyte += tmpprs->lenbytetoken;
634                 prs->state->poschar += tmpprs->lenchartoken;
635                 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636                 prs->state->lenchartoken += tmpprs->lenchartoken;
637                 prs->state->charlen = tmpprs->state->charlen;
638                 res = 1;
639         }
640         TParserClose(tmpprs);
641
642         return res;
643 }
644
645 /*
646  * Table of state/action of parser
647  */
648
/*
 * Actions for TPS_Base, the state a new parser starts in.
 *
 * Row layout (see TParserStateActionItem):
 *   {isclass, c, flags, tostate, type, special}
 * A NULL isclass marks the final catch-all row.  Rows are presumably tried
 * top to bottom by TParserGet (not shown), so order matters: e.g. the '<'
 * test comes before p_isignore, and p_isasclet before the broader p_isalpha.
 */
static const TParserStateActionItem actionTPS_Base[] = {
        {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
        {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
        {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
        {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
        {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
        {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
        {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
};
664
665
/*
 * Actions for TPS_InNumWord (a word of letters and digits).  Alphanumerics
 * keep the state; '@', '/', '.' and '-' push the email, file and
 * hyphenated-word states; EOF or anything else ends the token as NUMWORD.
 */
static const TParserStateActionItem actionTPS_InNumWord[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
        {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
        {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
675
/*
 * Actions for TPS_InAsciiWord (a word of ASCII letters only).
 *
 * Several tests deliberately appear twice with different targets ('.', '-'
 * and p_isdigit).  NOTE(review): the first of each pair uses A_PUSH, so the
 * second is presumably the alternative tried after the pushed attempt is
 * popped -- confirm in TParserGet.  Do not "deduplicate" these rows.
 */
static const TParserStateActionItem actionTPS_InAsciiWord[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
        {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
        {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
        {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
        {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
691
/*
 * Actions for TPS_InWord (a word of letters, not all ASCII).  A digit
 * switches to TPS_InNumWord, '-' pushes the hyphenated-word state, and EOF
 * or anything else ends the token as WORD_T.
 */
static const TParserStateActionItem actionTPS_InWord[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
        {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
        {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
699
/*
 * Actions for TPS_InUnsignedInt (a run of digits).
 *
 * '.' has two A_PUSH rows (host domain, then unsigned decimal); the 'e'/'E'
 * rows come before the generic letter rows so that an exponent is attempted
 * first.  EOF or anything unmatched ends the token as UNSIGNEDINT.
 */
static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
        {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
        {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
        {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
        {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
712
/*
 * Actions for TPS_InSignedIntFirst, pushed from TPS_Base on '+' or '-'.
 * Only a digit continues (into TPS_InSignedInt, with A_CLEAR); EOF or any
 * other character takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
718
/*
 * Actions for TPS_InSignedInt (sign followed by digits).  '.' pushes the
 * decimal-fraction state, 'e'/'E' push the exponent state, and EOF or
 * anything else ends the token as SIGNEDINT.
 */
static const TParserStateActionItem actionTPS_InSignedInt[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
        {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
        {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
};
727
/*
 * Actions for TPS_InSpace (blank / non-word material).
 *
 * '<', '-', '+', '&' and '/' end the SPACE token; these are characters that
 * TPS_Base has dedicated rows for.  The p_isignore row sits after the '<'
 * row, so a '<' still ends the token while the ignore flag is set.  Other
 * non-alphanumeric characters extend the token.
 */
static const TParserStateActionItem actionTPS_InSpace[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
        {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
        {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
        {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
        {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
        {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
        {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
        {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
};
739
/*
 * Actions for TPS_InUDecimalFirst, pushed from TPS_InUnsignedInt on '.'.
 * A digit continues into TPS_InUDecimal (A_CLEAR); EOF or anything else
 * takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
745
/*
 * Actions for TPS_InUDecimal (unsigned number with a fraction).  A further
 * '.' pushes the version-number state, 'e'/'E' push the exponent state, and
 * EOF or anything else ends the token as DECIMAL.
 */
static const TParserStateActionItem actionTPS_InUDecimal[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
        {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
754
/*
 * Actions for TPS_InDecimalFirst, pushed from TPS_InSignedInt on '.'.
 * A digit continues into TPS_InDecimal (A_CLEAR); EOF or anything else
 * takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
760
/*
 * Actions for TPS_InDecimal (signed number with a fraction).  A further '.'
 * pushes TPS_InVerVersion, 'e'/'E' push the exponent state, and EOF or
 * anything else ends the token as DECIMAL.
 */
static const TParserStateActionItem actionTPS_InDecimal[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
        {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
769
/*
 * Actions for TPS_InVerVersion, pushed from TPS_InDecimal on a second '.'.
 * On a digit the row runs SpecialVerVersion (which rewinds the position by
 * the token length and zeroes the token length counters) and moves to
 * TPS_InSVerVersion with A_RERUN.  EOF or anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InVerVersion[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
775
/*
 * Actions for TPS_InSVerVersion, entered from TPS_InVerVersion after the
 * rewind.  A digit produces a SPACE token (A_BINGO | A_CLRALL) and continues
 * in TPS_InUnsignedInt; any other character is stepped over with A_NEXT.
 */
static const TParserStateActionItem actionTPS_InSVerVersion[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
        {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
};
781
782
/*
 * Actions for TPS_InVersionFirst, pushed on '.' from TPS_InUDecimal and
 * TPS_InVersion.  A digit continues into TPS_InVersion (A_CLEAR); EOF or
 * anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InVersionFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
788
/*
 * Actions for TPS_InVersion (dotted number such as 1.2.3).  Each further '.'
 * pushes TPS_InVersionFirst again; EOF or anything else ends the token as
 * VERSIONNUMBER.
 */
static const TParserStateActionItem actionTPS_InVersion[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
        {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
};
795
/*
 * Actions for TPS_InMantissaFirst, pushed after an 'e' or 'E' in a number.
 * (Despite the state name, what follows the 'e' is the exponent.)  A digit
 * goes straight to TPS_InMantissa, a sign goes to TPS_InMantissaSign, and
 * EOF or anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
        {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
        {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
803
/*
 * Actions for TPS_InMantissaSign, entered after the sign that follows
 * 'e'/'E'.  A digit is required (A_CLEAR, on to TPS_InMantissa); EOF or
 * anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InMantissaSign[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
809
/*
 * Actions for TPS_InMantissa (digits after 'e'/'E' and an optional sign).
 * Digits extend the token; EOF or anything else ends it as SCIENTIFIC.
 */
static const TParserStateActionItem actionTPS_InMantissa[] = {
        {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
        {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
};
815
/*
 * Actions for TPS_InXMLEntityFirst, pushed from TPS_Base on '&'.  '#' starts
 * a numeric reference; an ASCII letter, ':' or '_' starts a named entity;
 * EOF or anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
        {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
        {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
        {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
824
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827         {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829         {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830         {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
831         {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
832         {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
833         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
834         {NULL, 0, A_POP, TPS_Null, 0, NULL}
835 };
836
/*
 * Actions for TPS_InXMLEntityNumFirst, entered after "&#".  A lowercase 'x'
 * selects the hexadecimal form, a digit the decimal form; EOF or anything
 * else (including an uppercase 'X') takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
843
/*
 * Actions for TPS_InXMLEntityHexNumFirst, entered after "&#x".  At least one
 * hex digit is required; EOF or anything else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
849
/*
 * Actions for TPS_InXMLEntityNum (decimal digits of "&#NNN;").  Digits
 * extend the reference, ';' moves to TPS_InXMLEntityEnd, and EOF or anything
 * else takes the A_POP rows.
 */
static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
        {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
        {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
        {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
        {NULL, 0, A_POP, TPS_Null, 0, NULL}
};
856
/*
 * Rules for TPS_InXMLEntityHexNum: stay while p_isxdigit matches; ';' moves
 * to TPS_InXMLEntityEnd; p_isEOF or anything else pops.
 */
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
858         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
859         {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
860         {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
861         {NULL, 0, A_POP, TPS_Null, 0, NULL}
862 };
863
/*
 * Rules for TPS_InXMLEntityEnd: a single unconditional rule that emits an
 * XMLENTITY token, discards one pushed state (A_CLEAR) and returns to
 * TPS_Base.
 */
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
865         {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
866 };
867
/*
 * Rules for TPS_InTagFirst: '/' tries TPS_InTagCloseFirst, '!' tries
 * TPS_InCommentFirst, '?' tries TPS_InXMLBegin and a p_isasclet character
 * tries TPS_InTagName.  All four are A_PUSH rules, so if the attempt later
 * pops, matching resumes here with the rule following the one that pushed.
 * p_isEOF or anything else pops.
 */
868 static const TParserStateActionItem actionTPS_InTagFirst[] = {
869         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
870         {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
871         {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
872         {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
873         {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
874         {NULL, 0, A_POP, TPS_Null, 0, NULL}
875 };
876
/*
 * Rules for TPS_InXMLBegin: only 'x' is accepted, after which the rest is
 * handled by TPS_InTag; p_isEOF or anything else pops.
 */
877 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
878         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
879         /* <?xml ... */
880     /* XXX do we wants states for the m and l ?  Right now this accepts <?xZ */
881         {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
882         {NULL, 0, A_POP, TPS_Null, 0, NULL}
883 };
884
/*
 * Rules for TPS_InTagCloseFirst: a p_isasclet character leads to
 * TPS_InTagName; p_isEOF or anything else pops.
 */
885 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
886         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
887         {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
888         {NULL, 0, A_POP, TPS_Null, 0, NULL}
889 };
890
/*
 * Rules for TPS_InTagName: '/' leads to TPS_InTagBeginEnd; '>' leads to
 * TPS_InTagEnd and a p_isspace character to TPS_InTag, both invoking the
 * SpecialTags handler; a p_isasclet character keeps the current state (a
 * TPS_Null target means "no state change"); p_isEOF or anything else pops.
 */
891 static const TParserStateActionItem actionTPS_InTagName[] = {
892         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
893         /* <br/> case */
894         {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
895         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
896         {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
897         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
898         {NULL, 0, A_POP, TPS_Null, 0, NULL}
899 };
900
/*
 * Rules for TPS_InTagBeginEnd: only '>' is accepted, leading to
 * TPS_InTagEnd; p_isEOF or anything else pops.
 */
901 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
902         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
903         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
904         {NULL, 0, A_POP, TPS_Null, 0, NULL}
905 };
906
/*
 * Rules for TPS_InTag: '>' leads to TPS_InTagEnd (calling SpecialTags);
 * a single or double quote enters TPS_InTagEscapeK / TPS_InTagEscapeKK;
 * p_isasclet, p_isdigit, the listed punctuation characters and p_isspace
 * (the latter calling SpecialTags) keep the current state, since their
 * target is TPS_Null; p_isEOF or anything else pops.
 */
907 static const TParserStateActionItem actionTPS_InTag[] = {
908         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
909         {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
910         {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
911         {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
912         {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
913         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
914         {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
915         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
916         {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
917         {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
918         {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
919         {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
920         {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
921         {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
922         {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
923         {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
924         {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
925         {NULL, 0, A_POP, TPS_Null, 0, NULL}
926 };
927
/*
 * Rules for TPS_InTagEscapeK (entered from TPS_InTag on a single quote):
 * a backslash pushes TPS_InTagBackSleshed; a single quote returns to
 * TPS_InTag; any other character stays here; p_isEOF pops.
 */
928 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
929         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
930         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
931         {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
932         {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
933 };
934
/*
 * Rules for TPS_InTagEscapeKK (entered from TPS_InTag on a double quote):
 * a backslash pushes TPS_InTagBackSleshed; a double quote returns to
 * TPS_InTag; any other character stays here; p_isEOF pops.
 */
935 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
936         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
937         {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
938         {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
939         {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
940 };
941
/*
 * Rules for TPS_InTagBackSleshed (pushed by the quoted-value states on a
 * backslash): p_isEOF pops; any other character is accepted with A_MERGE,
 * which copies the current position into the pushed state and resumes it.
 */
942 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
943         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
944         {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
945 };
946
/*
 * Rules for TPS_InTagEnd: a single unconditional rule that emits a TAG_T
 * token, discards all pushed states (A_CLRALL) and returns to TPS_Base.
 */
947 static const TParserStateActionItem actionTPS_InTagEnd[] = {
948         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
949 };
950
/*
 * Rules for TPS_InCommentFirst (pushed from TPS_InTagFirst on '!'): '-'
 * leads to TPS_InCommentLast; 'D' or 'd' leads to TPS_InTag; p_isEOF or
 * anything else pops.
 */
951 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
952         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
953         {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
954         /* <!DOCTYPE ...> */
955         {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
956         {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
957         {NULL, 0, A_POP, TPS_Null, 0, NULL}
958 };
959
/*
 * Rules for TPS_InCommentLast: a further '-' leads to TPS_InComment;
 * p_isEOF or anything else pops.
 */
960 static const TParserStateActionItem actionTPS_InCommentLast[] = {
961         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
962         {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
963         {NULL, 0, A_POP, TPS_Null, 0, NULL}
964 };
965
/*
 * Rules for TPS_InComment: '-' leads to TPS_InCloseCommentFirst; any other
 * character keeps the current state (TPS_Null target); p_isEOF pops.
 */
966 static const TParserStateActionItem actionTPS_InComment[] = {
967         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
968         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
969         {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
970 };
971
/*
 * Rules for TPS_InCloseCommentFirst (one '-' seen inside a comment): a
 * second '-' leads to TPS_InCloseCommentLast; any other character goes back
 * to TPS_InComment; p_isEOF pops.
 */
972 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
973         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
974         {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
975         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
976 };
977
/*
 * Rules for TPS_InCloseCommentLast (two '-' seen inside a comment): another
 * '-' keeps the current state; '>' leads to TPS_InCommentEnd; any other
 * character goes back to TPS_InComment; p_isEOF pops.
 */
978 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
979         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
980         {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
981         {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
982         {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
983 };
984
/*
 * Rules for TPS_InCommentEnd: a single unconditional rule that emits a
 * TAG_T token, discards all pushed states (A_CLRALL) and returns to
 * TPS_Base.
 */
985 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
986         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
987 };
988
/*
 * Rules for TPS_InHostFirstDomain (pushed by the host states on '.'): a
 * p_isasclet character leads to TPS_InHostDomainSecond, a p_isdigit
 * character to TPS_InHost; p_isEOF or anything else pops.
 */
989 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
990         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
991         {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
992         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
993         {NULL, 0, A_POP, TPS_Null, 0, NULL}
994 };
995
/*
 * Rules for TPS_InHostDomainSecond: a p_isasclet character leads to
 * TPS_InHostDomain.  A p_isdigit character, '-', '.' and '@' each push an
 * alternative (TPS_InHost, TPS_InHostFirstAN, TPS_InHostFirstDomain,
 * TPS_InEmail).  p_isEOF or anything else pops; no token is emitted from
 * this state.
 */
996 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
997         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
998         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
999         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1000         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1002         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1003         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1004 };
1005
/*
 * Rules for TPS_InHostDomain: p_isEOF and the default rule emit a HOST token
 * (clearing all pushed states) and return to TPS_Base; a p_isasclet
 * character stays here.  A p_isdigit character, ':', '-', '.', '@' and '/'
 * push alternatives.  p_isstophost emits HOST and continues in
 * TPS_InURLPathStart.
 *
 * The second p_isdigit rule is not redundant: on a fresh visit the first one
 * (A_PUSH) always wins, but when that pushed attempt pops, TParserGet
 * resumes with the rule after the push, and this later rule then pops
 * instead of emitting HOST.
 */
1006 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1007         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1008         {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1009         {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1010         {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1011         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1012         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1013         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1014         {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1015         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1016         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1017         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1018 };
1019
/*
 * Rules for TPS_InPortFirst (pushed from TPS_InHostDomain on ':'): a
 * p_isdigit character leads to TPS_InPort; p_isEOF or anything else pops.
 */
1020 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1021         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1022         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1023         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1024 };
1025
/*
 * Rules for TPS_InPort: stay while p_isdigit matches; p_isstophost emits
 * HOST and continues in TPS_InURLPathStart; '/' pushes TPS_InFURL; p_isEOF
 * and the default rule emit HOST (clearing all pushed states) and return to
 * TPS_Base.
 */
1026 static const TParserStateActionItem actionTPS_InPort[] = {
1027         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1028         {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1029         {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1030         {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1031         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1032 };
1033
/*
 * Rules for TPS_InHostFirstAN (pushed by the host states on '-'): a
 * p_isdigit or p_isasclet character leads to TPS_InHost; p_isEOF or anything
 * else pops.
 */
1034 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1035         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1036         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1037         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1038         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1039 };
1040
/*
 * Rules for TPS_InHost: stay on p_isdigit or p_isasclet; '@' pushes
 * TPS_InEmail, '.' pushes TPS_InHostFirstDomain, '-' pushes
 * TPS_InHostFirstAN; p_isEOF or anything else pops.  No token is emitted
 * from this state.
 */
1041 static const TParserStateActionItem actionTPS_InHost[] = {
1042         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1043         {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1044         {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1045         {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1046         {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1047         {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1048         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1049 };
1050
/*
 * Rules for TPS_InEmail (pushed by the host states on '@'): if the p_ishost
 * test succeeds, emit an EMAIL token, clear all pushed states and return to
 * TPS_Base; otherwise pop.
 */
1051 static const TParserStateActionItem actionTPS_InEmail[] = {
1052         {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1053         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1054 };
1055
/*
 * Rules for TPS_InFileFirst: p_isasclet, p_isdigit or '_' leads to
 * TPS_InFile; '.' leads to TPS_InPathFirst; '?' pushes TPS_InURLPathFirst;
 * '~' pushes TPS_InFileTwiddle; p_isEOF or anything else pops.
 */
1056 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1057         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1058         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1059         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1060         {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1061         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1062         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1063         {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1064         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1065 };
1066
/*
 * Rules for TPS_InFileTwiddle (pushed from TPS_InFileFirst on '~'):
 * p_isasclet, p_isdigit or '_' leads to TPS_InFile; '/' leads to
 * TPS_InFileFirst; p_isEOF or anything else pops.
 */
1067 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1068         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1069         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1070         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1071         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1072         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1073         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1074 };
1075
/*
 * Rules for TPS_InPathFirst: p_isasclet, p_isdigit or '_' leads to
 * TPS_InFile; '.' leads to TPS_InPathSecond; '/' leads to TPS_InFileFirst;
 * p_isEOF or anything else pops.
 */
1076 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1077         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1078         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1079         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1080         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1081         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1082         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1083         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1084 };
1085
/*
 * Rules for TPS_InPathFirstFirst: '.' leads to TPS_InPathSecond; '/' leads
 * to TPS_InFileFirst; p_isEOF or anything else pops.
 */
1086 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1087         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1088         {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1089         {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1090         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1091 };
1092
/*
 * Rules for TPS_InPathSecond: p_isEOF and p_isspace emit a FILEPATH token
 * (dropping one pushed state) and return to TPS_Base.  '/' is listed twice
 * on purpose: the first rule pushes TPS_InFileFirst to try for a longer
 * path; if that attempt pops, TParserGet resumes at the following rule,
 * which emits FILEPATH.  Anything else pops.
 */
1093 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1094         {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1095         {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1096         {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1097         {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1098         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1099 };
1100
/*
 * Rules for TPS_InFile: stay on p_isasclet, p_isdigit, '_' or '-'; '.'
 * pushes TPS_InFileNext, '/' pushes TPS_InFileFirst, '?' pushes
 * TPS_InURLPathFirst; p_isEOF and the default rule emit a FILEPATH token and
 * return to TPS_Base.
 */
1101 static const TParserStateActionItem actionTPS_InFile[] = {
1102         {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1103         {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1104         {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1105         {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1106         {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1107         {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1108         {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1109         {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1110         {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1111 };
1112
/*
 * Rules for TPS_InFileNext (pushed from TPS_InFile on '.'): p_isasclet,
 * p_isdigit or '_' continues in TPS_InFile and, via A_CLEAR, discards the
 * state saved by that push; p_isEOF or anything else pops back to it.
 */
1113 static const TParserStateActionItem actionTPS_InFileNext[] = {
1114         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1115         {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1116         {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1117         {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1118         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1119 };
1120
/*
 * Rules for TPS_InURLPathFirst (pushed on '?' from the file states):
 * p_isEOF, a double quote or a single quote pops; a p_isnotspace character
 * continues in TPS_InURLPath and, via A_CLEAR, discards the state saved by
 * the push; anything else pops.
 */
1121 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1122         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1123         {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1124         {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1125         {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1126         {NULL, 0, A_POP, TPS_Null, 0, NULL},
1127 };
1128
/*
 * Rules for TPS_InURLPathStart (entered after a HOST token when
 * p_isstophost matched): a single unconditional rule that moves on to
 * TPS_InURLPath.
 */
1129 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1130         {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1131 };
1132
/*
 * Rules for TPS_InURLPath: stay while p_isnotspace matches; p_isEOF, a
 * double quote, a single quote and the default rule all emit a URLPATH
 * token and return to TPS_Base.  The quote rules come before p_isnotspace,
 * so a quote ends the token.
 */
1133 static const TParserStateActionItem actionTPS_InURLPath[] = {
1134         {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1135         {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1136         {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1137         {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1138         {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1139 };
1140
/*
 * Rules for TPS_InFURL (pushed on '/' from TPS_InHostDomain / TPS_InPort):
 * if the p_isURLPath test succeeds, call SpecialFURL, emit a URL_T token,
 * clear all pushed states and return to TPS_Base; p_isEOF or a failed test
 * pops.
 */
1141 static const TParserStateActionItem actionTPS_InFURL[] = {
1142         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1143         {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1144         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1145 };
1146
/*
 * Rules for TPS_InProtocolFirst: '/' leads to TPS_InProtocolSecond; p_isEOF
 * or anything else pops.
 */
1147 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1148         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1149         {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1150         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1151 };
1152
/*
 * Rules for TPS_InProtocolSecond: a second '/' leads to TPS_InProtocolEnd;
 * p_isEOF or anything else pops.
 */
1153 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1154         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155         {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1156         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1157 };
1158
/*
 * Rules for TPS_InProtocolEnd: a single unconditional rule that emits a
 * PROTOCOL token, clears all pushed states and returns to TPS_Base.
 */
1159 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1160         {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1161 };
1162
/*
 * Rules for TPS_InHyphenAsciiWordFirst (pushed from TPS_InHyphenAsciiWord on
 * '-'): p_isasclet leads to TPS_InHyphenAsciiWord, p_isalpha to
 * TPS_InHyphenWord, p_isdigit to TPS_InHyphenDigitLookahead; p_isEOF or
 * anything else pops.  p_isasclet is tested before p_isalpha, so a character
 * matching both takes the ASCII branch.
 */
1163 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1164         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1165         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1166         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1167         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1168         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1169 };
1170
/*
 * Rules for TPS_InHyphenAsciiWord: stay on p_isasclet; p_isalpha switches to
 * TPS_InHyphenWord and p_isdigit to TPS_InHyphenNumWord; '-' pushes
 * TPS_InHyphenAsciiWordFirst.  p_isEOF and the default rule call
 * SpecialHyphen, emit an ASCIIHWORD token, clear all pushed states and
 * continue in TPS_InParseHyphen.
 */
1171 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1172         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1173         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1174         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1175         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1176         {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1177         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1178 };
1179
/*
 * Rules for TPS_InHyphenWordFirst (pushed from TPS_InHyphenWord on '-'):
 * p_isalpha leads to TPS_InHyphenWord, p_isdigit to
 * TPS_InHyphenDigitLookahead; p_isEOF or anything else pops.
 */
1180 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1181         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1182         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1183         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1184         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1185 };
1186
/*
 * Rules for TPS_InHyphenWord: stay on p_isalpha; p_isdigit switches to
 * TPS_InHyphenNumWord; '-' pushes TPS_InHyphenWordFirst.  p_isEOF and the
 * default rule call SpecialHyphen, emit an HWORD token, clear all pushed
 * states and continue in TPS_InParseHyphen.
 */
1187 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1188         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1189         {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1190         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1191         {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1192         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1193 };
1194
/*
 * Rules for TPS_InHyphenNumWordFirst (pushed from TPS_InHyphenNumWord on
 * '-'): p_isalpha leads to TPS_InHyphenNumWord, p_isdigit to
 * TPS_InHyphenDigitLookahead; p_isEOF or anything else pops.
 */
1195 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1196         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1197         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1198         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1199         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1200 };
1201
/*
 * Rules for TPS_InHyphenNumWord: stay on p_isalnum; '-' pushes
 * TPS_InHyphenNumWordFirst.  p_isEOF and the default rule call
 * SpecialHyphen, emit a NUMHWORD token, clear all pushed states and continue
 * in TPS_InParseHyphen.
 */
1202 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1203         {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1204         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1205         {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1206         {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1207 };
1208
/*
 * Rules for TPS_InHyphenDigitLookahead: stay while p_isdigit matches; a
 * p_isalpha character leads to TPS_InHyphenNumWord; p_isEOF or anything else
 * pops, so a run of digits alone is not accepted here.
 */
1209 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1210         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211         {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1212         {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1213         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1214 };
1215
/*
 * Rules for TPS_InParseHyphen (entered after a hyphenated-word token has
 * been emitted): p_isasclet leads to TPS_InHyphenAsciiWordPart, p_isalpha to
 * TPS_InHyphenWordPart; p_isdigit pushes TPS_InHyphenUnsignedInt; '-' pushes
 * TPS_InParseHyphenHyphen.  p_isEOF and the default rule switch to TPS_Base
 * with A_RERUN, i.e. the same position is examined again in that state.
 */
1216 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1217         {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1218         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1219         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1220         {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1221         {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1222         {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1223 };
1224
/*
 * Rules for TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen on '-'):
 * if a p_isalnum character follows, emit a SPACE token, drop the pushed
 * state (A_CLEAR) and continue in TPS_InParseHyphen; p_isEOF or anything
 * else pops.
 */
1225 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1226         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227         {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1228         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230
/*
 * Rules for TPS_InHyphenWordPart: stay on p_isalpha; p_isdigit switches to
 * TPS_InHyphenNumWordPart.  p_isEOF emits a PARTHWORD token and returns to
 * TPS_Base; any other character emits PARTHWORD and continues in
 * TPS_InParseHyphen.
 */
1231 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1232         {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1233         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1234         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1235         {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1236 };
1237
/*
 * Rules for TPS_InHyphenAsciiWordPart: stay on p_isasclet; p_isalpha
 * switches to TPS_InHyphenWordPart and p_isdigit to
 * TPS_InHyphenNumWordPart.  p_isEOF emits an ASCIIPARTHWORD token and
 * returns to TPS_Base; any other character emits ASCIIPARTHWORD and
 * continues in TPS_InParseHyphen.
 */
1238 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1239         {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1240         {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1241         {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1242         {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1243         {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1244 };
1245
/*
 * Rules for TPS_InHyphenNumWordPart: stay on p_isalnum.  p_isEOF emits a
 * NUMPARTHWORD token and returns to TPS_Base; any other character emits
 * NUMPARTHWORD and continues in TPS_InParseHyphen.
 */
1246 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1247         {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1248         {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1249         {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1250 };
1251
/*
 * Rules for TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen on a
 * digit): stay while p_isdigit matches (TPS_Null target); a p_isalpha
 * character continues in TPS_InHyphenNumWordPart and, via A_CLEAR, discards
 * the state saved by the push; p_isEOF or anything else pops.
 */
1252 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1253         {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1254         {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1255         {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1256         {NULL, 0, A_POP, TPS_Null, 0, NULL}
1257 };
1258
1259
1260 /*
1261  * main table of per-state parser actions
1262  */
/*
 * One entry of that table: "action" points at the rule list for a state,
 * "state" repeats the state's enum value so TParserGet can Assert that the
 * table is indexed consistently, and "state_name" (trace builds only) is the
 * state's name as a string for the debug printout.
 */
1263 typedef struct
1264 {
1265         const TParserStateActionItem *action;           /* the actual state info */
1266         TParserState state;                     /* only for Assert crosscheck */
1267 #ifdef WPARSER_TRACE
1268         const char *state_name;         /* only for debug printout */
1269 #endif
1270 } TParserStateAction;
1271
/*
 * TPARSERSTATEACTION(state) builds one TParserStateAction initializer.  The
 * rule list is referenced by prefixing the state name with "action", so
 * every state TPS_Foo listed in Actions[] needs an array actionTPS_Foo.
 * With WPARSER_TRACE defined, the state name is also stored as a string.
 */
1272 #ifdef WPARSER_TRACE
1273 #define TPARSERSTATEACTION(state) \
1274         { CppConcat(action,state), state, CppAsString(state) }
1275 #else
1276 #define TPARSERSTATEACTION(state) \
1277         { CppConcat(action,state), state }
1278 #endif
1279
1280 /*
1281  * order must be the same as in typedef enum {} TParserState!!
1282  */
1283
/*
 * TParserGet indexes this array directly by prs->state->state and Asserts
 * that the entry's "state" field equals the index, so a mis-ordered entry is
 * caught in assert-enabled builds.
 */
1284 static const TParserStateAction Actions[] = {
1285         TPARSERSTATEACTION(TPS_Base),
1286         TPARSERSTATEACTION(TPS_InNumWord),
1287         TPARSERSTATEACTION(TPS_InAsciiWord),
1288         TPARSERSTATEACTION(TPS_InWord),
1289         TPARSERSTATEACTION(TPS_InUnsignedInt),
1290         TPARSERSTATEACTION(TPS_InSignedIntFirst),
1291         TPARSERSTATEACTION(TPS_InSignedInt),
1292         TPARSERSTATEACTION(TPS_InSpace),
1293         TPARSERSTATEACTION(TPS_InUDecimalFirst),
1294         TPARSERSTATEACTION(TPS_InUDecimal),
1295         TPARSERSTATEACTION(TPS_InDecimalFirst),
1296         TPARSERSTATEACTION(TPS_InDecimal),
1297         TPARSERSTATEACTION(TPS_InVerVersion),
1298         TPARSERSTATEACTION(TPS_InSVerVersion),
1299         TPARSERSTATEACTION(TPS_InVersionFirst),
1300         TPARSERSTATEACTION(TPS_InVersion),
1301         TPARSERSTATEACTION(TPS_InMantissaFirst),
1302         TPARSERSTATEACTION(TPS_InMantissaSign),
1303         TPARSERSTATEACTION(TPS_InMantissa),
1304         TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1305         TPARSERSTATEACTION(TPS_InXMLEntity),
1306         TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1307         TPARSERSTATEACTION(TPS_InXMLEntityNum),
1308         TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1309         TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1310         TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1311         TPARSERSTATEACTION(TPS_InTagFirst),
1312         TPARSERSTATEACTION(TPS_InXMLBegin),
1313         TPARSERSTATEACTION(TPS_InTagCloseFirst),
1314         TPARSERSTATEACTION(TPS_InTagName),
1315         TPARSERSTATEACTION(TPS_InTagBeginEnd),
1316         TPARSERSTATEACTION(TPS_InTag),
1317         TPARSERSTATEACTION(TPS_InTagEscapeK),
1318         TPARSERSTATEACTION(TPS_InTagEscapeKK),
1319         TPARSERSTATEACTION(TPS_InTagBackSleshed),
1320         TPARSERSTATEACTION(TPS_InTagEnd),
1321         TPARSERSTATEACTION(TPS_InCommentFirst),
1322         TPARSERSTATEACTION(TPS_InCommentLast),
1323         TPARSERSTATEACTION(TPS_InComment),
1324         TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1325         TPARSERSTATEACTION(TPS_InCloseCommentLast),
1326         TPARSERSTATEACTION(TPS_InCommentEnd),
1327         TPARSERSTATEACTION(TPS_InHostFirstDomain),
1328         TPARSERSTATEACTION(TPS_InHostDomainSecond),
1329         TPARSERSTATEACTION(TPS_InHostDomain),
1330         TPARSERSTATEACTION(TPS_InPortFirst),
1331         TPARSERSTATEACTION(TPS_InPort),
1332         TPARSERSTATEACTION(TPS_InHostFirstAN),
1333         TPARSERSTATEACTION(TPS_InHost),
1334         TPARSERSTATEACTION(TPS_InEmail),
1335         TPARSERSTATEACTION(TPS_InFileFirst),
1336         TPARSERSTATEACTION(TPS_InFileTwiddle),
1337         TPARSERSTATEACTION(TPS_InPathFirst),
1338         TPARSERSTATEACTION(TPS_InPathFirstFirst),
1339         TPARSERSTATEACTION(TPS_InPathSecond),
1340         TPARSERSTATEACTION(TPS_InFile),
1341         TPARSERSTATEACTION(TPS_InFileNext),
1342         TPARSERSTATEACTION(TPS_InURLPathFirst),
1343         TPARSERSTATEACTION(TPS_InURLPathStart),
1344         TPARSERSTATEACTION(TPS_InURLPath),
1345         TPARSERSTATEACTION(TPS_InFURL),
1346         TPARSERSTATEACTION(TPS_InProtocolFirst),
1347         TPARSERSTATEACTION(TPS_InProtocolSecond),
1348         TPARSERSTATEACTION(TPS_InProtocolEnd),
1349         TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1350         TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1351         TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1352         TPARSERSTATEACTION(TPS_InHyphenWord),
1353         TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1354         TPARSERSTATEACTION(TPS_InHyphenNumWord),
1355         TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1356         TPARSERSTATEACTION(TPS_InParseHyphen),
1357         TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1358         TPARSERSTATEACTION(TPS_InHyphenWordPart),
1359         TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1360         TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1361         TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1362 };
1363
1364
/*
 * TParserGet: run the state machine until the next token is recognized.
 *
 * Starting at prs->state->posbyte, the loop looks up the rule list of the
 * current state in Actions[], takes the first rule whose isclass test
 * returns nonzero (a rule with a NULL isclass ends the search and therefore
 * always applies), calls the rule's special handler if any, and then acts on
 * the rule's flags:
 *
 *   A_BINGO   token found: length and type are copied into prs
 *   A_POP     discard the current position and go back to the saved one
 *   A_PUSH    save the current position (remembering the rule) and work on
 *             a fresh copy made by newTParserPosition
 *   A_CLEAR   discard the most recently saved position
 *   A_CLRALL  discard all saved positions
 *   A_MERGE   copy the current position into the saved one and resume it
 *   A_RERUN   re-examine the same character in the new state
 *
 * After a POP, rule matching in the restored state resumes with the rule
 * following the one that did the PUSH, so rule order within a table matters.
 *
 * On success prs->token, prs->lenbytetoken, prs->lenchartoken and prs->type
 * describe the token.  Returns true if a token was produced; false if the
 * input was already exhausted on entry or the loop ended without a BINGO
 * rule.
 */
1365 static bool
1366 TParserGet(TParser *prs)
1367 {
1368         const TParserStateActionItem *item = NULL;
1369
1370         Assert(prs->state);
1371
1372         if (prs->state->posbyte >= prs->lenstr)
1373                 return false;
1374
1375         prs->token = prs->str + prs->state->posbyte;
1376         prs->state->pushedAtAction = NULL;
1377
1378         /* look at string */
1379         while (prs->state->posbyte <= prs->lenstr)
1380         {
                     /* at end of input there is no current character: charlen is 0 */
1381                 if (prs->state->posbyte == prs->lenstr)
1382                         prs->state->charlen = 0;
1383                 else
1384                         prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1385                                 pg_mblen(prs->str + prs->state->posbyte);
1386
1387                 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1388                 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1389                 Assert(Actions[prs->state->state].state == prs->state->state);
1390
1391                 if (prs->state->pushedAtAction)
1392                 {
1393                         /* After a POP, pick up at the next test */
1394                         item = prs->state->pushedAtAction + 1;
1395                         prs->state->pushedAtAction = NULL;
1396                 }
1397                 else
1398                 {
1399                         item = Actions[prs->state->state].action;
1400                         Assert(item != NULL);
1401                 }
1402
1403                 /* find action by character class */
                     /* prs->c carries the rule's comparison character to the test */
1404                 while (item->isclass)
1405                 {
1406                         prs->c = item->c;
1407                         if (item->isclass(prs) != 0)
1408                                 break;
1409                         item++;
1410                 }
1411
1412 #ifdef WPARSER_TRACE
1413                 {
1414                         TParserPosition *ptr;
1415
1416                         fprintf(stderr, "state ");
1417                         /* indent according to stack depth */
1418                         for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1419                                 fprintf(stderr, "  ");
1420                         fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1421                         if (prs->state->posbyte < prs->lenstr)
1422                                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1423                         else
1424                                 fprintf(stderr, "at EOF");
1425                         fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1426                                         (int) (item - Actions[prs->state->state].action),
1427                                         (item->flags & A_BINGO) ? " BINGO" : "",
1428                                         (item->flags & A_POP) ? " POP" : "",
1429                                         (item->flags & A_PUSH) ? " PUSH" : "",
1430                                         (item->flags & A_RERUN) ? " RERUN" : "",
1431                                         (item->flags & A_CLEAR) ? " CLEAR" : "",
1432                                         (item->flags & A_MERGE) ? " MERGE" : "",
1433                                         (item->flags & A_CLRALL) ? " CLRALL" : "",
1434                                         (item->tostate != TPS_Null) ? " tostate " : "",
1435                                         (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1436                                         (item->type > 0) ? " type " : "",
1437                                         tok_alias[item->type]);
1438                 }
1439 #endif
1440
1441                 /* call special handler if exists */
1442                 if (item->special)
1443                         item->special(prs);
1444
1445                 /* BINGO, token is found */
1446                 if (item->flags & A_BINGO)
1447                 {
1448                         Assert(item->type > 0);
1449                         prs->lenbytetoken = prs->state->lenbytetoken;
1450                         prs->lenchartoken = prs->state->lenchartoken;
1451                         prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1452                         prs->type = item->type;
1453                 }
1454
1455                 /* do various actions by flags */
                     /* the stack flags are tested in a fixed order; only the first one set is acted on */
1456                 if (item->flags & A_POP)
1457                 {                                               /* pop stored state in stack */
1458                         TParserPosition *ptr = prs->state->prev;
1459
1460                         pfree(prs->state);
1461                         prs->state = ptr;
1462                         Assert(prs->state);
1463                 }
1464                 else if (item->flags & A_PUSH)
1465                 {                                               /* push (store) state in stack */
1466                         prs->state->pushedAtAction = item;      /* remember where we push */
1467                         prs->state = newTParserPosition(prs->state);
1468                 }
1469                 else if (item->flags & A_CLEAR)
1470                 {                                               /* clear previous pushed state */
1471                         TParserPosition *ptr;
1472
1473                         Assert(prs->state->prev);
1474                         ptr = prs->state->prev->prev;
1475                         pfree(prs->state->prev);
1476                         prs->state->prev = ptr;
1477                 }
1478                 else if (item->flags & A_CLRALL)
1479                 {                                               /* clear all previous pushed state */
1480                         TParserPosition *ptr;
1481
1482                         while (prs->state->prev)
1483                         {
1484                                 ptr = prs->state->prev->prev;
1485                                 pfree(prs->state->prev);
1486                                 prs->state->prev = ptr;
1487                         }
1488                 }
1489                 else if (item->flags & A_MERGE)
1490                 {                                               /* merge posinfo with current and pushed state */
1491                         TParserPosition *ptr = prs->state;
1492
1493                         Assert(prs->state->prev);
1494                         prs->state = prs->state->prev;
1495
1496                         prs->state->posbyte = ptr->posbyte;
1497                         prs->state->poschar = ptr->poschar;
1498                         prs->state->charlen = ptr->charlen;
1499                         prs->state->lenbytetoken = ptr->lenbytetoken;
1500                         prs->state->lenchartoken = ptr->lenchartoken;
1501                         pfree(ptr);
1502                 }
1503
1504                 /* set new state if pointed */
                     /* a TPS_Null target leaves the state as it is (after POP: the restored one) */
1505                 if (item->tostate != TPS_Null)
1506                         prs->state->state = item->tostate;
1507
1508                 /* check for go away */
                     /* stop on a token, or when input is used up and the rule did not ask for a rerun */
1509                 if ((item->flags & A_BINGO) ||
1510                         (prs->state->posbyte >= prs->lenstr &&
1511                          (item->flags & A_RERUN) == 0))
1512                         break;
1513
1514                 /* go to beginning of loop if we should rerun or we just restore state */
1515                 if (item->flags & (A_RERUN | A_POP))
1516                         continue;
1517
1518                 /* move forward */
                     /* charlen is 0 at end of input, in which case nothing is advanced */
1519                 if (prs->state->charlen)
1520                 {
1521                         prs->state->posbyte += prs->state->charlen;
1522                         prs->state->lenbytetoken += prs->state->charlen;
1523                         prs->state->poschar++;
1524                         prs->state->lenchartoken++;
1525                 }
1526         }
1527
1528         return (item && (item->flags & A_BINGO)) ? true : false;
1529 }
1530
1531 Datum
1532 prsd_lextype(PG_FUNCTION_ARGS)
1533 {
1534         LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1535         int                     i;
1536
1537         for (i = 1; i <= LASTNUM; i++)
1538         {
1539                 descr[i - 1].lexid = i;
1540                 descr[i - 1].alias = pstrdup(tok_alias[i]);
1541                 descr[i - 1].descr = pstrdup(lex_descr[i]);
1542         }
1543
1544         descr[LASTNUM].lexid = 0;
1545
1546         PG_RETURN_POINTER(descr);
1547 }
1548
1549 Datum
1550 prsd_start(PG_FUNCTION_ARGS)
1551 {
1552         PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1553 }
1554
1555 Datum
1556 prsd_nexttoken(PG_FUNCTION_ARGS)
1557 {
1558         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1559         char      **t = (char **) PG_GETARG_POINTER(1);
1560         int                *tlen = (int *) PG_GETARG_POINTER(2);
1561
1562         if (!TParserGet(p))
1563                 PG_RETURN_INT32(0);
1564
1565         *t = p->token;
1566         *tlen = p->lenbytetoken;
1567
1568         PG_RETURN_INT32(p->type);
1569 }
1570
1571 Datum
1572 prsd_end(PG_FUNCTION_ARGS)
1573 {
1574         TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1575
1576         TParserClose(p);
1577         PG_RETURN_VOID();
1578 }
1579
/*
 * Token-type classification predicates.
 *
 * Each takes a token type id (one of the output token categories) and
 * yields nonzero when the id belongs to the class.  The argument may be
 * evaluated several times, so pass only side-effect-free expressions.
 */
#define LEAVETOKEN(x) \
	( (x)==SPACE )
#define COMPLEXTOKEN(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) \
	( (x)==SPACE )

#define TS_IDIGNORE(x) \
	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )

/* types whose words get "replace" set when building a normal headline */
#define HLIDIGNORE(x) \
	( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* same, for HighlightAll mode: identical list except that TAG_T is absent */
#define XMLHLIDIGNORE(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* types that are not counted as words when measuring headline length */
#define NONWORDTOKEN(x) \
	( (x)==SPACE || HLIDIGNORE(x) )

/* types on which a headline fragment should not end */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || \
	  (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1589
1590 typedef struct
1591 {
1592         HeadlineWordEntry *words;
1593         int                     len;
1594 } hlCheck;
1595
1596 static bool
1597 checkcondition_HL(void *checkval, QueryOperand *val)
1598 {
1599         int                     i;
1600
1601         for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1602         {
1603                 if (((hlCheck *) checkval)->words[i].item == val)
1604                         return true;
1605         }
1606         return false;
1607 }
1608
1609
1610 static bool
1611 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1612 {
1613         int                     i,
1614                                 j;
1615         QueryItem  *item = GETQUERY(query);
1616         int                     pos = *p;
1617
1618         *q = 0;
1619         *p = 0x7fffffff;
1620
1621         for (j = 0; j < query->size; j++)
1622         {
1623                 if (item->type != QI_VAL)
1624                 {
1625                         item++;
1626                         continue;
1627                 }
1628                 for (i = pos; i < prs->curwords; i++)
1629                 {
1630                         if (prs->words[i].item == &item->operand)
1631                         {
1632                                 if (i > *q)
1633                                         *q = i;
1634                                 break;
1635                         }
1636                 }
1637                 item++;
1638         }
1639
1640         if (*q == 0)
1641                 return false;
1642
1643         item = GETQUERY(query);
1644         for (j = 0; j < query->size; j++)
1645         {
1646                 if (item->type != QI_VAL)
1647                 {
1648                         item++;
1649                         continue;
1650                 }
1651                 for (i = *q; i >= pos; i--)
1652                 {
1653                         if (prs->words[i].item == &item->operand)
1654                         {
1655                                 if (i < *p)
1656                                         *p = i;
1657                                 break;
1658                         }
1659                 }
1660                 item++;
1661         }
1662
1663         if (*p <= *q)
1664         {
1665                 hlCheck         ch;
1666
1667                 ch.words = &(prs->words[*p]);
1668                 ch.len = *q - *p + 1;
1669                 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1670                         return true;
1671                 else
1672                 {
1673                         (*p)++;
1674                         return hlCover(prs, query, p, q);
1675                 }
1676         }
1677
1678         return false;
1679 }
1680
1681 Datum
1682 prsd_headline(PG_FUNCTION_ARGS)
1683 {
1684         HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1685         List       *prsoptions = (List *) PG_GETARG_POINTER(1);
1686         TSQuery         query = PG_GETARG_TSQUERY(2);
1687
1688         /* from opt + start and and tag */
1689         int                     min_words = 15;
1690         int                     max_words = 35;
1691         int                     shortword = 3;
1692
1693         int                     p = 0,
1694                                 q = 0;
1695         int                     bestb = -1,
1696                                 beste = -1;
1697         int                     bestlen = -1;
1698         int                     pose = 0,
1699                                 posb,
1700                                 poslen,
1701                                 curlen;
1702
1703         int                     i;
1704         int                     highlight = 0;
1705         ListCell   *l;
1706
1707         /* config */
1708         prs->startsel = NULL;
1709         prs->stopsel = NULL;
1710         foreach(l, prsoptions)
1711         {
1712                 DefElem    *defel = (DefElem *) lfirst(l);
1713                 char       *val = defGetString(defel);
1714
1715                 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1716                         max_words = pg_atoi(val, sizeof(int32), 0);
1717                 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1718                         min_words = pg_atoi(val, sizeof(int32), 0);
1719                 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1720                         shortword = pg_atoi(val, sizeof(int32), 0);
1721                 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1722                         prs->startsel = pstrdup(val);
1723                 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1724                         prs->stopsel = pstrdup(val);
1725                 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1726                         highlight = (pg_strcasecmp(val, "1") == 0 ||
1727                                                  pg_strcasecmp(val, "on") == 0 ||
1728                                                  pg_strcasecmp(val, "true") == 0 ||
1729                                                  pg_strcasecmp(val, "t") == 0 ||
1730                                                  pg_strcasecmp(val, "y") == 0 ||
1731                                                  pg_strcasecmp(val, "yes") == 0);
1732                 else
1733                         ereport(ERROR,
1734                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1735                                          errmsg("unrecognized headline parameter: \"%s\"",
1736                                                         defel->defname)));
1737         }
1738
1739         if (highlight == 0)
1740         {
1741                 if (min_words >= max_words)
1742                         ereport(ERROR,
1743                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1744                                          errmsg("MinWords should be less than MaxWords")));
1745                 if (min_words <= 0)
1746                         ereport(ERROR,
1747                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1748                                          errmsg("MinWords should be positive")));
1749                 if (shortword < 0)
1750                         ereport(ERROR,
1751                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1752                                          errmsg("ShortWord should be >= 0")));
1753
1754                 while (hlCover(prs, query, &p, &q))
1755                 {
1756                         /* find cover len in words */
1757                         curlen = 0;
1758                         poslen = 0;
1759                         for (i = p; i <= q && curlen < max_words; i++)
1760                         {
1761                                 if (!NONWORDTOKEN(prs->words[i].type))
1762                                         curlen++;
1763                                 if (prs->words[i].item && !prs->words[i].repeated)
1764                                         poslen++;
1765                                 pose = i;
1766                         }
1767
1768                         if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1769                         {
1770                                 /* best already finded, so try one more cover */
1771                                 p++;
1772                                 continue;
1773                         }
1774
1775                         posb = p;
1776                         if (curlen < max_words)
1777                         {                                       /* find good end */
1778                                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1779                                 {
1780                                         if (i != q)
1781                                         {
1782                                                 if (!NONWORDTOKEN(prs->words[i].type))
1783                                                         curlen++;
1784                                                 if (prs->words[i].item && !prs->words[i].repeated)
1785                                                         poslen++;
1786                                         }
1787                                         pose = i;
1788                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1789                                                 continue;
1790                                         if (curlen >= min_words)
1791                                                 break;
1792                                 }
1793                                 if (curlen < min_words && i >= prs->curwords)
1794                                 {                               /* got end of text and our cover is shoter
1795                                                                  * than min_words */
1796                                         for (i = p; i >= 0; i--)
1797                                         {
1798                                                 if (!NONWORDTOKEN(prs->words[i].type))
1799                                                         curlen++;
1800                                                 if (prs->words[i].item && !prs->words[i].repeated)
1801                                                         poslen++;
1802                                                 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1803                                                         continue;
1804                                                 if (curlen >= min_words)
1805                                                         break;
1806                                         }
1807                                         posb = (i >= 0) ? i : 0;
1808                                 }
1809                         }
1810                         else
1811                         {                                       /* shorter cover :((( */
1812                                 for (; curlen > min_words; i--)
1813                                 {
1814                                         if (!NONWORDTOKEN(prs->words[i].type))
1815                                                 curlen--;
1816                                         if (prs->words[i].item && !prs->words[i].repeated)
1817                                                 poslen--;
1818                                         pose = i;
1819                                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1820                                                 continue;
1821                                         break;
1822                                 }
1823                         }
1824
1825                         if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1826                                 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1827                                  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
1828                         {
1829                                 bestb = posb;
1830                                 beste = pose;
1831                                 bestlen = poslen;
1832                         }
1833
1834                         p++;
1835                 }
1836
1837                 if (bestlen < 0)
1838                 {
1839                         curlen = 0;
1840                         for (i = 0; i < prs->curwords && curlen < min_words; i++)
1841                         {
1842                                 if (!NONWORDTOKEN(prs->words[i].type))
1843                                         curlen++;
1844                                 pose = i;
1845                         }
1846                         bestb = 0;
1847                         beste = pose;
1848                 }
1849         }
1850         else
1851         {
1852                 bestb = 0;
1853                 beste = prs->curwords - 1;
1854         }
1855
1856         for (i = bestb; i <= beste; i++)
1857         {
1858                 if (prs->words[i].item)
1859                         prs->words[i].selected = 1;
1860                 if (highlight == 0)
1861                 {
1862                         if (HLIDIGNORE(prs->words[i].type))
1863                                 prs->words[i].replace = 1;
1864                 }
1865                 else
1866                 {
1867                         if (XMLHLIDIGNORE(prs->words[i].type))
1868                                 prs->words[i].replace = 1;
1869                 }
1870
1871                 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
1872         }
1873
1874         if (!prs->startsel)
1875                 prs->startsel = pstrdup("<b>");
1876         if (!prs->stopsel)
1877                 prs->stopsel = pstrdup("</b>");
1878         prs->startsellen = strlen(prs->startsel);
1879         prs->stopsellen = strlen(prs->stopsel);
1880
1881         PG_RETURN_POINTER(prs);
1882 }