1 /*-------------------------------------------------------------------------
4 * Default text search parser
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.15 2008/06/17 16:09:06 momjian Exp $
12 *-------------------------------------------------------------------------
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
29 /* Output token categories */
38 #define VERSIONNUMBER 8
39 #define NUMPARTHWORD 9
41 #define ASCIIPARTHWORD 11
52 #define UNSIGNEDINT 22
57 static const char *const tok_alias[] = {
84 static const char *const lex_descr[] = {
88 "Word, letters and digits",
92 "Scientific notation",
94 "Hyphenated word part, letters and digits",
95 "Hyphenated word part, all letters",
96 "Hyphenated word part, all ASCII",
100 "Hyphenated word, letters and digits",
101 "Hyphenated word, all ASCII",
102 "Hyphenated word, all letters",
121 TPS_InSignedIntFirst,
135 TPS_InXMLEntityFirst,
137 TPS_InXMLEntityNumFirst,
139 TPS_InXMLEntityHexNumFirst,
140 TPS_InXMLEntityHexNum,
150 TPS_InTagBackSleshed,
155 TPS_InCloseCommentFirst,
156 TPS_InCloseCommentLast,
158 TPS_InHostFirstDomain,
159 TPS_InHostDomainSecond,
169 TPS_InPathFirstFirst,
178 TPS_InProtocolSecond,
180 TPS_InHyphenAsciiWordFirst,
181 TPS_InHyphenAsciiWord,
182 TPS_InHyphenWordFirst,
184 TPS_InHyphenNumWordFirst,
186 TPS_InHyphenDigitLookahead,
188 TPS_InParseHyphenHyphen,
189 TPS_InHyphenWordPart,
190 TPS_InHyphenAsciiWordPart,
191 TPS_InHyphenNumWordPart,
192 TPS_InHyphenUnsignedInt,
193 TPS_Null /* last state (fake value) */
196 /* forward declaration */
199 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
201 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
202 * special cases... */
206 TParserCharTest isclass;
209 TParserState tostate;
211 TParserSpecial special;
212 } TParserStateActionItem;
214 /* Flag bits in TParserStateActionItem.flags */
215 #define A_NEXT 0x0000
216 #define A_BINGO 0x0001
218 #define A_PUSH 0x0004
219 #define A_RERUN 0x0008
220 #define A_CLEAR 0x0010
221 #define A_MERGE 0x0020
222 #define A_CLRALL 0x0040
224 typedef struct TParserPosition
226 int posbyte; /* position of parser in bytes */
227 int poschar; /* position of parser in characters */
228 int charlen; /* length of current char */
229 int lenbytetoken; /* length of token-so-far in bytes */
230 int lenchartoken; /* and in chars */
232 struct TParserPosition *prev;
233 const TParserStateActionItem *pushedAtAction;
/* Parser context; only the string and position members are visible in this excerpt. */
236 typedef struct TParser
238 /* string and position information */
239 char *str; /* multibyte string */
240 int lenstr; /* length of mbstring */
241 #ifdef USE_WIDE_UPPER_LOWER
242 wchar_t *wstr; /* wide character string */
243 int lenwstr; /* length of wstr */
249 TParserPosition *state;
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
271 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
274 memcpy(res, prev, sizeof(TParserPosition));
276 memset(res, 0, sizeof(TParserPosition));
280 res->pushedAtAction = NULL;
286 TParserInit(char *str, int len)
288 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 prs->charmaxlen = pg_database_encoding_max_length();
294 #ifdef USE_WIDE_UPPER_LOWER
297 * Use wide char code only when max encoding length > 1.
299 if (prs->charmaxlen > 1)
302 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304 prs->str, prs->lenstr);
308 prs->usewide = false;
310 prs->state = newTParserPosition(NULL);
311 prs->state->state = TPS_Base;
314 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
321 TParserClose(TParser *prs)
325 TParserPosition *ptr = prs->state->prev;
331 #ifdef USE_WIDE_UPPER_LOWER
340 * Character-type support functions, equivalent to the is* macros, but
341 * working with any possible encoding and locale. Note that with a
342 * multibyte encoding and the C locale, the isw* functions may fail
343 * or give wrong results. Note 2: multibyte encodings with the C locale
344 * are often used for Asian languages.
347 #ifdef USE_WIDE_UPPER_LOWER
349 #define p_iswhat(type) \
351 p_is##type(TParser *prs) { \
352 Assert( prs->state ); \
353 if ( prs->usewide ) \
355 if ( lc_ctype_is_c() ) \
356 return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
358 return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
361 return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
365 p_isnot##type(TParser *prs) { \
366 return !p_is##type(prs); \
370 p_isalnum(TParser *prs)
378 unsigned int c = *(prs->wstr + prs->state->poschar);
381 * any non-ascii symbol with multibyte encoding with C-locale is
387 return isalnum(0xff & c);
390 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
393 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
/* Logical negation of p_isalnum() for the current character. */
396 p_isnotalnum(TParser *prs)
398 return !p_isalnum(prs);
402 p_isalpha(TParser *prs)
410 unsigned int c = *(prs->wstr + prs->state->poschar);
413 * any non-ascii symbol with multibyte encoding with C-locale is
419 return isalpha(0xff & c);
422 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
425 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
/* Logical negation of p_isalpha() for the current character. */
429 p_isnotalpha(TParser *prs)
431 return !p_isalpha(prs);
434 /* p_iseq should be used only for ascii symbols */
/*
 * Wide-character build: returns 1 only when the current character is a
 * single byte (charlen == 1) and the byte at posbyte equals c; otherwise 0.
 */
437 p_iseq(TParser *prs, char c)
440 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
442 #else /* USE_WIDE_UPPER_LOWER */
444 #define p_iswhat(type) \
446 p_is##type(TParser *prs) { \
447 Assert( prs->state ); \
448 return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
452 p_isnot##type(TParser *prs) { \
453 return !p_is##type(prs); \
/*
 * Non-wide build: returns 1 when the byte at posbyte equals c, otherwise 0.
 * Unlike the wide build, charlen is not consulted here.
 */
458 p_iseq(TParser *prs, char c)
461 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
466 #endif /* USE_WIDE_UPPER_LOWER */
/*
 * Returns 1 when the byte position has reached lenstr or the current
 * character length is 0; otherwise 0.
 */
477 p_isEOF(TParser *prs)
480 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
/* p_iseq() against the character currently stored in prs->c. */
484 p_iseqC(TParser *prs)
486 return p_iseq(prs, prs->c);
/* Negation of p_iseq() against the character currently stored in prs->c. */
490 p_isneC(TParser *prs)
492 return !p_iseq(prs, prs->c);
/*
 * Returns 1 when the current character is one byte long and isascii()
 * accepts that byte; otherwise 0.
 */
496 p_isascii(TParser *prs)
498 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
/* Returns 1 when both p_isascii() and p_isalpha() accept the current character. */
502 p_isasclet(TParser *prs)
504 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 /* deliberately suppress unused-function complaints for the above */
509 void _make_compiler_happy(void);
511 _make_compiler_happy(void)
538 SpecialTags(TParser *prs)
540 switch (prs->state->lenchartoken)
542 case 8: /* </script */
543 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
546 case 7: /* <script || </style */
547 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
549 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
553 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
/*
 * Special handler: sets prs->wanthost and moves the byte and character
 * positions back by the length of the token accumulated so far.
 */
562 SpecialFURL(TParser *prs)
564 prs->wanthost = true;
565 prs->state->posbyte -= prs->state->lenbytetoken;
566 prs->state->poschar -= prs->state->lenchartoken;
/*
 * Special handler: moves the byte and character positions back by the
 * length of the token accumulated so far.  The token lengths themselves
 * are left untouched.
 */
570 SpecialHyphen(TParser *prs)
572 prs->state->posbyte -= prs->state->lenbytetoken;
573 prs->state->poschar -= prs->state->lenchartoken;
/*
 * Special handler: moves the byte and character positions back by the
 * length of the token accumulated so far, then resets both token lengths
 * to zero.
 */
577 SpecialVerVersion(TParser *prs)
579 prs->state->posbyte -= prs->state->lenbytetoken;
580 prs->state->poschar -= prs->state->lenchartoken;
581 prs->state->lenbytetoken = 0;
582 prs->state->lenchartoken = 0;
586 p_isstophost(TParser *prs)
590 prs->wanthost = false;
/* Returns 1 when prs->ignore is set, otherwise 0; the current character is not examined. */
597 p_isignore(TParser *prs)
599 return (prs->ignore) ? 1 : 0;
/*
 * Runs a temporary parser over the not-yet-consumed tail of the input.
 * When its first token is of type HOST, this parser's positions and
 * token-so-far lengths are advanced by that token's byte/char lengths and
 * charlen is copied from the temporary parser.  The temporary parser is
 * closed before returning.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
603 p_ishost(TParser *prs)
605 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
608 if (TParserGet(tmpprs) && tmpprs->type == HOST)
610 prs->state->posbyte += tmpprs->lenbytetoken;
611 prs->state->poschar += tmpprs->lenchartoken;
612 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613 prs->state->lenchartoken += tmpprs->lenchartoken;
614 prs->state->charlen = tmpprs->state->charlen;
617 TParserClose(tmpprs);
/*
 * Runs a temporary parser over the not-yet-consumed tail of the input,
 * after stacking a new position on it whose state is TPS_InFileFirst.
 * When its first token is URLPATH or FILEPATH, this parser's positions and
 * token-so-far lengths are advanced by that token's byte/char lengths and
 * charlen is copied from the temporary parser.  The temporary parser is
 * closed before returning.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
623 p_isURLPath(TParser *prs)
625 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
628 tmpprs->state = newTParserPosition(tmpprs->state);
629 tmpprs->state->state = TPS_InFileFirst;
631 if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
633 prs->state->posbyte += tmpprs->lenbytetoken;
634 prs->state->poschar += tmpprs->lenchartoken;
635 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636 prs->state->lenchartoken += tmpprs->lenchartoken;
637 prs->state->charlen = tmpprs->state->charlen;
640 TParserClose(tmpprs);
646 * Table of state/action of parser
/*
 * Action table for state TPS_Base.  Dispatches on the current character:
 * '<' goes (A_PUSH) to the tag states, ASCII letters / other letters /
 * digits go to the word and unsigned-integer states, and '-', '+', '&',
 * '~', '/', '.' are A_PUSH rows into their respective "first" states.
 * EOF targets TPS_Null; p_isignore matches and the final NULL-test row
 * target TPS_InSpace.
 */
649 static const TParserStateActionItem actionTPS_Base[] = {
650 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
/*
 * Action table for state TPS_InNumWord.  Alphanumerics keep the state;
 * '@', '/', '.' and '-' are A_PUSH rows into the e-mail, file and
 * hyphenated-word states.  EOF and the final NULL-test row are A_BINGO
 * rows tagged NUMWORD that target TPS_Base.
 */
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
/*
 * Action table for state TPS_InAsciiWord.  EOF and the final NULL-test row
 * are A_BINGO rows tagged ASCIIWORD.  Note that '.', '-' and p_isdigit each
 * appear twice with different targets.
 * NOTE(review): presumably the later row of each pair is tried only after
 * the earlier A_PUSH alternative has been popped (positions record
 * pushedAtAction) -- confirm against TParserGet.
 */
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
/*
 * Action table for state TPS_InWord.  Letters are A_NEXT rows with a
 * TPS_Null target, digits switch to TPS_InNumWord, and '-' is an A_PUSH
 * row into TPS_InHyphenWordFirst.  EOF and the final NULL-test row are
 * A_BINGO rows tagged WORD_T.
 */
692 static const TParserStateActionItem actionTPS_InWord[] = {
693 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
/*
 * Action table for state TPS_InUnsignedInt.  '.' has two A_PUSH rows (host
 * domain, then unsigned decimal); 'e'/'E' push TPS_InMantissaFirst; ASCII
 * letters push TPS_InHost and other letters go to TPS_InNumWord; '/' pushes
 * TPS_InFileFirst.  EOF and the final NULL-test row are A_BINGO rows
 * tagged UNSIGNEDINT.
 */
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
/*
 * Action table for state TPS_InSignedIntFirst (entered from TPS_Base on
 * '-' or '+').  A digit is an A_NEXT | A_CLEAR row into TPS_InSignedInt;
 * EOF and anything else are A_POP rows.
 */
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InSignedInt.  Digits are A_NEXT rows; '.'
 * pushes TPS_InDecimalFirst and 'e'/'E' push TPS_InMantissaFirst.  EOF and
 * the final NULL-test row are A_BINGO rows tagged SIGNEDINT.
 */
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
/*
 * Action table for state TPS_InSpace.  EOF, '<', '-', '+', '&', '/' and the
 * final NULL-test row are A_BINGO rows tagged SPACE that target TPS_Base.
 * p_isignore matches and non-alphanumeric characters are A_NEXT rows.
 * Row order matters: the '<' row precedes the p_isignore row.
 */
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
/*
 * Action table for state TPS_InUDecimalFirst (pushed from
 * TPS_InUnsignedInt on '.').  A digit is an A_CLEAR row into
 * TPS_InUDecimal; EOF and anything else are A_POP rows.
 */
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InUDecimal.  Digits keep the state; '.' pushes
 * TPS_InVersionFirst and 'e'/'E' push TPS_InMantissaFirst.  EOF and the
 * final NULL-test row are A_BINGO rows tagged DECIMAL.
 */
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
/*
 * Action table for state TPS_InDecimalFirst (pushed from TPS_InSignedInt
 * on '.').  A digit is an A_CLEAR row into TPS_InDecimal; EOF and anything
 * else are A_POP rows.
 */
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InDecimal (signed decimal).  Digits keep the
 * state; '.' pushes TPS_InVerVersion (not TPS_InVersionFirst as in the
 * unsigned table) and 'e'/'E' push TPS_InMantissaFirst.  EOF and the final
 * NULL-test row are A_BINGO rows tagged DECIMAL.
 */
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
/*
 * Action table for state TPS_InVerVersion (pushed from TPS_InDecimal on
 * '.').  A digit is an A_RERUN row into TPS_InSVerVersion that also runs
 * SpecialVerVersion, which rewinds the position and zeroes the token
 * lengths.  EOF and anything else are A_POP rows.
 */
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InSVerVersion.  A digit is an
 * A_BINGO | A_CLRALL row tagged SPACE that targets TPS_InUnsignedInt; EOF
 * is an A_POP row and the final NULL-test row is A_NEXT with a TPS_Null
 * target.
 */
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InVersionFirst (pushed on '.' from
 * TPS_InUDecimal and TPS_InVersion).  A digit is an A_CLEAR row into
 * TPS_InVersion; EOF and anything else are A_POP rows.
 */
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InVersion.  Digits keep the state and '.'
 * pushes TPS_InVersionFirst again.  EOF and the final NULL-test row are
 * A_BINGO rows tagged VERSIONNUMBER.
 */
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
/*
 * Action table for state TPS_InMantissaFirst (pushed on 'e'/'E' from the
 * integer and decimal states).  A digit is an A_CLEAR row into
 * TPS_InMantissa; '+' or '-' go to TPS_InMantissaSign; EOF and anything
 * else are A_POP rows.
 */
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InMantissaSign (after the exponent sign).  A
 * digit is an A_CLEAR row into TPS_InMantissa; EOF and anything else are
 * A_POP rows.
 */
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InMantissa.  Digits keep the state; EOF and
 * the final NULL-test row are A_BINGO rows tagged SCIENTIFIC.
 */
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
/*
 * Action table for state TPS_InXMLEntityFirst (pushed from TPS_Base on
 * '&').  '#' goes to TPS_InXMLEntityNumFirst; an ASCII letter, ':' or '_'
 * goes to TPS_InXMLEntity; EOF and anything else are A_POP rows.
 */
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntity (named entity body).
 * Alphanumerics, ':', '_', '.' and '-' keep the state; ';' goes to
 * TPS_InXMLEntityEnd; EOF and anything else are A_POP rows.
 */
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
831 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
832 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
833 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntityNumFirst (after "&#").  'x' or 'X'
 * goes to TPS_InXMLEntityHexNumFirst; a digit goes to TPS_InXMLEntityNum;
 * EOF and anything else are A_POP rows.
 */
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
837 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
838 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
839 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
840 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
841 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntityHexNumFirst.  A hex digit goes to
 * TPS_InXMLEntityHexNum; EOF and anything else are A_POP rows.
 */
844 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
845 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
846 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
847 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntityNum (decimal character reference).
 * Digits keep the state; ';' goes to TPS_InXMLEntityEnd; EOF and anything
 * else are A_POP rows.
 */
850 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
851 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
852 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
853 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
854 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntityHexNum (hex character reference).
 * Hex digits keep the state; ';' goes to TPS_InXMLEntityEnd; EOF and
 * anything else are A_POP rows.
 */
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
858 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
859 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
860 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
861 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntityEnd (reached after ';').  Its only
 * row has a NULL test and is A_BINGO | A_CLEAR, tagged XMLENTITY, targeting
 * TPS_Base.
 */
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
865 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
/*
 * Action table for state TPS_InTagFirst (pushed from TPS_Base on '<').
 * '/', '!' and '?' push the close-tag, comment and XML-declaration states;
 * an ASCII letter, ':' or '_' pushes TPS_InTagName; EOF and anything else
 * are A_POP rows.
 */
868 static const TParserStateActionItem actionTPS_InTagFirst[] = {
869 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
870 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
871 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
872 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
873 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
874 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
875 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
876 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLBegin (after "<?").  Only 'x' is
 * accepted, leading to TPS_InTag; EOF and anything else are A_POP rows.
 */
879 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
880 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
882 /* XXX do we want states for the m and l ? Right now this accepts <?xZ */
883 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
884 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTagCloseFirst (after "</").  An ASCII
 * letter goes to TPS_InTagName; EOF and anything else are A_POP rows.
 */
887 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
888 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
889 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
890 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTagName.  '/' goes to TPS_InTagBeginEnd;
 * '>' goes to TPS_InTagEnd and whitespace to TPS_InTag, both running
 * SpecialTags; alphanumerics, ':', '_', '.' and '-' are A_NEXT rows with a
 * TPS_Null target; EOF and anything else are A_POP rows.
 */
893 static const TParserStateActionItem actionTPS_InTagName[] = {
894 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
896 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
897 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
898 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
899 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
900 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
901 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
902 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
903 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
904 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTagBeginEnd (after '/' inside a tag name).
 * '>' goes to TPS_InTagEnd; EOF and anything else are A_POP rows.
 */
907 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
908 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
909 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
910 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTag (tag body after the name).  '>' goes to
 * TPS_InTagEnd running SpecialTags; a single or double quote enters
 * TPS_InTagEscapeK / TPS_InTagEscapeKK.  ASCII letters, digits and the
 * listed punctuation ('=', '-', '#', '/', ':', '.', '&', '?', '%', '~') are
 * A_NEXT rows with a TPS_Null target, as is whitespace (which also runs
 * SpecialTags).  EOF and any other character are A_POP rows.
 */
913 static const TParserStateActionItem actionTPS_InTag[] = {
914 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
915 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
916 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
917 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
918 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
919 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
920 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
921 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
922 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
923 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
924 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
925 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
926 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
927 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
928 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
929 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
930 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
931 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTagEscapeK (inside a single-quoted value in
 * a tag).  A backslash pushes TPS_InTagBackSleshed; a closing single quote
 * returns to TPS_InTag; any other character keeps the state; EOF is an
 * A_POP row.
 */
934 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
935 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
936 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
937 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
938 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
/*
 * Action table for state TPS_InTagEscapeKK (inside a double-quoted value
 * in a tag).  A backslash pushes TPS_InTagBackSleshed; a closing double
 * quote returns to TPS_InTag; any other character keeps the state; EOF is
 * an A_POP row.
 */
941 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
942 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
943 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
944 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
945 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
/*
 * Action table for state TPS_InTagBackSleshed (character following a
 * backslash inside a quoted tag value).  Any character is an A_MERGE row;
 * EOF is an A_POP row.
 */
948 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
949 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
950 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InTagEnd (reached after '>').  Its only row
 * has a NULL test and is A_BINGO | A_CLRALL, tagged TAG_T, targeting
 * TPS_Base.
 */
953 static const TParserStateActionItem actionTPS_InTagEnd[] = {
954 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
/*
 * Action table for state TPS_InCommentFirst (after "<!").  '-' goes to
 * TPS_InCommentLast; 'D' or 'd' goes to TPS_InTag; EOF and anything else
 * are A_POP rows.
 */
957 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
958 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
959 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
961 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
962 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
963 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InCommentLast (after "<!-").  A second '-'
 * goes to TPS_InComment; EOF and anything else are A_POP rows.
 */
966 static const TParserStateActionItem actionTPS_InCommentLast[] = {
967 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
968 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
969 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InComment (comment body).  '-' goes to
 * TPS_InCloseCommentFirst; any other character is an A_NEXT row with a
 * TPS_Null target; EOF is an A_POP row.
 */
972 static const TParserStateActionItem actionTPS_InComment[] = {
973 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
974 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
975 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InCloseCommentFirst (one '-' seen inside a
 * comment).  A second '-' goes to TPS_InCloseCommentLast; any other
 * character returns to TPS_InComment; EOF is an A_POP row.
 */
978 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
979 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
980 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
981 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
/*
 * Action table for state TPS_InCloseCommentLast ("--" seen inside a
 * comment).  Further '-' characters are A_NEXT rows with a TPS_Null
 * target; '>' goes to TPS_InCommentEnd; any other character returns to
 * TPS_InComment; EOF is an A_POP row.
 */
984 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
985 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
986 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
987 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
988 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
/*
 * Action table for state TPS_InCommentEnd.  Its only row has a NULL test
 * and is A_BINGO | A_CLRALL, tagged TAG_T, targeting TPS_Base.
 */
991 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
992 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
/*
 * Action table for state TPS_InHostFirstDomain (pushed on '.' from the
 * word, integer and host states).  An ASCII letter goes to
 * TPS_InHostDomainSecond and a digit to TPS_InHost; EOF and anything else
 * are A_POP rows.
 */
995 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
996 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
997 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
998 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
999 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHostDomainSecond (one ASCII letter seen
 * after a '.').  A second ASCII letter goes to TPS_InHostDomain; a digit,
 * '-', '.' and '@' are A_PUSH rows into the host / e-mail states; EOF and
 * anything else are A_POP rows, so no HOST token is produced from here.
 */
1002 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1003 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1004 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1005 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1006 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1007 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1008 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1009 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHostDomain.  EOF and the final NULL-test row
 * are A_BINGO | A_CLRALL rows tagged HOST.  ASCII letters keep the state;
 * a digit, ':', '-', '.', '@' are A_PUSH rows into the host, port and
 * e-mail states.  p_isdigit appears a second time as an A_POP row.
 * p_isstophost yields HOST and targets TPS_InURLPathStart; '/' pushes
 * TPS_InFURL.
 * NOTE(review): presumably the second p_isdigit row is reached only after
 * the first A_PUSH alternative has been popped -- confirm against
 * TParserGet.
 */
1012 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1013 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1014 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1015 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1016 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1017 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1018 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1019 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1020 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1021 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1022 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1023 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
/*
 * Action table for state TPS_InPortFirst (pushed from TPS_InHostDomain on
 * ':').  A digit goes to TPS_InPort; EOF and anything else are A_POP rows.
 */
1026 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1027 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1028 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1029 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InPort.  Digits keep the state; p_isstophost
 * yields HOST and targets TPS_InURLPathStart; '/' pushes TPS_InFURL.  EOF
 * and the final NULL-test row are A_BINGO | A_CLRALL rows tagged HOST.
 */
1032 static const TParserStateActionItem actionTPS_InPort[] = {
1033 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1034 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1035 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1036 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1037 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
/*
 * Action table for state TPS_InHostFirstAN (pushed on '-' from the word
 * and host states).  A digit or ASCII letter goes to TPS_InHost; EOF and
 * anything else are A_POP rows.
 */
1040 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1041 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1042 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1043 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1044 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHost.  Digits and ASCII letters keep the
 * state; '@', '.' and '-' are A_PUSH rows into the e-mail and host
 * sub-states; EOF and anything else are A_POP rows, so this state never
 * produces a token by itself.
 */
1047 static const TParserStateActionItem actionTPS_InHost[] = {
1048 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1049 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1050 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1051 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1052 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1053 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1054 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InEmail (pushed on '@').  If p_ishost accepts
 * the remaining input, the row is A_BINGO | A_CLRALL tagged EMAIL;
 * otherwise the state is popped.  Unlike most tables there is no p_isEOF
 * row.
 */
1057 static const TParserStateActionItem actionTPS_InEmail[] = {
1058 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1059 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InFileFirst (first character after a '/').
 * An ASCII letter, digit or '_' goes to TPS_InFile; '.' goes to
 * TPS_InPathFirst; '?' pushes TPS_InURLPathFirst and '~' pushes
 * TPS_InFileTwiddle; EOF and anything else are A_POP rows.
 */
1062 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1063 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1064 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1065 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1066 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1067 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1068 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1069 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1070 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InFileTwiddle (after '~').  An ASCII letter,
 * digit or '_' goes to TPS_InFile; '/' goes to TPS_InFileFirst; EOF and
 * anything else are A_POP rows.
 */
1073 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1074 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1075 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1076 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1077 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1078 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1079 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InPathFirst (a '.' seen right after '/').  An
 * ASCII letter, digit or '_' goes to TPS_InFile; a second '.' goes to
 * TPS_InPathSecond; '/' goes back to TPS_InFileFirst; EOF and anything
 * else are A_POP rows.
 */
1082 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1083 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1084 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1085 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1086 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1087 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1088 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1089 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InPathFirstFirst (pushed from TPS_Base on a
 * leading '.').  A second '.' goes to TPS_InPathSecond; '/' goes to
 * TPS_InFileFirst; EOF and anything else are A_POP rows.
 */
1092 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1093 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1094 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1095 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1096 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InPathSecond (after "..").  EOF, whitespace
 * and the second '/' row are A_BINGO | A_CLEAR rows tagged FILEPATH.  The
 * first '/' row is A_NEXT | A_PUSH into TPS_InFileFirst (A_NEXT is 0x0000,
 * so the flags equal A_PUSH).  Anything else is an A_POP row.
 * NOTE(review): presumably the second '/' row is reached only after the
 * pushed alternative has been popped -- confirm against TParserGet.
 */
1099 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1100 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1101 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1102 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1103 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1104 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InFile.  ASCII letters, digits, '_' and '-'
 * keep the state; '.', '/' and '?' are A_PUSH rows into TPS_InFileNext,
 * TPS_InFileFirst and TPS_InURLPathFirst.  EOF and the final NULL-test row
 * are A_BINGO rows tagged FILEPATH.
 */
1107 static const TParserStateActionItem actionTPS_InFile[] = {
1108 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1109 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1110 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1111 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1112 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1113 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1114 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1115 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1116 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
/*
 * Action table for state TPS_InFileNext (pushed on '.' inside a word or
 * file name).  An ASCII letter, digit or '_' is an A_CLEAR row into
 * TPS_InFile; EOF and anything else are A_POP rows.
 */
1119 static const TParserStateActionItem actionTPS_InFileNext[] = {
1120 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1121 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1122 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1123 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1124 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InURLPathFirst (pushed on '?').  A double or
 * single quote, EOF and whitespace all end up as A_POP rows; any other
 * non-space character is an A_CLEAR row into TPS_InURLPath.
 */
1127 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1128 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1129 {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1130 {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1131 {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1132 {NULL, 0, A_POP, TPS_Null, 0, NULL},
/*
 * Action table for state TPS_InURLPathStart (targeted by the p_isstophost
 * rows of the host and port tables).  Its only row has a NULL test and is
 * A_NEXT into TPS_InURLPath.
 */
1135 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1136 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
/*
 * Action table for state TPS_InURLPath.  Non-space characters other than
 * quotes keep the state.  EOF, a double or single quote, and the final
 * NULL-test row (whitespace) are A_BINGO rows tagged URLPATH.
 */
1139 static const TParserStateActionItem actionTPS_InURLPath[] = {
1140 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1141 {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1142 {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1143 {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1144 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
/*
 * Action table for state TPS_InFURL (pushed on '/' after a host or port).
 * If p_isURLPath accepts the remaining input, the row is
 * A_BINGO | A_CLRALL tagged URL_T and runs SpecialFURL, which sets
 * wanthost and rewinds the position by the token length.  EOF and
 * anything else are A_POP rows.
 */
1147 static const TParserStateActionItem actionTPS_InFURL[] = {
1148 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1149 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1150 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InProtocolFirst (pushed from TPS_InAsciiWord
 * on ':').  '/' goes to TPS_InProtocolSecond; EOF and anything else are
 * A_POP rows.
 */
1153 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1154 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1156 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InProtocolSecond (after ":/").  A second '/'
 * goes to TPS_InProtocolEnd; EOF and anything else are A_POP rows.
 */
1159 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1160 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1161 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1162 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InProtocolEnd (after "://").  Its only row
 * has a NULL test and is A_BINGO | A_CLRALL, tagged PROTOCOL, targeting
 * TPS_Base.
 */
1165 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1166 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
/*
 * Action table for state TPS_InHyphenAsciiWordFirst (first character after
 * a '-' in an ASCII word).  An ASCII letter goes to TPS_InHyphenAsciiWord,
 * another letter to TPS_InHyphenWord, a digit to
 * TPS_InHyphenDigitLookahead; EOF and anything else are A_POP rows.
 */
1169 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1170 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1171 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1172 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1173 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1174 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHyphenAsciiWord.  ASCII letters keep the
 * state; other letters switch to TPS_InHyphenWord and digits to
 * TPS_InHyphenNumWord; '-' pushes TPS_InHyphenAsciiWordFirst.  EOF and the
 * final NULL-test row are A_BINGO | A_CLRALL rows tagged ASCIIHWORD that
 * target TPS_InParseHyphen and run SpecialHyphen, which rewinds the
 * position by the token length.
 */
1177 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1178 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1179 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1180 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1181 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1182 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1183 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
/*
 * Action table for state TPS_InHyphenWordFirst (first character after a
 * '-' in a word).  A letter goes to TPS_InHyphenWord and a digit to
 * TPS_InHyphenDigitLookahead; EOF and anything else are A_POP rows.
 */
1186 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1187 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1188 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1189 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1190 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHyphenWord.  Letters keep the state; digits
 * switch to TPS_InHyphenNumWord; '-' pushes TPS_InHyphenWordFirst.  EOF
 * and the final NULL-test row are A_BINGO | A_CLRALL rows tagged HWORD
 * that target TPS_InParseHyphen and run SpecialHyphen.
 */
1193 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1194 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1195 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1196 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1197 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1198 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
/*
 * Action table for state TPS_InHyphenNumWordFirst (first character after a
 * '-' in a letters-and-digits word).  A letter goes to TPS_InHyphenNumWord
 * and a digit to TPS_InHyphenDigitLookahead; EOF and anything else are
 * A_POP rows.
 */
1201 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1202 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1203 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1204 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1205 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InHyphenNumWord.  Alphanumerics keep the
 * state; '-' pushes TPS_InHyphenNumWordFirst.  EOF and the final NULL-test
 * row are A_BINGO | A_CLRALL rows tagged NUMHWORD that target
 * TPS_InParseHyphen and run SpecialHyphen.
 */
1208 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1209 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1210 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1211 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1212 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
/*
 * Rules for state TPS_InHyphenDigitLookahead.
 *
 * - p_isdigit: stay in this state.
 * - p_isalpha: go to TPS_InHyphenNumWord.
 * - p_isEOF, or default: A_POP restores the pushed position.
 *
 * Net effect of the table: a run of digits reached through this state only
 * continues as a hyphenated word when a p_isalpha character follows it;
 * otherwise the parser backs up to the pushed position.
 */
1215 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1216 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1217 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1218 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1219 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for state TPS_InParseHyphen, the state the hyphenated-word rules
 * switch to after reporting ASCIIHWORD, HWORD or NUMHWORD.
 *
 * - p_isEOF, or default: A_RERUN switches to TPS_Base and re-evaluates the
 *   same position there.
 * - p_isasclet: go to TPS_InHyphenAsciiWordPart.
 * - p_isalpha: go to TPS_InHyphenWordPart.
 * - p_isdigit: A_PUSH saves the position and tries TPS_InHyphenUnsignedInt.
 * - p_iseqC with '-': A_PUSH saves the position and tries
 *   TPS_InParseHyphenHyphen.
 */
1222 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1223 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1224 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1225 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1226 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1227 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1228 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
/*
 * Rules for state TPS_InParseHyphenHyphen, pushed by TPS_InParseHyphen on
 * seeing '-'.
 *
 * - p_isalnum: A_BINGO | A_CLEAR reports what has been consumed so far as a
 *   SPACE token, frees the one pushed position and returns to
 *   TPS_InParseHyphen.  (Presumably the consumed text is just the hyphen.)
 * - p_isEOF, or default: A_POP restores the pushed position.
 */
1231 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1232 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for state TPS_InHyphenWordPart.
 *
 * - p_isEOF: A_BINGO reports a PARTHWORD token and goes to TPS_Base.
 * - p_isalpha: stay in this state.
 * - p_isdigit: go to TPS_InHyphenNumWordPart.
 * - default: A_BINGO reports a PARTHWORD token and returns to
 *   TPS_InParseHyphen to look for the next part.
 */
1237 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1238 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1239 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1240 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1241 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenAsciiWordPart.
 *
 * - p_isEOF: A_BINGO reports an ASCIIPARTHWORD token and goes to TPS_Base.
 * - p_isasclet: stay in this state.
 * - p_isalpha (only reached when p_isasclet did not match): go to
 *   TPS_InHyphenWordPart.
 * - p_isdigit: go to TPS_InHyphenNumWordPart.
 * - default: A_BINGO reports an ASCIIPARTHWORD token and returns to
 *   TPS_InParseHyphen to look for the next part.
 */
1244 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1245 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1246 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1247 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1248 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1249 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenNumWordPart.
 *
 * - p_isEOF: A_BINGO reports a NUMPARTHWORD token and goes to TPS_Base.
 * - p_isalnum: stay in this state.
 * - default: A_BINGO reports a NUMPARTHWORD token and returns to
 *   TPS_InParseHyphen to look for the next part.
 */
1252 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1253 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1254 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1255 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenUnsignedInt, pushed by TPS_InParseHyphen on
 * seeing a digit.
 *
 * - p_isdigit: target state is TPS_Null, which TParserGet treats as "keep
 *   the current state", so the digit run is consumed here.
 * - p_isalpha: A_CLEAR frees the one pushed position (no going back) and
 *   continues in TPS_InHyphenNumWordPart.
 * - p_isEOF, or default: A_POP restores the pushed position.
 */
1258 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1259 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1261 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1262 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1267 * main table of per-state parser actions
/*
 * One row of the Actions[] table: the rule list for a parser state, the
 * state value it belongs to, and (trace builds only) the state name.
 */
1271 const TParserStateActionItem *action; /* the actual state info */
1272 TParserState state; /* only for Assert crosscheck */
1273 #ifdef WPARSER_TRACE
1274 const char *state_name; /* only for debug printout */
1276 } TParserStateAction;
/*
 * TPARSERSTATEACTION(state) expands to one TParserStateAction initializer:
 * CppConcat(action,state) names the rule array for the state, followed by
 * the state value itself.  The WPARSER_TRACE variant also stores the state
 * name as a string via CppAsString.
 */
1278 #ifdef WPARSER_TRACE
1279 #define TPARSERSTATEACTION(state) \
1280 { CppConcat(action,state), state, CppAsString(state) }
1282 #define TPARSERSTATEACTION(state) \
1283 { CppConcat(action,state), state }
1287 * order must be the same as in typedef enum {} TParserState!!
/*
 * Per-state rule lists, indexed by TParserState.  TParserGet fetches
 * Actions[prs->state->state].action and asserts that the stored state field
 * equals the index, so every entry must sit at the position of its own enum
 * value.
 */
1290 static const TParserStateAction Actions[] = {
1291 TPARSERSTATEACTION(TPS_Base),
1292 TPARSERSTATEACTION(TPS_InNumWord),
1293 TPARSERSTATEACTION(TPS_InAsciiWord),
1294 TPARSERSTATEACTION(TPS_InWord),
1295 TPARSERSTATEACTION(TPS_InUnsignedInt),
1296 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1297 TPARSERSTATEACTION(TPS_InSignedInt),
1298 TPARSERSTATEACTION(TPS_InSpace),
1299 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1300 TPARSERSTATEACTION(TPS_InUDecimal),
1301 TPARSERSTATEACTION(TPS_InDecimalFirst),
1302 TPARSERSTATEACTION(TPS_InDecimal),
1303 TPARSERSTATEACTION(TPS_InVerVersion),
1304 TPARSERSTATEACTION(TPS_InSVerVersion),
1305 TPARSERSTATEACTION(TPS_InVersionFirst),
1306 TPARSERSTATEACTION(TPS_InVersion),
1307 TPARSERSTATEACTION(TPS_InMantissaFirst),
1308 TPARSERSTATEACTION(TPS_InMantissaSign),
1309 TPARSERSTATEACTION(TPS_InMantissa),
1310 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1311 TPARSERSTATEACTION(TPS_InXMLEntity),
1312 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1313 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1314 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1315 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1316 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1317 TPARSERSTATEACTION(TPS_InTagFirst),
1318 TPARSERSTATEACTION(TPS_InXMLBegin),
1319 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1320 TPARSERSTATEACTION(TPS_InTagName),
1321 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1322 TPARSERSTATEACTION(TPS_InTag),
1323 TPARSERSTATEACTION(TPS_InTagEscapeK),
1324 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1325 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1326 TPARSERSTATEACTION(TPS_InTagEnd),
1327 TPARSERSTATEACTION(TPS_InCommentFirst),
1328 TPARSERSTATEACTION(TPS_InCommentLast),
1329 TPARSERSTATEACTION(TPS_InComment),
1330 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1331 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1332 TPARSERSTATEACTION(TPS_InCommentEnd),
1333 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1334 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1335 TPARSERSTATEACTION(TPS_InHostDomain),
1336 TPARSERSTATEACTION(TPS_InPortFirst),
1337 TPARSERSTATEACTION(TPS_InPort),
1338 TPARSERSTATEACTION(TPS_InHostFirstAN),
1339 TPARSERSTATEACTION(TPS_InHost),
1340 TPARSERSTATEACTION(TPS_InEmail),
1341 TPARSERSTATEACTION(TPS_InFileFirst),
1342 TPARSERSTATEACTION(TPS_InFileTwiddle),
1343 TPARSERSTATEACTION(TPS_InPathFirst),
1344 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1345 TPARSERSTATEACTION(TPS_InPathSecond),
1346 TPARSERSTATEACTION(TPS_InFile),
1347 TPARSERSTATEACTION(TPS_InFileNext),
1348 TPARSERSTATEACTION(TPS_InURLPathFirst),
1349 TPARSERSTATEACTION(TPS_InURLPathStart),
1350 TPARSERSTATEACTION(TPS_InURLPath),
1351 TPARSERSTATEACTION(TPS_InFURL),
1352 TPARSERSTATEACTION(TPS_InProtocolFirst),
1353 TPARSERSTATEACTION(TPS_InProtocolSecond),
1354 TPARSERSTATEACTION(TPS_InProtocolEnd),
1355 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1356 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1357 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1358 TPARSERSTATEACTION(TPS_InHyphenWord),
1359 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1360 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1361 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1362 TPARSERSTATEACTION(TPS_InParseHyphen),
1363 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1364 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1365 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1366 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1367 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
/*
 * TParserGet: drive the state machine from the current position until one
 * token has been recognized or the input is exhausted.
 *
 * prs->token is pointed at the current position before scanning starts.
 * When a rule flagged A_BINGO fires, prs->type is taken from the rule and
 * prs->lenbytetoken / prs->lenchartoken from the current position record,
 * whose own token lengths are then reset to zero.
 *
 * Returns true when the last applied rule had A_BINGO set, false otherwise.
 */
1372 TParserGet(TParser *prs)
1374 const TParserStateActionItem *item = NULL;
/* position is already at or past the end of the input */
1378 if (prs->state->posbyte >= prs->lenstr)
1381 prs->token = prs->str + prs->state->posbyte;
1382 prs->state->pushedAtAction = NULL;
1384 /* look at string */
/* "<=" on purpose: one extra pass runs at end of input with charlen 0 */
1385 while (prs->state->posbyte <= prs->lenstr)
1387 if (prs->state->posbyte == prs->lenstr)
1388 prs->state->charlen = 0;
/* single-byte encodings skip the pg_mblen call */
1390 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1391 pg_mblen(prs->str + prs->state->posbyte);
1393 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1394 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
/* cross-check that Actions[] is ordered like the state enum */
1395 Assert(Actions[prs->state->state].state == prs->state->state);
1397 if (prs->state->pushedAtAction)
1399 /* After a POP, pick up at the next test */
1400 item = prs->state->pushedAtAction + 1;
1401 prs->state->pushedAtAction = NULL;
1405 item = Actions[prs->state->state].action;
1406 Assert(item != NULL);
1409 /* find action by character class */
/* a NULL isclass marks the default rule and ends the search */
1410 while (item->isclass)
1413 if (item->isclass(prs) != 0)
1418 #ifdef WPARSER_TRACE
1420 TParserPosition *ptr;
1422 fprintf(stderr, "state ");
1423 /* indent according to stack depth */
1424 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1425 fprintf(stderr, " ");
1426 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
/* NOTE(review): prints one byte only; presumably a multibyte character will not display whole */
1427 if (prs->state->posbyte < prs->lenstr)
1428 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1430 fprintf(stderr, "at EOF");
1431 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1432 (int) (item - Actions[prs->state->state].action),
1433 (item->flags & A_BINGO) ? " BINGO" : "",
1434 (item->flags & A_POP) ? " POP" : "",
1435 (item->flags & A_PUSH) ? " PUSH" : "",
1436 (item->flags & A_RERUN) ? " RERUN" : "",
1437 (item->flags & A_CLEAR) ? " CLEAR" : "",
1438 (item->flags & A_MERGE) ? " MERGE" : "",
1439 (item->flags & A_CLRALL) ? " CLRALL" : "",
1440 (item->tostate != TPS_Null) ? " tostate " : "",
1441 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1442 (item->type > 0) ? " type " : "",
1443 tok_alias[item->type]);
1447 /* call special handler if exists */
1451 /* BINGO, token is found */
1452 if (item->flags & A_BINGO)
1454 Assert(item->type > 0);
1455 prs->lenbytetoken = prs->state->lenbytetoken;
1456 prs->lenchartoken = prs->state->lenchartoken;
1457 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1458 prs->type = item->type;
1461 /* do various actions by flags */
/* the stack flags below are tested as an if / else-if chain: only the first one set is acted on */
1462 if (item->flags & A_POP)
1463 { /* pop stored state in stack */
1464 TParserPosition *ptr = prs->state->prev;
1470 else if (item->flags & A_PUSH)
1471 { /* push (store) state in stack */
1472 prs->state->pushedAtAction = item; /* remember where we push */
1473 prs->state = newTParserPosition(prs->state);
1475 else if (item->flags & A_CLEAR)
1476 { /* clear previous pushed state */
1477 TParserPosition *ptr;
1479 Assert(prs->state->prev);
1480 ptr = prs->state->prev->prev;
1481 pfree(prs->state->prev);
1482 prs->state->prev = ptr;
1484 else if (item->flags & A_CLRALL)
1485 { /* clear all previous pushed state */
1486 TParserPosition *ptr;
1488 while (prs->state->prev)
1490 ptr = prs->state->prev->prev;
1491 pfree(prs->state->prev);
1492 prs->state->prev = ptr;
1495 else if (item->flags & A_MERGE)
1496 { /* merge posinfo with current and pushed state */
1497 TParserPosition *ptr = prs->state;
1499 Assert(prs->state->prev);
1500 prs->state = prs->state->prev;
/* carry the position and token lengths of the newer record into the older one */
1502 prs->state->posbyte = ptr->posbyte;
1503 prs->state->poschar = ptr->poschar;
1504 prs->state->charlen = ptr->charlen;
1505 prs->state->lenbytetoken = ptr->lenbytetoken;
1506 prs->state->lenchartoken = ptr->lenchartoken;
1510 /* set new state if pointed */
/* TPS_Null as target means: keep the current state */
1511 if (item->tostate != TPS_Null)
1512 prs->state->state = item->tostate;
1514 /* check for go away */
/* stop on a token, or at end of input unless the rule asked for a rerun */
1515 if ((item->flags & A_BINGO) ||
1516 (prs->state->posbyte >= prs->lenstr &&
1517 (item->flags & A_RERUN) == 0))
1520 /* go to beginning of loop if we should rerun or we just restore state */
1521 if (item->flags & (A_RERUN | A_POP))
/* consume the current character; charlen is 0 at end of input */
1525 if (prs->state->charlen)
1527 prs->state->posbyte += prs->state->charlen;
1528 prs->state->lenbytetoken += prs->state->charlen;
1529 prs->state->poschar++;
1530 prs->state->lenchartoken++;
1534 return (item && (item->flags & A_BINGO)) ? true : false;
/*
 * prsd_lextype: return a freshly palloc'd array describing the token types.
 *
 * The array has LASTNUM + 1 entries.  Entry i - 1 (for i in 1..LASTNUM)
 * carries lexid i plus pstrdup'd copies of tok_alias[i] and lex_descr[i];
 * the final entry has lexid 0 and acts as the end marker.
 */
1538 prsd_lextype(PG_FUNCTION_ARGS)
1540 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
/* token ids start at 1, array slots at 0 */
1543 for (i = 1; i <= LASTNUM; i++)
1545 descr[i - 1].lexid = i;
1546 descr[i - 1].alias = pstrdup(tok_alias[i]);
1547 descr[i - 1].descr = pstrdup(lex_descr[i]);
/* terminating entry */
1550 descr[LASTNUM].lexid = 0;
1552 PG_RETURN_POINTER(descr);
/*
 * prsd_start: create a parser for the given input.
 *
 * Argument 0 is passed to TParserInit as a char pointer and argument 1 as
 * an int32 (presumably the text and its length); the pointer TParserInit
 * returns is handed back to the caller.
 */
1556 prsd_start(PG_FUNCTION_ARGS)
1558 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
/*
 * prsd_nexttoken: report the next token of a parser.
 *
 * Argument 0 is the TParser, argument 1 a char ** output and argument 2 an
 * int * output.  The visible code stores the token byte length through
 * argument 2 and returns the token type as int32.
 * NOTE(review): the step that obtains the token and fills argument 1 is not
 * visible in this excerpt -- confirm against the full function.
 */
1562 prsd_nexttoken(PG_FUNCTION_ARGS)
1564 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1565 char **t = (char **) PG_GETARG_POINTER(1);
1566 int *tlen = (int *) PG_GETARG_POINTER(2);
/* length is reported in bytes, not characters */
1572 *tlen = p->lenbytetoken;
1574 PG_RETURN_INT32(p->type);
/*
 * prsd_end: end-of-parse entry point.  Argument 0 is the TParser pointer.
 * NOTE(review): what is done with the parser is not visible in this
 * excerpt; presumably it is released -- confirm against the full function.
 */
1578 prsd_end(PG_FUNCTION_ARGS)
1580 TParser *p = (TParser *) PG_GETARG_POINTER(0);
/*
 * Token-class predicates over the output token categories.  Most of them
 * expand their argument more than once, so pass only expressions without
 * side effects.
 */
1586 #define LEAVETOKEN(x) ( (x)==SPACE )
/* the whole-URL and whole-hyphenated-word token types */
1587 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1588 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
1590 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* types for which prsd_headline sets replace = 1; XMLHLIDIGNORE is the same list without TAG_T */
1591 #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1592 #define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* SPACE plus everything in HLIDIGNORE; prsd_headline tests this when measuring a cover in words */
1593 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
/* prsd_headline tests this, together with the ShortWord length, when picking where a headline may end */
1594 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/* first word of the span under test; hlCover points this at prs->words[*p] */
1598 HeadlineWordEntry *words;
/*
 * checkcondition_HL: operand test passed to TS_execute by hlCover.
 *
 * checkval is an hlCheck; its len words are scanned for an entry whose item
 * pointer is exactly val (pointer identity, not text comparison).
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably a match yields true and no match yields false.
 */
1603 checkcondition_HL(void *checkval, QueryOperand *val)
1607 for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1609 if (((hlCheck *) checkval)->words[i].item == val)
/*
 * hlCover: look for the next span of prs->words, reported through *p and
 * *q as word indexes, that satisfies the query.  prsd_headline calls it in
 * a while loop and treats the result as a boolean.
 *
 * Visible steps:
 *  1. for every QI_VAL item of the query, scan prs->words forward from pos
 *     for the entry whose item pointer is that operand;
 *  2. restart at the first query item and, for every QI_VAL item, scan
 *     backward from *q down to pos for the same kind of match;
 *  3. wrap words[*p .. *q] in an hlCheck and evaluate the query over it
 *     with TS_execute and checkcondition_HL;
 *  4. otherwise call hlCover again with the same arguments.
 * NOTE(review): how pos, *p and *q are updated between these steps, and the
 * returns taken on success or failure, are not visible in this excerpt.
 */
1617 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1621 QueryItem *item = GETQUERY(query);
1627 for (j = 0; j < query->size; j++)
/* only operands (QI_VAL) are looked up in the word list */
1629 if (item->type != QI_VAL)
1634 for (i = pos; i < prs->curwords; i++)
1636 if (prs->words[i].item == &item->operand)
/* second pass over the query items, this time scanning backward */
1649 item = GETQUERY(query);
1650 for (j = 0; j < query->size; j++)
1652 if (item->type != QI_VAL)
1657 for (i = *q; i >= pos; i--)
1659 if (prs->words[i].item == &item->operand)
/* test whether the candidate span alone satisfies the query */
1673 ch.words = &(prs->words[*p]);
1674 ch.len = *q - *p + 1;
1675 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1680 return hlCover(prs, query, p, q);
/*
 * prsd_headline: choose which words of a parsed document form the headline.
 *
 * Arguments: 0 = HeadlineParsedText to annotate, 1 = List of DefElem
 * options, 2 = the TSQuery.  Returns the same HeadlineParsedText pointer.
 *
 * Recognized options (names compared case-insensitively): MaxWords,
 * MinWords, ShortWord (integers read with pg_atoi), StartSel, StopSel
 * (strings, copied), HighlightAll (true for 1, on, true, t, y, yes).  Any
 * other name raises ERRCODE_INVALID_PARAMETER_VALUE, as do the range checks
 * on MinWords, MaxWords and ShortWord.
 *
 * The covers returned by hlCover are compared to pick a best span
 * bestb..beste; every word in that span gets its in flag set (unless it is
 * a repeat), words with a query item are marked selected, and ignorable
 * token types are marked replace.
 * NOTE(review): option defaults and several statements of the selection
 * logic are not visible in this excerpt.
 */
1688 prsd_headline(PG_FUNCTION_ARGS)
1690 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1691 List *prsoptions = (List *) PG_GETARG_POINTER(1);
1692 TSQuery query = PG_GETARG_TSQUERY(2);
1694 /* values from options, plus start and stop tags */
/* NULL here means: not supplied through the options list */
1714 prs->startsel = NULL;
1715 prs->stopsel = NULL;
1716 foreach(l, prsoptions)
1718 DefElem *defel = (DefElem *) lfirst(l);
1719 char *val = defGetString(defel);
1721 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1722 max_words = pg_atoi(val, sizeof(int32), 0);
1723 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1724 min_words = pg_atoi(val, sizeof(int32), 0);
1725 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1726 shortword = pg_atoi(val, sizeof(int32), 0);
1727 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1728 prs->startsel = pstrdup(val);
1729 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1730 prs->stopsel = pstrdup(val);
1731 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
/* any value outside this list turns highlight off */
1732 highlight = (pg_strcasecmp(val, "1") == 0 ||
1733 pg_strcasecmp(val, "on") == 0 ||
1734 pg_strcasecmp(val, "true") == 0 ||
1735 pg_strcasecmp(val, "t") == 0 ||
1736 pg_strcasecmp(val, "y") == 0 ||
1737 pg_strcasecmp(val, "yes") == 0);
1740 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1741 errmsg("unrecognized headline parameter: \"%s\"",
/* equal values are rejected too */
1747 if (min_words >= max_words)
1749 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1750 errmsg("MinWords should be less than MaxWords")));
1753 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1754 errmsg("MinWords should be positive")));
1757 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1758 errmsg("ShortWord should be >= 0")));
/* examine each cover hlCover reports, in turn */
1760 while (hlCover(prs, query, &p, &q))
1762 /* find cover len in words */
1765 for (i = p; i <= q && curlen < max_words; i++)
1767 if (!NONWORDTOKEN(prs->words[i].type))
1769 if (prs->words[i].item && !prs->words[i].repeated)
1774 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1776 /* best already found, so try one more cover */
1782 if (curlen < max_words)
1783 { /* find good end */
1784 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1788 if (!NONWORDTOKEN(prs->words[i].type))
1790 if (prs->words[i].item && !prs->words[i].repeated)
1794 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1796 if (curlen >= min_words)
1799 if (curlen < min_words && i >= prs->curwords)
1800 { /* got end of text and our cover is shoter
1802 for (i = p; i >= 0; i--)
1804 if (!NONWORDTOKEN(prs->words[i].type))
1806 if (prs->words[i].item && !prs->words[i].repeated)
1808 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1810 if (curlen >= min_words)
1813 posb = (i >= 0) ? i : 0;
1817 { /* shorter cover :((( */
1818 for (; curlen > min_words; i--)
1820 if (!NONWORDTOKEN(prs->words[i].type))
1822 if (prs->words[i].item && !prs->words[i].repeated)
1825 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
/*
 * Conditions for replacing the current best: no best yet; or more query
 * words and an acceptable end word; or an acceptable end word where the
 * current best has an unacceptable one.
 */
1831 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1832 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1833 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
/* scan from the start of the document, stopping once min_words is reached */
1846 for (i = 0; i < prs->curwords && curlen < min_words; i++)
1848 if (!NONWORDTOKEN(prs->words[i].type))
1859 beste = prs->curwords - 1;
/* annotate every word of the chosen span */
1862 for (i = bestb; i <= beste; i++)
1864 if (prs->words[i].item)
1865 prs->words[i].selected = 1;
1868 if (HLIDIGNORE(prs->words[i].type))
1869 prs->words[i].replace = 1;
1873 if (XMLHLIDIGNORE(prs->words[i].type))
1874 prs->words[i].replace = 1;
/* repeated words are left out of the headline */
1877 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
/* presumably applied only when the option was not supplied -- the guarding tests are not visible here */
1881 prs->startsel = pstrdup("<b>");
1883 prs->stopsel = pstrdup("</b>");
1884 prs->startsellen = strlen(prs->startsel);
1885 prs->stopsellen = strlen(prs->stopsel);
1887 PG_RETURN_POINTER(prs);