1 /*-------------------------------------------------------------------------
4 * Default text search parser
6 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.12 2007/11/25 15:37:11 adunstan Exp $
12 *-------------------------------------------------------------------------
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
29 /* Output token categories */
38 #define VERSIONNUMBER 8
39 #define NUMPARTHWORD 9
41 #define ASCIIPARTHWORD 11
52 #define UNSIGNEDINT 22
57 static const char *const tok_alias[] = {
84 static const char *const lex_descr[] = {
88 "Word, letters and digits",
92 "Scientific notation",
94 "Hyphenated word part, letters and digits",
95 "Hyphenated word part, all letters",
96 "Hyphenated word part, all ASCII",
100 "Hyphenated word, letters and digits",
101 "Hyphenated word, all ASCII",
102 "Hyphenated word, all letters",
121 TPS_InSignedIntFirst,
135 TPS_InXMLEntityFirst,
137 TPS_InXMLEntityNumFirst,
139 TPS_InXMLEntityHexNumFirst,
140 TPS_InXMLEntityHexNum,
150 TPS_InTagBackSleshed,
155 TPS_InCloseCommentFirst,
156 TPS_InCloseCommentLast,
158 TPS_InHostFirstDomain,
159 TPS_InHostDomainSecond,
169 TPS_InPathFirstFirst,
178 TPS_InProtocolSecond,
180 TPS_InHyphenAsciiWordFirst,
181 TPS_InHyphenAsciiWord,
182 TPS_InHyphenWordFirst,
184 TPS_InHyphenNumWordFirst,
186 TPS_InHyphenDigitLookahead,
188 TPS_InParseHyphenHyphen,
189 TPS_InHyphenWordPart,
190 TPS_InHyphenAsciiWordPart,
191 TPS_InHyphenNumWordPart,
192 TPS_InHyphenUnsignedInt,
193 TPS_Null /* last state (fake value) */
196 /* forward declaration */
199 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
201 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
202 * special cases... */
206 TParserCharTest isclass;
209 TParserState tostate;
211 TParserSpecial special;
212 } TParserStateActionItem;
214 /* Flag bits in TParserStateActionItem.flags */
/*
 * A_NEXT is zero: it names the absence of any other action bit rather than
 * a bit that can be tested with a mask, so a combination such as
 * "A_NEXT | A_CLEAR" has the same value as A_CLEAR alone.  The other values
 * are distinct single bits and are OR-ed together in the action tables
 * (for example "A_BINGO | A_CLRALL").
 * NOTE(review): the tables also use A_POP; its definition is on a line not
 * visible in this excerpt (presumably 0x0002) -- confirm.
 */
215 #define A_NEXT 0x0000
216 #define A_BINGO 0x0001
218 #define A_PUSH 0x0004
219 #define A_RERUN 0x0008
220 #define A_CLEAR 0x0010
221 #define A_MERGE 0x0020
222 #define A_CLRALL 0x0040
224 typedef struct TParserPosition
226 int posbyte; /* position of parser in bytes */
227 int poschar; /* position of parser in characters */
228 int charlen; /* length of current char */
229 int lenbytetoken; /* length of token-so-far in bytes */
230 int lenchartoken; /* and in chars */
232 struct TParserPosition *prev;
233 const TParserStateActionItem *pushedAtAction;
236 typedef struct TParser
238 /* string and position information */
239 char *str; /* input text in the database (possibly multibyte) encoding */
240 int lenstr; /* length of str in bytes; p_isEOF compares posbyte to it */
242 wchar_t *wstr; /* wide character copy of str, filled by TParserInit */
243 int lenwstr; /* length of wstr, as returned by char2wchar */
249 TParserPosition *state; /* current position; entries chain through ->prev */
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
/*
 * Allocate a TParserPosition with palloc.
 *
 * The new entry is filled either with a byte copy of *prev or with zeroes.
 * The test that selects between the two is on a line not visible here;
 * presumably it is "prev is not NULL", since TParserInit passes NULL for the
 * very first position -- confirm.  pushedAtAction is reset to NULL on the
 * new entry, so it never inherits the value copied from prev.
 */
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
271 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
274 memcpy(res, prev, sizeof(TParserPosition));
276 memset(res, 0, sizeof(TParserPosition));
280 res->pushedAtAction = NULL;
286 TParserInit(char *str, int len)
/*
 * Build a zero-initialised parser (palloc0) for the given text and put it
 * in state TPS_Base with a fresh position.  The caller releases it with
 * TParserClose.
 */
288 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 prs->charmaxlen = pg_database_encoding_max_length();
297 * Use wide char code only when max encoding length > 1.
299 if (prs->charmaxlen > 1)
/*
 * Room for lenstr + 1 wide characters.  NOTE(review): presumably an upper
 * bound because every input character occupies at least one byte, plus one
 * slot for a terminator written by char2wchar -- confirm.
 */
302 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304 prs->str, prs->lenstr);
308 prs->usewide = false;
310 prs->state = newTParserPosition(NULL);
311 prs->state->state = TPS_Base;
314 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
321 TParserClose(TParser *prs)
325 TParserPosition *ptr = prs->state->prev;
340 * Character-type support functions, equivalent to the is* macros, but
341 * working with any possible encoding and locale. Note that with a
342 * multibyte encoding and the C locale, the isw* functions may fail
343 * or give wrong results. Note 2: a multibyte encoding with the C locale
344 * is often used for Asian languages.
/*
 * p_iswhat(type) generates the character tests p_is<type>() and
 * p_isnot<type>() (the latter is simply the negation of the former).
 *
 * In wide mode the character at state->poschar in wstr is classified: under
 * the C locale with the narrow is<type>() applied to its low byte, otherwise
 * with isw<type>().  When wide mode is off, the byte at state->posbyte in
 * str is classified with is<type>().
 *
 * The wide character is converted to wint_t by value, exactly as the
 * hand-written p_isalnum and p_isalpha do.  Reading it through a
 * (wint_t *) pointer cast is only correct on platforms where wchar_t and
 * wint_t have the same size and representation.
 */
349 #define p_iswhat(type) \
351 p_is##type(TParser *prs) { \
352 Assert( prs->state ); \
353 if ( prs->usewide ) \
355 if ( lc_ctype_is_c() ) \
356 return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
358 return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
361 return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
365 p_isnot##type(TParser *prs) { \
366 return !p_is##type(prs); \
370 p_isalnum(TParser *prs)
378 unsigned int c = *(prs->wstr + prs->state->poschar);
381 * any non-ascii symbol with multibyte encoding with C-locale is
387 return isalnum(0xff & c);
390 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
393 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
396 p_isnotalnum(TParser *prs)
398 return !p_isalnum(prs);
402 p_isalpha(TParser *prs)
410 unsigned int c = *(prs->wstr + prs->state->poschar);
413 * any non-ascii symbol with multibyte encoding with C-locale is
419 return isalpha(0xff & c);
422 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
425 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
429 p_isnotalpha(TParser *prs)
431 return !p_isalpha(prs);
434 /* p_iseq should be used only for ascii symbols */
437 p_iseq(TParser *prs, char c)
440 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
442 #else /* TS_USE_WIDE */
444 #define p_iswhat(type) \
446 p_is##type(TParser *prs) { \
447 Assert( prs->state ); \
448 return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
452 p_isnot##type(TParser *prs) { \
453 return !p_is##type(prs); \
458 p_iseq(TParser *prs, char c)
461 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
466 #endif /* TS_USE_WIDE */
477 p_isEOF(TParser *prs)
/*
 * Returns 1 at end of input, else 0: either the byte position has reached
 * lenstr, or the current character has zero length.
 */
480 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 p_iseqC(TParser *prs)
486 return p_iseq(prs, prs->c);
490 p_isneC(TParser *prs)
492 return !p_iseq(prs, prs->c);
496 p_isascii(TParser *prs)
498 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 p_isasclet(TParser *prs)
504 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 /* deliberately suppress unused-function complaints for the above */
509 void _make_compiler_happy(void);
511 _make_compiler_happy(void)
538 SpecialTags(TParser *prs)
540 switch (prs->state->lenchartoken)
542 case 8: /* </script */
543 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
546 case 7: /* <script || </style */
547 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
549 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
553 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
562 SpecialFURL(TParser *prs)
/*
 * Special handler used by the TPS_InFURL table: set wanthost (p_isstophost
 * clears it again) and move the position back by the byte and character
 * length of the token scanned so far.  The token lengths themselves are
 * left untouched.
 */
564 prs->wanthost = true;
565 prs->state->posbyte -= prs->state->lenbytetoken;
566 prs->state->poschar -= prs->state->lenchartoken;
570 SpecialHyphen(TParser *prs)
/*
 * Special handler used by the hyphenated-word tables: move the position
 * back by the byte and character length of the token scanned so far.
 * NOTE(review): presumably so that TPS_InParseHyphen can re-read the same
 * text and emit its parts -- confirm against TParserGet.
 */
572 prs->state->posbyte -= prs->state->lenbytetoken;
573 prs->state->poschar -= prs->state->lenchartoken;
577 SpecialVerVersion(TParser *prs)
/*
 * Special handler used by the TPS_InVerVersion table: move the position
 * back by the length of the token scanned so far and then reset both token
 * lengths to zero, i.e. discard the token and start over at its beginning.
 */
579 prs->state->posbyte -= prs->state->lenbytetoken;
580 prs->state->poschar -= prs->state->lenchartoken;
581 prs->state->lenbytetoken = 0;
582 prs->state->lenchartoken = 0;
586 p_isstophost(TParser *prs)
590 prs->wanthost = false;
597 p_isignore(TParser *prs)
599 return (prs->ignore) ? 1 : 0;
603 p_ishost(TParser *prs)
/*
 * Look ahead for a host name: run a temporary parser over the rest of the
 * input, starting at the current byte position.
 */
605 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
608 if (TParserGet(tmpprs) && tmpprs->type == HOST)
/*
 * The first token of the remainder is a HOST: advance this parser's
 * position and grow its token-so-far by the size of that token, and adopt
 * the sub-parser's current character length.
 */
610 prs->state->posbyte += tmpprs->lenbytetoken;
611 prs->state->poschar += tmpprs->lenchartoken;
612 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613 prs->state->lenchartoken += tmpprs->lenchartoken;
614 prs->state->charlen = tmpprs->state->charlen;
/* release the temporary parser */
617 TParserClose(tmpprs);
623 p_isURLPath(TParser *prs)
/*
 * Look ahead for a URL path: run a temporary parser over the rest of the
 * input, starting at the current byte position.
 */
625 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
/*
 * Stack a second position on top of the initial one and start it in
 * TPS_InFileFirst instead of TPS_Base.
 */
628 tmpprs->state = newTParserPosition(tmpprs->state);
629 tmpprs->state->state = TPS_InFileFirst;
631 if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
/*
 * Either token type is accepted.  Advance this parser's position and grow
 * its token-so-far by the size of the sub-token, and adopt the sub-parser's
 * current character length.
 */
633 prs->state->posbyte += tmpprs->lenbytetoken;
634 prs->state->poschar += tmpprs->lenchartoken;
635 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636 prs->state->lenchartoken += tmpprs->lenchartoken;
637 prs->state->charlen = tmpprs->state->charlen;
/* release the temporary parser */
640 TParserClose(tmpprs);
646 * Table of state/action of parser
/*
 * Actions for TPS_Base, the state TParserInit puts a new parser in.
 *
 * Row layout, as used by every table below (see TParserStateActionItem):
 * character test, character argument (non-zero only in p_iseqC rows, which
 * compare the current character with it), action flags, next state, token
 * type, special handler.
 * NOTE(review): how a NULL test in the last row and TPS_Null as next state
 * are interpreted is decided by TParserGet, which is outside this excerpt;
 * presumably "matches anything" and "stay in the current state" -- confirm.
 */
649 static const TParserStateActionItem actionTPS_Base[] = {
650 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
/*
 * Actions for TPS_InAsciiWord.
 *
 * '.', '-' and p_isdigit each appear in two consecutive rows: the first
 * pushes into a host-name state (TPS_InHostFirstDomain, TPS_InHostFirstAN,
 * TPS_InHost), the second names an alternative (TPS_InFileNext,
 * TPS_InHyphenAsciiWordFirst, TPS_InNumWord).
 * NOTE(review): the second row can only be reached if matching resumes after
 * the row recorded in pushedAtAction once the pushed attempt has failed --
 * presumably that is what TParserGet does; confirm.
 */
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
692 static const TParserStateActionItem actionTPS_InWord[] = {
693 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716 {NULL, 0, A_POP, TPS_Null, 0, NULL}
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743 {NULL, 0, A_POP, TPS_Null, 0, NULL}
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758 {NULL, 0, A_POP, TPS_Null, 0, NULL}
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773 {NULL, 0, A_POP, TPS_Null, 0, NULL}
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786 {NULL, 0, A_POP, TPS_Null, 0, NULL}
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801 {NULL, 0, A_POP, TPS_Null, 0, NULL}
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807 {NULL, 0, A_POP, TPS_Null, 0, NULL}
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822 {NULL, 0, A_POP, TPS_Null, 0, NULL}
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
831 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
832 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
833 {NULL, 0, A_POP, TPS_Null, 0, NULL}
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
837 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
838 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
839 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
840 {NULL, 0, A_POP, TPS_Null, 0, NULL}
843 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
844 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
845 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
846 {NULL, 0, A_POP, TPS_Null, 0, NULL}
849 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
850 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
851 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
852 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
853 {NULL, 0, A_POP, TPS_Null, 0, NULL}
856 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
857 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
858 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
859 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
860 {NULL, 0, A_POP, TPS_Null, 0, NULL}
863 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
864 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
867 static const TParserStateActionItem actionTPS_InTagFirst[] = {
868 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
869 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
870 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
871 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
872 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
873 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
874 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
875 {NULL, 0, A_POP, TPS_Null, 0, NULL}
878 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
879 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
881 /* XXX do we want states for the 'm' and 'l'? Right now this accepts <?xZ */
882 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
883 {NULL, 0, A_POP, TPS_Null, 0, NULL}
886 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
887 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
888 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
889 {NULL, 0, A_POP, TPS_Null, 0, NULL}
892 static const TParserStateActionItem actionTPS_InTagName[] = {
893 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
895 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
896 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
897 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
898 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
899 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
900 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
901 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
902 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
903 {NULL, 0, A_POP, TPS_Null, 0, NULL}
906 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
907 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
908 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
909 {NULL, 0, A_POP, TPS_Null, 0, NULL}
912 static const TParserStateActionItem actionTPS_InTag[] = {
913 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
914 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
915 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
916 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
917 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
918 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
919 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
920 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
921 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
922 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
923 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
924 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
925 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
926 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
927 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
928 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
929 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
930 {NULL, 0, A_POP, TPS_Null, 0, NULL}
933 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
934 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
935 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
936 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
937 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
940 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
941 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
942 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
943 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
944 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
947 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
948 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
949 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
952 static const TParserStateActionItem actionTPS_InTagEnd[] = {
953 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
956 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
957 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
958 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
960 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
961 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
962 {NULL, 0, A_POP, TPS_Null, 0, NULL}
965 static const TParserStateActionItem actionTPS_InCommentLast[] = {
966 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
967 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
968 {NULL, 0, A_POP, TPS_Null, 0, NULL}
971 static const TParserStateActionItem actionTPS_InComment[] = {
972 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
973 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
974 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
977 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
978 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
979 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
980 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
983 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
984 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
985 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
986 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
987 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
990 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
991 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
994 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
995 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
996 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
997 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
998 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1001 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1002 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1003 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1004 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1006 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1007 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1008 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Actions for TPS_InHostDomain.  End of input, p_isstophost and the final
 * catch-all row all report a HOST token with A_BINGO | A_CLRALL.
 *
 * p_isdigit appears twice: the earlier row pushes into TPS_InHost, the later
 * one is an A_POP.
 * NOTE(review): the A_POP row is presumably reached only when matching
 * resumes after the pushed TPS_InHost attempt has failed (see
 * pushedAtAction) -- confirm against TParserGet.
 */
1011 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1012 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1013 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1014 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1015 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1016 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1017 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1018 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1019 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1020 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1021 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1022 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1025 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1026 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1027 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1028 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1031 static const TParserStateActionItem actionTPS_InPort[] = {
1032 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1033 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1034 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1035 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1036 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1039 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1040 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1041 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1042 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1043 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1046 static const TParserStateActionItem actionTPS_InHost[] = {
1047 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1048 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1049 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1050 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1051 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1052 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1053 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1056 static const TParserStateActionItem actionTPS_InEmail[] = {
1057 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1058 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1061 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1062 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1063 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1064 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1065 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1066 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1067 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1068 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1069 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1072 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1073 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1074 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1075 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1076 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1077 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1078 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1081 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1082 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1083 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1084 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1085 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1086 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1087 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1088 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1091 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1092 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1093 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1094 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1095 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Actions for TPS_InPathSecond.
 *
 * A_NEXT is 0x0000, so "A_NEXT | A_PUSH" in the first '/' row has the same
 * value as A_PUSH alone.  '/' appears in two rows: the first pushes into
 * TPS_InFileFirst, the second reports a FILEPATH token.
 * NOTE(review): the second '/' row is presumably tried only after the pushed
 * attempt has failed (see pushedAtAction) -- confirm against TParserGet.
 */
1098 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1099 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1100 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1101 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1102 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1103 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1106 static const TParserStateActionItem actionTPS_InFile[] = {
1107 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1108 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1109 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1110 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1111 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1112 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1113 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1114 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1115 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1118 static const TParserStateActionItem actionTPS_InFileNext[] = {
1119 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1120 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1121 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1122 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1123 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1126 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1127 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1128 {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1129 {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1130 {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1131 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1134 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1135 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1138 static const TParserStateActionItem actionTPS_InURLPath[] = {
1139 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1140 {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1141 {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1142 {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1143 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1146 static const TParserStateActionItem actionTPS_InFURL[] = {
1147 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1148 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1149 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1152 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1153 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1154 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1155 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1159 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1160 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1161 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1164 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1165 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1168 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1169 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1171 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1172 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1173 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1176 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1177 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1178 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1179 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1180 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1181 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1182 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1185 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1186 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1187 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1188 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1189 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1192 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1193 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1194 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1195 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1196 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1197 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1200 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1201 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1202 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1203 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1204 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1207 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1208 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1209 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1210 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1211 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
/*
 * Rules for state TPS_InHyphenDigitLookahead, reached when a digit follows
 * the hyphen.  Further p_isdigit matches keep the state; a p_isalpha match
 * switches to TPS_InHyphenNumWord.  If EOF or any other character comes
 * first, the pushed position is popped, so a digits-only part after the
 * hyphen does not extend the hyphenated word.
 */
1214 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1215 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1216 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1217 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1218 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for state TPS_InParseHyphen, the state the whole-hyphenated-word
 * rules switch to after reporting their token.  A p_isasclet or p_isalpha
 * match starts TPS_InHyphenAsciiWordPart or TPS_InHyphenWordPart; a
 * p_isdigit match pushes the position and tries TPS_InHyphenUnsignedInt;
 * '-' pushes the position and tries TPS_InParseHyphenHyphen.  EOF or any
 * other character reruns the same input in TPS_Base.
 */
1221 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1222 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1223 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1224 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1225 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1226 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1227 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
/*
 * Rules for state TPS_InParseHyphenHyphen, entered with A_PUSH on '-' from
 * TPS_InParseHyphen.  If the next character matches p_isalnum, a SPACE token
 * is reported, the pushed position is discarded, and parsing resumes in
 * TPS_InParseHyphen; EOF or any other character pops back instead.
 */
1230 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1231 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1232 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1233 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for state TPS_InHyphenWordPart.  A p_isalpha match keeps the state
 * and a p_isdigit match switches to TPS_InHyphenNumWordPart.  Otherwise a
 * PARTHWORD token is reported: at EOF the parser returns to TPS_Base, on any
 * other character it returns to TPS_InParseHyphen.
 */
1236 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1237 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1238 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1239 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1240 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenAsciiWordPart.  A p_isasclet match keeps the
 * state; p_isalpha and p_isdigit switch to TPS_InHyphenWordPart and
 * TPS_InHyphenNumWordPart.  Otherwise an ASCIIPARTHWORD token is reported:
 * at EOF the parser returns to TPS_Base, on any other character it returns
 * to TPS_InParseHyphen.
 */
1243 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1244 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1245 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1246 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1247 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1248 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenNumWordPart.  A p_isalnum match keeps the
 * state.  Otherwise a NUMPARTHWORD token is reported: at EOF the parser
 * returns to TPS_Base, on any other character it returns to
 * TPS_InParseHyphen.
 */
1251 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1252 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1253 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1254 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
/*
 * Rules for state TPS_InHyphenUnsignedInt, entered with A_PUSH on a digit
 * from TPS_InParseHyphen.  A p_isdigit match has tostate TPS_Null, which
 * leaves the state unchanged.  A p_isalpha match discards the pushed
 * position and continues in TPS_InHyphenNumWordPart.  EOF or any other
 * character pops back to the pushed position.
 */
1257 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1258 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1259 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1260 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1261 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1266 * main table of per-state parser actions
/*
 * One entry of the main per-state table: the rule array for a state, the
 * state value the entry belongs to (TParserGet asserts it equals the table
 * index), and, only when WPARSER_TRACE is defined, the state's name for the
 * trace output.
 */
1270 const TParserStateActionItem *action; /* the actual state info */
1271 TParserState state; /* only for Assert crosscheck */
1272 #ifdef WPARSER_TRACE
1273 const char *state_name; /* only for debug printout */
1275 } TParserStateAction;
/*
 * TPARSERSTATEACTION(state) expands to one TParserStateAction initializer:
 * the rule array named action<state>, the state value itself, and, when
 * WPARSER_TRACE is defined, the state name as a string.
 */
1277 #ifdef WPARSER_TRACE
1278 #define TPARSERSTATEACTION(state) \
1279 { CppConcat(action,state), state, CppAsString(state) }
1281 #define TPARSERSTATEACTION(state) \
1282 { CppConcat(action,state), state }
1286 * order must be the same as in typedef enum {} TParserState!!
/*
 * Main table of per-state rules.  TParserGet indexes it directly with the
 * current state value (Actions[prs->state->state]) and asserts that the
 * entry's state field matches, so entries must appear in the same order as
 * the TParserState enum values.
 */
1289 static const TParserStateAction Actions[] = {
1290 TPARSERSTATEACTION(TPS_Base),
1291 TPARSERSTATEACTION(TPS_InNumWord),
1292 TPARSERSTATEACTION(TPS_InAsciiWord),
1293 TPARSERSTATEACTION(TPS_InWord),
1294 TPARSERSTATEACTION(TPS_InUnsignedInt),
1295 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1296 TPARSERSTATEACTION(TPS_InSignedInt),
1297 TPARSERSTATEACTION(TPS_InSpace),
1298 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1299 TPARSERSTATEACTION(TPS_InUDecimal),
1300 TPARSERSTATEACTION(TPS_InDecimalFirst),
1301 TPARSERSTATEACTION(TPS_InDecimal),
1302 TPARSERSTATEACTION(TPS_InVerVersion),
1303 TPARSERSTATEACTION(TPS_InSVerVersion),
1304 TPARSERSTATEACTION(TPS_InVersionFirst),
1305 TPARSERSTATEACTION(TPS_InVersion),
1306 TPARSERSTATEACTION(TPS_InMantissaFirst),
1307 TPARSERSTATEACTION(TPS_InMantissaSign),
1308 TPARSERSTATEACTION(TPS_InMantissa),
1309 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1310 TPARSERSTATEACTION(TPS_InXMLEntity),
1311 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1312 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1313 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1314 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1315 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1316 TPARSERSTATEACTION(TPS_InTagFirst),
1317 TPARSERSTATEACTION(TPS_InXMLBegin),
1318 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1319 TPARSERSTATEACTION(TPS_InTagName),
1320 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1321 TPARSERSTATEACTION(TPS_InTag),
1322 TPARSERSTATEACTION(TPS_InTagEscapeK),
1323 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1324 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1325 TPARSERSTATEACTION(TPS_InTagEnd),
1326 TPARSERSTATEACTION(TPS_InCommentFirst),
1327 TPARSERSTATEACTION(TPS_InCommentLast),
1328 TPARSERSTATEACTION(TPS_InComment),
1329 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1330 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1331 TPARSERSTATEACTION(TPS_InCommentEnd),
1332 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1333 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1334 TPARSERSTATEACTION(TPS_InHostDomain),
1335 TPARSERSTATEACTION(TPS_InPortFirst),
1336 TPARSERSTATEACTION(TPS_InPort),
1337 TPARSERSTATEACTION(TPS_InHostFirstAN),
1338 TPARSERSTATEACTION(TPS_InHost),
1339 TPARSERSTATEACTION(TPS_InEmail),
1340 TPARSERSTATEACTION(TPS_InFileFirst),
1341 TPARSERSTATEACTION(TPS_InFileTwiddle),
1342 TPARSERSTATEACTION(TPS_InPathFirst),
1343 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1344 TPARSERSTATEACTION(TPS_InPathSecond),
1345 TPARSERSTATEACTION(TPS_InFile),
1346 TPARSERSTATEACTION(TPS_InFileNext),
1347 TPARSERSTATEACTION(TPS_InURLPathFirst),
1348 TPARSERSTATEACTION(TPS_InURLPathStart),
1349 TPARSERSTATEACTION(TPS_InURLPath),
1350 TPARSERSTATEACTION(TPS_InFURL),
1351 TPARSERSTATEACTION(TPS_InProtocolFirst),
1352 TPARSERSTATEACTION(TPS_InProtocolSecond),
1353 TPARSERSTATEACTION(TPS_InProtocolEnd),
1354 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1355 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1356 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1357 TPARSERSTATEACTION(TPS_InHyphenWord),
1358 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1359 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1360 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1361 TPARSERSTATEACTION(TPS_InParseHyphen),
1362 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1363 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1364 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1365 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1366 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
/*
 * Run the state machine from the current position until one token has been
 * recognized or the input is used up.
 *
 * On entry prs->token is set to the current byte position.  For each
 * character the rules of the current state are tested in order and the
 * matching rule's flags are applied (push/pop/clear/merge of saved
 * positions, state change).  When a rule carries A_BINGO, the token's byte
 * and character lengths and its type are stored in prs->lenbytetoken,
 * prs->lenchartoken and prs->type.
 *
 * Returns true if the last rule applied carried A_BINGO, false otherwise.
 */
1371 TParserGet(TParser *prs)
1373 const TParserStateActionItem *item = NULL;
/* already at or past the end of the input */
1377 if (prs->state->posbyte >= prs->lenstr)
1380 prs->token = prs->str + prs->state->posbyte;
1381 prs->state->pushedAtAction = NULL;
1383 /* look at string */
/* posbyte == lenstr is included so the rules also get to see EOF */
1384 while (prs->state->posbyte <= prs->lenstr)
1386 if (prs->state->posbyte == prs->lenstr)
1387 prs->state->charlen = 0;
/* with charmaxlen == 1 every character is one byte; pg_mblen is skipped */
1389 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1390 pg_mblen(prs->str + prs->state->posbyte);
1392 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1393 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1394 Assert(Actions[prs->state->state].state == prs->state->state);
1396 if (prs->state->pushedAtAction)
1398 /* After a POP, pick up at the next test */
1399 item = prs->state->pushedAtAction + 1;
1400 prs->state->pushedAtAction = NULL;
1404 item = Actions[prs->state->state].action;
1405 Assert(item != NULL);
1408 /* find action by character class */
/* a rule whose isclass is NULL ends the search and serves as the default */
1409 while (item->isclass)
1412 if (item->isclass(prs) != 0)
1417 #ifdef WPARSER_TRACE
1419 TParserPosition *ptr;
1421 fprintf(stderr, "state ");
1422 /* indent according to stack depth */
1423 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1424 fprintf(stderr, " ");
1425 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1426 if (prs->state->posbyte < prs->lenstr)
1427 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1429 fprintf(stderr, "at EOF");
1430 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1431 (int) (item - Actions[prs->state->state].action),
1432 (item->flags & A_BINGO) ? " BINGO" : "",
1433 (item->flags & A_POP) ? " POP" : "",
1434 (item->flags & A_PUSH) ? " PUSH" : "",
1435 (item->flags & A_RERUN) ? " RERUN" : "",
1436 (item->flags & A_CLEAR) ? " CLEAR" : "",
1437 (item->flags & A_MERGE) ? " MERGE" : "",
1438 (item->flags & A_CLRALL) ? " CLRALL" : "",
1439 (item->tostate != TPS_Null) ? " tostate " : "",
1440 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1441 (item->type > 0) ? " type " : "",
1442 tok_alias[item->type]);
1446 /* call special handler if exists */
1450 /* BINGO, token is found */
1451 if (item->flags & A_BINGO)
1453 Assert(item->type > 0);
/* publish the finished token's lengths and type, then reset the counters */
1454 prs->lenbytetoken = prs->state->lenbytetoken;
1455 prs->lenchartoken = prs->state->lenchartoken;
1456 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1457 prs->type = item->type;
1460 /* do various actions by flags */
1461 if (item->flags & A_POP)
1462 { /* pop stored state in stack */
1463 TParserPosition *ptr = prs->state->prev;
1469 else if (item->flags & A_PUSH)
1470 { /* push (store) state in stack */
1471 prs->state->pushedAtAction = item; /* remember where we push */
1472 prs->state = newTParserPosition(prs->state);
1474 else if (item->flags & A_CLEAR)
1475 { /* clear previous pushed state */
1476 TParserPosition *ptr;
1478 Assert(prs->state->prev);
1479 ptr = prs->state->prev->prev;
1480 pfree(prs->state->prev);
1481 prs->state->prev = ptr;
1483 else if (item->flags & A_CLRALL)
1484 { /* clear all previous pushed state */
1485 TParserPosition *ptr;
1487 while (prs->state->prev)
1489 ptr = prs->state->prev->prev;
1490 pfree(prs->state->prev);
1491 prs->state->prev = ptr;
1494 else if (item->flags & A_MERGE)
1495 { /* merge posinfo with current and pushed state */
1496 TParserPosition *ptr = prs->state;
1498 Assert(prs->state->prev);
1499 prs->state = prs->state->prev;
/* the pushed entry becomes current but takes over the scan position */
1501 prs->state->posbyte = ptr->posbyte;
1502 prs->state->poschar = ptr->poschar;
1503 prs->state->charlen = ptr->charlen;
1504 prs->state->lenbytetoken = ptr->lenbytetoken;
1505 prs->state->lenchartoken = ptr->lenchartoken;
1509 /* set new state if pointed */
1510 if (item->tostate != TPS_Null)
1511 prs->state->state = item->tostate;
1513 /* check for go away */
/* stop on a token, or at end of input unless the rule asks for a rerun */
1514 if ((item->flags & A_BINGO) ||
1515 (prs->state->posbyte >= prs->lenstr &&
1516 (item->flags & A_RERUN) == 0))
1519 /* go to beginning of loop if we should rerun or we just restore state */
1520 if (item->flags & (A_RERUN | A_POP))
/* charlen is 0 at EOF (set above), so there is nothing to consume then */
1524 if (prs->state->charlen)
1526 prs->state->posbyte += prs->state->charlen;
1527 prs->state->lenbytetoken += prs->state->charlen;
1528 prs->state->poschar++;
1529 prs->state->lenchartoken++;
1533 return (item && (item->flags & A_BINGO)) ? true : false;
/*
 * Return a palloc'd array of LASTNUM + 1 LexDescr entries describing the
 * token types of this parser.  Entry i - 1 holds lexid i together with
 * copies of tok_alias[i] and lex_descr[i] for i = 1 .. LASTNUM; the final
 * entry has lexid 0 and terminates the list.
 */
1537 prsd_lextype(PG_FUNCTION_ARGS)
1539 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1542 for (i = 1; i <= LASTNUM; i++)
1544 descr[i - 1].lexid = i;
1545 descr[i - 1].alias = pstrdup(tok_alias[i]);
1546 descr[i - 1].descr = pstrdup(lex_descr[i]);
/* terminator entry */
1549 descr[LASTNUM].lexid = 0;
1551 PG_RETURN_POINTER(descr);
/*
 * Parser start entry point: passes argument 0 (a char pointer) and
 * argument 1 (an int32) to TParserInit and returns the resulting pointer.
 * NOTE(review): argument 1 is presumably the text length in bytes -- confirm
 * against TParserInit.
 */
1555 prsd_start(PG_FUNCTION_ARGS)
1557 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
/*
 * Next-token entry point.  Argument 0 is the TParser, argument 1 a char **
 * output slot and argument 2 an int * output slot.  The token length in
 * bytes (p->lenbytetoken) is stored through argument 2 and the token type
 * (p->type) is the return value.
 */
1561 prsd_nexttoken(PG_FUNCTION_ARGS)
1563 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1564 char **t = (char **) PG_GETARG_POINTER(1);
1565 int *tlen = (int *) PG_GETARG_POINTER(2);
1571 *tlen = p->lenbytetoken;
1573 PG_RETURN_INT32(p->type);
/*
 * Parser end entry point.  Argument 0 is a TParser pointer.
 * NOTE(review): presumably the one returned by prsd_start -- confirm.
 */
1577 prsd_end(PG_FUNCTION_ARGS)
1579 TParser *p = (TParser *) PG_GETARG_POINTER(0);
/*
 * Token-type classes used by the headline code below.
 *
 * LEAVETOKEN, ENDPUNCTOKEN: SPACE only.
 * COMPLEXTOKEN: URL_T or a whole hyphenated word (NUMHWORD, ASCIIHWORD, HWORD).
 * TS_IDIGNORE: TAG_T, PROTOCOL, SPACE or XMLENTITY.
 * HLIDIGNORE: URL_T, TAG_T or a whole hyphenated word.
 * XMLHLIDIGNORE: same as HLIDIGNORE but without TAG_T.
 * NONWORDTOKEN: SPACE or anything in HLIDIGNORE.
 * NOENDTOKEN: anything in NONWORDTOKEN or TS_IDIGNORE, plus the numeric
 * types SCIENTIFIC, VERSIONNUMBER, DECIMAL, SIGNEDINT and UNSIGNEDINT.
 *
 * Every macro expands its argument more than once (directly or through a
 * nested macro), so pass expressions without side effects.
 */
1585 #define LEAVETOKEN(x) ( (x)==SPACE )
1586 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1587 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
1589 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
1590 #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1591 #define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1592 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
1593 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/* first HeadlineWordEntry of the span that checkcondition_HL searches */
1597 HeadlineWordEntry *words;
/*
 * Callback handed to TS_execute by hlCover.  checkval points to an hlCheck;
 * its words are scanned for an entry whose item pointer is the query operand
 * val.
 * NOTE(review): presumably returns true on the first such entry and false if
 * none is found -- confirm.
 */
1602 checkcondition_HL(void *checkval, QueryOperand *val)
1606 for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1608 if (((hlCheck *) checkval)->words[i].item == val)
/*
 * Look for a span of words [*p, *q] in prs that satisfies query.
 *
 * Two passes run over the query items, skipping everything that is not
 * QI_VAL.  The first scans the words forward from pos for entries whose item
 * is that operand; the second scans backward from *q down to pos.  The
 * resulting span is wrapped in an hlCheck and evaluated with TS_execute and
 * checkcondition_HL.  The function ends by calling itself again.
 * NOTE(review): presumably that recursion is the retry path taken when the
 * TS_execute check does not succeed -- confirm.
 */
1616 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1620 QueryItem *item = GETQUERY(query);
/* forward pass over the query operands */
1626 for (j = 0; j < query->size; j++)
1628 if (item->type != QI_VAL)
1633 for (i = pos; i < prs->curwords; i++)
1635 if (prs->words[i].item == &item->operand)
/* backward pass, starting again from the first query item */
1648 item = GETQUERY(query);
1649 for (j = 0; j < query->size; j++)
1651 if (item->type != QI_VAL)
1656 for (i = *q; i >= pos; i--)
1658 if (prs->words[i].item == &item->operand)
/* evaluate the query against just the words *p .. *q */
1672 ch.words = &(prs->words[*p]);
1673 ch.len = *q - *p + 1;
1674 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1679 return hlCover(prs, query, p, q);
/*
 * Headline entry point of the default parser.
 *
 * Arguments: 0 = HeadlineParsedText to annotate, 1 = List of DefElem
 * options, 2 = TSQuery.  Option names are compared case-insensitively:
 * MaxWords, MinWords, ShortWord (integers), StartSel, StopSel (strings) and
 * HighlightAll (true for 1/on/true/t/y/yes).  Unrecognized options and
 * inconsistent word limits are reported with ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * Covers returned by hlCover are examined to choose the word range
 * bestb .. beste.  Words in that range get their selected / replace / in
 * flags set.  startsel and stopsel are assigned "<b>" and "</b>" as
 * defaults, their lengths are stored, and prs is returned.
 */
1687 prsd_headline(PG_FUNCTION_ARGS)
1689 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1690 List *prsoptions = (List *) PG_GETARG_POINTER(1);
1691 TSQuery query = PG_GETARG_TSQUERY(2);
1693 /* options, plus start and stop tags */
1713 prs->startsel = NULL;
1714 prs->stopsel = NULL;
/* read the caller-supplied options */
1715 foreach(l, prsoptions)
1717 DefElem *defel = (DefElem *) lfirst(l);
1718 char *val = defGetString(defel);
1720 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1721 max_words = pg_atoi(val, sizeof(int32), 0);
1722 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1723 min_words = pg_atoi(val, sizeof(int32), 0);
1724 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1725 shortword = pg_atoi(val, sizeof(int32), 0);
1726 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1727 prs->startsel = pstrdup(val);
1728 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1729 prs->stopsel = pstrdup(val);
1730 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1731 highlight = (pg_strcasecmp(val, "1") == 0 ||
1732 pg_strcasecmp(val, "on") == 0 ||
1733 pg_strcasecmp(val, "true") == 0 ||
1734 pg_strcasecmp(val, "t") == 0 ||
1735 pg_strcasecmp(val, "y") == 0 ||
1736 pg_strcasecmp(val, "yes") == 0);
1739 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1740 errmsg("unrecognized headline parameter: \"%s\"",
/* sanity checks on the word limits */
1746 if (min_words >= max_words)
1748 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1749 errmsg("MinWords should be less than MaxWords")));
1752 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1753 errmsg("MinWords should be positive")));
1756 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1757 errmsg("ShortWord should be >= 0")));
/* examine every cover hlCover can find */
1759 while (hlCover(prs, query, &p, &q))
1761 /* find cover len in words */
1764 for (i = p; i <= q && curlen < max_words; i++)
1766 if (!NONWORDTOKEN(prs->words[i].type))
1768 if (prs->words[i].item && !prs->words[i].repeated)
1773 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1775 /* best already found, so try one more cover */
1781 if (curlen < max_words)
1782 { /* find good end */
1783 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1787 if (!NONWORDTOKEN(prs->words[i].type))
1789 if (prs->words[i].item && !prs->words[i].repeated)
1793 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1795 if (curlen >= min_words)
1798 if (curlen < min_words && i >= prs->curwords)
1799 { /* got end of text and our cover is shorter
1801 for (i = p; i >= 0; i--)
1803 if (!NONWORDTOKEN(prs->words[i].type))
1805 if (prs->words[i].item && !prs->words[i].repeated)
1807 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1809 if (curlen >= min_words)
1812 posb = (i >= 0) ? i : 0;
1816 { /* shorter cover :((( */
1817 for (; curlen > min_words; i--)
1819 if (!NONWORDTOKEN(prs->words[i].type))
1821 if (prs->words[i].item && !prs->words[i].repeated)
1824 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1830 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1831 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1832 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
1845 for (i = 0; i < prs->curwords && curlen < min_words; i++)
1847 if (!NONWORDTOKEN(prs->words[i].type))
1858 beste = prs->curwords - 1;
/* flag the words of the chosen range */
1861 for (i = bestb; i <= beste; i++)
1863 if (prs->words[i].item)
1864 prs->words[i].selected = 1;
1867 if (HLIDIGNORE(prs->words[i].type))
1868 prs->words[i].replace = 1;
1872 if (XMLHLIDIGNORE(prs->words[i].type))
1873 prs->words[i].replace = 1;
1876 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
/* default highlight markers, then cache their lengths */
1880 prs->startsel = pstrdup("<b>");
1882 prs->stopsel = pstrdup("</b>");
1883 prs->startsellen = strlen(prs->startsel);
1884 prs->stopsellen = strlen(prs->stopsel);
1886 PG_RETURN_POINTER(prs);