1 /*-------------------------------------------------------------------------
4 * Default text search parser
6 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $
12 *-------------------------------------------------------------------------
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
29 /* Output token categories */
38 #define VERSIONNUMBER 8
39 #define NUMPARTHWORD 9
41 #define ASCIIPARTHWORD 11
52 #define UNSIGNEDINT 22
57 static const char *const tok_alias[] = {
84 static const char *const lex_descr[] = {
88 "Word, letters and digits",
92 "Scientific notation",
94 "Hyphenated word part, letters and digits",
95 "Hyphenated word part, all letters",
96 "Hyphenated word part, all ASCII",
100 "Hyphenated word, letters and digits",
101 "Hyphenated word, all ASCII",
102 "Hyphenated word, all letters",
121 TPS_InSignedIntFirst,
135 TPS_InXMLEntityFirst,
137 TPS_InXMLEntityNumFirst,
139 TPS_InXMLEntityHexNumFirst,
140 TPS_InXMLEntityHexNum,
150 TPS_InTagBackSleshed,
155 TPS_InCloseCommentFirst,
156 TPS_InCloseCommentLast,
158 TPS_InHostFirstDomain,
159 TPS_InHostDomainSecond,
169 TPS_InPathFirstFirst,
178 TPS_InProtocolSecond,
180 TPS_InHyphenAsciiWordFirst,
181 TPS_InHyphenAsciiWord,
182 TPS_InHyphenWordFirst,
184 TPS_InHyphenNumWordFirst,
186 TPS_InHyphenDigitLookahead,
188 TPS_InParseHyphenHyphen,
189 TPS_InHyphenWordPart,
190 TPS_InHyphenAsciiWordPart,
191 TPS_InHyphenNumWordPart,
192 TPS_InHyphenUnsignedInt,
193 TPS_Null /* last state (fake value) */
196 /* forward declaration */
199 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
201 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
202 * special cases... */
206 TParserCharTest isclass;
209 TParserState tostate;
211 TParserSpecial special;
212 } TParserStateActionItem;
214 /* Flag bits in TParserStateActionItem.flags */
215 #define A_NEXT 0x0000
216 #define A_BINGO 0x0001
218 #define A_PUSH 0x0004
219 #define A_RERUN 0x0008
220 #define A_CLEAR 0x0010
221 #define A_MERGE 0x0020
222 #define A_CLRALL 0x0040
224 typedef struct TParserPosition
226 int posbyte; /* position of parser in bytes */
227 int poschar; /* position of parser in characters */
228 int charlen; /* length of current char */
229 int lenbytetoken; /* length of token-so-far in bytes */
230 int lenchartoken; /* and in chars */
232 struct TParserPosition *prev;
233 const TParserStateActionItem *pushedAtAction;
/*
 * Parser context for one input string: the text in its multibyte form and,
 * when wide-character handling is in use, its wide-character form, together
 * with the parser's current position record.
 */
236 typedef struct TParser
238 /* string and position information */
239 char *str; /* multibyte string */
240 int lenstr; /* length of mbstring */
242 wchar_t *wstr; /* wide character string */
243 int lenwstr; /* length of wstring */
249 TParserPosition *state; /* current position record; the character tests read posbyte/poschar from it */
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
271 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
274 memcpy(res, prev, sizeof(TParserPosition));
276 memset(res, 0, sizeof(TParserPosition));
280 res->pushedAtAction = NULL;
286 TParserInit(char *str, int len)
288 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 prs->charmaxlen = pg_database_encoding_max_length();
297 * Use wide char code only when max encoding length > 1.
299 if (prs->charmaxlen > 1)
302 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304 prs->str, prs->lenstr);
308 prs->usewide = false;
310 prs->state = newTParserPosition(NULL);
311 prs->state->state = TPS_Base;
314 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
321 TParserClose(TParser *prs)
325 TParserPosition *ptr = prs->state->prev;
340 * Character-type support functions, equivalent to the is* macros, but
341 * working with any possible encoding and locale. Note that with a
342 * multibyte encoding and the C locale, the isw* functions may fail
343 * or give wrong results. Note 2: a multibyte encoding with the C locale
344 * is often used for Asian languages.
/*
 * p_iswhat(type) generates the character-class tests p_is<type>() and
 * p_isnot<type>() for the character at the parser's current position.
 *
 * With wide-character handling enabled (prs->usewide), the character is taken
 * from prs->wstr at poschar: under a C ctype locale only its low byte is
 * passed to is<type>(), otherwise the whole character goes to isw<type>().
 * Without wide-character handling, the byte at posbyte in prs->str is passed
 * to is<type>().
 *
 * The wide character is read as a wchar_t and then converted to wint_t, the
 * same way p_isalnum() and p_isalpha() do it.  Reading the wchar_t buffer
 * through a wint_t pointer would fetch the wrong number of bytes on any
 * platform where the two types differ in size.
 */
349 #define p_iswhat(type) \
351 p_is##type(TParser *prs) { \
352 Assert( prs->state ); \
353 if ( prs->usewide ) \
355 if ( lc_ctype_is_c() ) \
356 return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
358 return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
361 return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
365 p_isnot##type(TParser *prs) { \
366 return !p_is##type(prs); \
370 p_isalnum(TParser *prs)
378 unsigned int c = *(prs->wstr + prs->state->poschar);
381 * any non-ascii symbol with multibyte encoding with C-locale is
387 return isalnum(0xff & c);
390 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
393 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
396 p_isnotalnum(TParser *prs)
398 return !p_isalnum(prs);
402 p_isalpha(TParser *prs)
410 unsigned int c = *(prs->wstr + prs->state->poschar);
413 * any non-ascii symbol with multibyte encoding with C-locale is
419 return isalpha(0xff & c);
422 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
425 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
429 p_isnotalpha(TParser *prs)
431 return !p_isalpha(prs);
434 /* p_iseq should be used only for ascii symbols */
437 p_iseq(TParser *prs, char c)
440 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
442 #else /* TS_USE_WIDE */
444 #define p_iswhat(type) \
446 p_is##type(TParser *prs) { \
447 Assert( prs->state ); \
448 return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
452 p_isnot##type(TParser *prs) { \
453 return !p_is##type(prs); \
458 p_iseq(TParser *prs, char c)
461 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
466 #endif /* TS_USE_WIDE */
477 p_isEOF(TParser *prs)
480 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 p_iseqC(TParser *prs)
486 return p_iseq(prs, prs->c);
490 p_isneC(TParser *prs)
492 return !p_iseq(prs, prs->c);
496 p_isascii(TParser *prs)
498 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 p_isasclet(TParser *prs)
504 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 /* deliberately suppress unused-function complaints for the above */
509 void _make_compiler_happy(void);
511 _make_compiler_happy(void)
538 SpecialTags(TParser *prs)
540 switch (prs->state->lenchartoken)
542 case 8: /* </script */
543 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
546 case 7: /* <script || </style */
547 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
549 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
553 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
562 SpecialFURL(TParser *prs)
564 prs->wanthost = true;
565 prs->state->posbyte -= prs->state->lenbytetoken;
566 prs->state->poschar -= prs->state->lenchartoken;
570 SpecialHyphen(TParser *prs)
572 prs->state->posbyte -= prs->state->lenbytetoken;
573 prs->state->poschar -= prs->state->lenchartoken;
577 SpecialVerVersion(TParser *prs)
579 prs->state->posbyte -= prs->state->lenbytetoken;
580 prs->state->poschar -= prs->state->lenchartoken;
581 prs->state->lenbytetoken = 0;
582 prs->state->lenchartoken = 0;
586 p_isstophost(TParser *prs)
590 prs->wanthost = false;
597 p_isignore(TParser *prs)
599 return (prs->ignore) ? 1 : 0;
603 p_ishost(TParser *prs)
605 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
608 if (TParserGet(tmpprs) && tmpprs->type == HOST)
610 prs->state->posbyte += tmpprs->lenbytetoken;
611 prs->state->poschar += tmpprs->lenchartoken;
612 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613 prs->state->lenchartoken += tmpprs->lenchartoken;
614 prs->state->charlen = tmpprs->state->charlen;
617 TParserClose(tmpprs);
623 p_isURLPath(TParser *prs)
625 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
628 tmpprs->state = newTParserPosition(tmpprs->state);
629 tmpprs->state->state = TPS_InFileFirst;
631 if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
633 prs->state->posbyte += tmpprs->lenbytetoken;
634 prs->state->poschar += tmpprs->lenchartoken;
635 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636 prs->state->lenchartoken += tmpprs->lenchartoken;
637 prs->state->charlen = tmpprs->state->charlen;
640 TParserClose(tmpprs);
646 * Table of state/action of parser
649 static const TParserStateActionItem actionTPS_Base[] = {
650 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
692 static const TParserStateActionItem actionTPS_InWord[] = {
693 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716 {NULL, 0, A_POP, TPS_Null, 0, NULL}
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743 {NULL, 0, A_POP, TPS_Null, 0, NULL}
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758 {NULL, 0, A_POP, TPS_Null, 0, NULL}
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773 {NULL, 0, A_POP, TPS_Null, 0, NULL}
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786 {NULL, 0, A_POP, TPS_Null, 0, NULL}
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801 {NULL, 0, A_POP, TPS_Null, 0, NULL}
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807 {NULL, 0, A_POP, TPS_Null, 0, NULL}
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Action table for state TPS_InXMLEntity (inside the name of an XML entity).
 *
 * Alphanumerics and the characters ':', '_', '.', '-' are consumed with
 * TPS_InXMLEntity as the target state; ';' leads to TPS_InXMLEntityEnd.
 * End of input and any other character take the A_POP action.
 *
 * Each accepted punctuation character is listed exactly once; a repeated
 * row with the same test, flags and target state adds nothing.
 */
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
831 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
832 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
833 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
834 {NULL, 0, A_POP, TPS_Null, 0, NULL}
837 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
838 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
839 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
840 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
841 {NULL, 0, A_POP, TPS_Null, 0, NULL}
844 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
845 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
846 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
847 {NULL, 0, A_POP, TPS_Null, 0, NULL}
850 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
851 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
852 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
853 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
854 {NULL, 0, A_POP, TPS_Null, 0, NULL}
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
858 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
859 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
860 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
861 {NULL, 0, A_POP, TPS_Null, 0, NULL}
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
865 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
868 static const TParserStateActionItem actionTPS_InTagFirst[] = {
869 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
870 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
871 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
872 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
873 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
874 {NULL, 0, A_POP, TPS_Null, 0, NULL}
877 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
878 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
880 /* XXX do we want states for the 'm' and 'l'? Right now this accepts <?xZ */
881 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
882 {NULL, 0, A_POP, TPS_Null, 0, NULL}
885 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
886 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
887 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
888 {NULL, 0, A_POP, TPS_Null, 0, NULL}
891 static const TParserStateActionItem actionTPS_InTagName[] = {
892 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
894 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
895 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
896 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
897 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
898 {NULL, 0, A_POP, TPS_Null, 0, NULL}
901 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
902 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
903 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
904 {NULL, 0, A_POP, TPS_Null, 0, NULL}
907 static const TParserStateActionItem actionTPS_InTag[] = {
908 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
909 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
910 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
911 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
912 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
913 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
914 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
915 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
916 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
917 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
918 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
919 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
920 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
921 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
922 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
923 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
924 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
925 {NULL, 0, A_POP, TPS_Null, 0, NULL}
928 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
929 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
930 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
931 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
932 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
935 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
936 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
937 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
938 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
939 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
942 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
943 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
944 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
947 static const TParserStateActionItem actionTPS_InTagEnd[] = {
948 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
951 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
952 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
953 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
955 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
956 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
957 {NULL, 0, A_POP, TPS_Null, 0, NULL}
960 static const TParserStateActionItem actionTPS_InCommentLast[] = {
961 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
962 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
963 {NULL, 0, A_POP, TPS_Null, 0, NULL}
966 static const TParserStateActionItem actionTPS_InComment[] = {
967 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
968 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
969 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
972 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
973 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
974 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
975 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
978 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
979 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
980 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
981 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
982 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
985 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
986 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
989 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
990 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
991 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
992 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
993 {NULL, 0, A_POP, TPS_Null, 0, NULL}
996 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
997 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
998 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
999 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1000 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1002 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1003 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1006 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1007 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1008 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1009 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1010 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1011 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1012 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1013 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1014 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1015 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1016 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1017 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1020 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1021 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1022 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1023 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1026 static const TParserStateActionItem actionTPS_InPort[] = {
1027 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1028 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1029 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1030 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1031 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1034 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1035 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1036 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1037 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1038 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1041 static const TParserStateActionItem actionTPS_InHost[] = {
1042 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1043 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1044 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1045 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1046 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1047 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1048 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1051 static const TParserStateActionItem actionTPS_InEmail[] = {
1052 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1053 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1056 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1057 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1058 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1059 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1060 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1061 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1062 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1063 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1064 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1067 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1068 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1069 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1070 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1071 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1072 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1073 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1076 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1077 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1078 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1079 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1080 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1081 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1082 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1083 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1086 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1087 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1088 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1089 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1090 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1093 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1094 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1095 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1096 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1097 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1098 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1101 static const TParserStateActionItem actionTPS_InFile[] = {
1102 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1103 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1104 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1105 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1106 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1107 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1108 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1109 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1110 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1113 static const TParserStateActionItem actionTPS_InFileNext[] = {
1114 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1115 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1116 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1117 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1118 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1121 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1122 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1123 {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1124 {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1125 {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1126 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1129 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1130 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1133 static const TParserStateActionItem actionTPS_InURLPath[] = {
1134 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1135 {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1136 {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1137 {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1138 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1141 static const TParserStateActionItem actionTPS_InFURL[] = {
1142 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1143 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1144 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1148 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1149 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1150 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1153 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1154 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1156 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1159 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1160 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1163 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1164 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1165 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1166 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1167 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1168 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1171 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1172 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1173 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1174 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1175 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1176 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1177 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1180 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1181 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1182 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1183 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1184 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1187 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1188 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1189 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1190 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1191 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1192 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1195 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1196 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1197 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1198 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1199 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1202 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1203 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1204 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1205 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1206 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1209 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1210 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1212 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1213 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for TPS_InParseHyphen, the state the hyphenated-word tables switch
 * to after reporting a whole-word token.  It dispatches on the first
 * character of the next piece: p_isasclet and p_isalpha start the ASCII and
 * general word-part states, a digit pushes TPS_InHyphenUnsignedInt, and '-'
 * pushes TPS_InParseHyphenHyphen.  EOF or any other character reruns the
 * current position in TPS_Base.
 *
 * p_isasclet is tested before p_isalpha; rules are tried in table order.
 */
1216 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1217 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1218 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1219 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1220 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1221 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1222 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
/*
 * Rules for TPS_InParseHyphenHyphen, pushed by TPS_InParseHyphen on '-'.
 * If a p_isalnum character follows, a SPACE token is reported (presumably
 * covering the hyphen itself), the pushed state is dropped and parsing
 * resumes in TPS_InParseHyphen.  EOF or any other character pops.
 */
1225 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1226 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1228 {NULL, 0, A_POP, TPS_Null, 0, NULL}
/*
 * Rules for TPS_InHyphenWordPart.  Letters keep the state and a digit
 * switches to TPS_InHyphenNumWordPart.  The part is reported as PARTHWORD
 * both at EOF (next state TPS_Base) and on any other character (next state
 * TPS_InParseHyphen, which handles the following piece).
 */
1231 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1232 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1233 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1234 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1235 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
/*
 * Rules for TPS_InHyphenAsciiWordPart.  p_isasclet characters keep the
 * state; a p_isalpha character that did not match p_isasclet moves to
 * TPS_InHyphenWordPart, and a digit moves to TPS_InHyphenNumWordPart.  The
 * part is reported as ASCIIPARTHWORD at EOF (next state TPS_Base) and on any
 * other character (next state TPS_InParseHyphen).
 */
1238 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1239 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1240 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1241 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1242 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1243 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
/*
 * Rules for TPS_InHyphenNumWordPart.  Any p_isalnum character keeps the
 * state.  The part is reported as NUMPARTHWORD at EOF (next state TPS_Base)
 * and on any other character (next state TPS_InParseHyphen).
 */
1246 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1247 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1248 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1249 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
/*
 * Rules for TPS_InHyphenUnsignedInt, pushed by TPS_InParseHyphen on a digit.
 * Digits use target TPS_Null, which leaves the current state unchanged.  A
 * letter drops the pushed state (A_CLEAR) and continues in
 * TPS_InHyphenNumWordPart.  EOF or any other character pops.
 */
1252 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1253 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1254 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1255 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1256 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1261 * main table of per-state parser actions
/* One Actions[] entry: the rule list belonging to a single parser state. */
1265 const TParserStateActionItem *action; /* the actual state info */
1266 TParserState state; /* only for Assert crosscheck */
1267 #ifdef WPARSER_TRACE
1268 const char *state_name; /* only for debug printout */
1270 } TParserStateAction;
/*
 * TPARSERSTATEACTION(state) expands to one TParserStateAction initializer.
 * CppConcat(action,state) forms the name of that state's rule table (for
 * example actionTPS_InParseHyphen).  The WPARSER_TRACE variant additionally
 * stores the state name as a string for the debug printout.
 */
1272 #ifdef WPARSER_TRACE
1273 #define TPARSERSTATEACTION(state) \
1274 { CppConcat(action,state), state, CppAsString(state) }
1276 #define TPARSERSTATEACTION(state) \
1277 { CppConcat(action,state), state }
1281 * order must be the same as in typedef enum {} TParserState!!
/*
 * Actions[] is indexed directly by a TParserState value: TParserGet reads
 * Actions[prs->state->state].action and asserts that the entry's recorded
 * state equals its index.  Entries must therefore stay in enum order.
 */
1284 static const TParserStateAction Actions[] = {
1285 TPARSERSTATEACTION(TPS_Base),
1286 TPARSERSTATEACTION(TPS_InNumWord),
1287 TPARSERSTATEACTION(TPS_InAsciiWord),
1288 TPARSERSTATEACTION(TPS_InWord),
1289 TPARSERSTATEACTION(TPS_InUnsignedInt),
1290 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1291 TPARSERSTATEACTION(TPS_InSignedInt),
1292 TPARSERSTATEACTION(TPS_InSpace),
1293 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1294 TPARSERSTATEACTION(TPS_InUDecimal),
1295 TPARSERSTATEACTION(TPS_InDecimalFirst),
1296 TPARSERSTATEACTION(TPS_InDecimal),
1297 TPARSERSTATEACTION(TPS_InVerVersion),
1298 TPARSERSTATEACTION(TPS_InSVerVersion),
1299 TPARSERSTATEACTION(TPS_InVersionFirst),
1300 TPARSERSTATEACTION(TPS_InVersion),
1301 TPARSERSTATEACTION(TPS_InMantissaFirst),
1302 TPARSERSTATEACTION(TPS_InMantissaSign),
1303 TPARSERSTATEACTION(TPS_InMantissa),
1304 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1305 TPARSERSTATEACTION(TPS_InXMLEntity),
1306 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1307 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1308 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1309 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1310 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1311 TPARSERSTATEACTION(TPS_InTagFirst),
1312 TPARSERSTATEACTION(TPS_InXMLBegin),
1313 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1314 TPARSERSTATEACTION(TPS_InTagName),
1315 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1316 TPARSERSTATEACTION(TPS_InTag),
1317 TPARSERSTATEACTION(TPS_InTagEscapeK),
1318 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1319 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1320 TPARSERSTATEACTION(TPS_InTagEnd),
1321 TPARSERSTATEACTION(TPS_InCommentFirst),
1322 TPARSERSTATEACTION(TPS_InCommentLast),
1323 TPARSERSTATEACTION(TPS_InComment),
1324 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1325 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1326 TPARSERSTATEACTION(TPS_InCommentEnd),
1327 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1328 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1329 TPARSERSTATEACTION(TPS_InHostDomain),
1330 TPARSERSTATEACTION(TPS_InPortFirst),
1331 TPARSERSTATEACTION(TPS_InPort),
1332 TPARSERSTATEACTION(TPS_InHostFirstAN),
1333 TPARSERSTATEACTION(TPS_InHost),
1334 TPARSERSTATEACTION(TPS_InEmail),
1335 TPARSERSTATEACTION(TPS_InFileFirst),
1336 TPARSERSTATEACTION(TPS_InFileTwiddle),
1337 TPARSERSTATEACTION(TPS_InPathFirst),
1338 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1339 TPARSERSTATEACTION(TPS_InPathSecond),
1340 TPARSERSTATEACTION(TPS_InFile),
1341 TPARSERSTATEACTION(TPS_InFileNext),
1342 TPARSERSTATEACTION(TPS_InURLPathFirst),
1343 TPARSERSTATEACTION(TPS_InURLPathStart),
1344 TPARSERSTATEACTION(TPS_InURLPath),
1345 TPARSERSTATEACTION(TPS_InFURL),
1346 TPARSERSTATEACTION(TPS_InProtocolFirst),
1347 TPARSERSTATEACTION(TPS_InProtocolSecond),
1348 TPARSERSTATEACTION(TPS_InProtocolEnd),
1349 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1350 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1351 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1352 TPARSERSTATEACTION(TPS_InHyphenWord),
1353 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1354 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1355 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1356 TPARSERSTATEACTION(TPS_InParseHyphen),
1357 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1358 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1359 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1360 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1361 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
/*
 * TParserGet: drive the state machine from the current position until a
 * rule flagged A_BINGO fires or the input is used up.
 *
 * prs->token is set to the start position before scanning.  When a rule
 * with A_BINGO fires, prs->lenbytetoken, prs->lenchartoken and prs->type are
 * filled in from the current position state and the rule.  The result is
 * true only if the last rule applied carried A_BINGO.
 */
1366 TParserGet(TParser *prs)
1368 const TParserStateActionItem *item = NULL;
1372 if (prs->state->posbyte >= prs->lenstr)
1375 prs->token = prs->str + prs->state->posbyte;
1376 prs->state->pushedAtAction = NULL;
1378 /* look at string */
/* "<=" lets the loop run once more at end of input so EOF rules can fire */
1379 while (prs->state->posbyte <= prs->lenstr)
/*
 * charlen is 0 at end of input; the other assignment uses 1 for
 * single-byte encodings (charmaxlen == 1) and pg_mblen() otherwise.
 */
1381 if (prs->state->posbyte == prs->lenstr)
1382 prs->state->charlen = 0;
1384 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1385 pg_mblen(prs->str + prs->state->posbyte);
1387 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1388 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1389 Assert(Actions[prs->state->state].state == prs->state->state);
1391 if (prs->state->pushedAtAction)
1393 /* After a POP, pick up at the next test */
1394 item = prs->state->pushedAtAction + 1;
1395 prs->state->pushedAtAction = NULL;
1399 item = Actions[prs->state->state].action;
1400 Assert(item != NULL);
1403 /* find action by character class */
/* a NULL isclass marks the last rule of a table, which applies unconditionally */
1404 while (item->isclass)
1407 if (item->isclass(prs) != 0)
1412 #ifdef WPARSER_TRACE
1414 TParserPosition *ptr;
1416 fprintf(stderr, "state ");
1417 /* indent according to stack depth */
1418 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1419 fprintf(stderr, " ");
1420 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1421 if (prs->state->posbyte < prs->lenstr)
1422 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1424 fprintf(stderr, "at EOF");
1425 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1426 (int) (item - Actions[prs->state->state].action),
1427 (item->flags & A_BINGO) ? " BINGO" : "",
1428 (item->flags & A_POP) ? " POP" : "",
1429 (item->flags & A_PUSH) ? " PUSH" : "",
1430 (item->flags & A_RERUN) ? " RERUN" : "",
1431 (item->flags & A_CLEAR) ? " CLEAR" : "",
1432 (item->flags & A_MERGE) ? " MERGE" : "",
1433 (item->flags & A_CLRALL) ? " CLRALL" : "",
1434 (item->tostate != TPS_Null) ? " tostate " : "",
1435 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1436 (item->type > 0) ? " type " : "",
1437 tok_alias[item->type]);
1441 /* call special handler if exists */
1445 /* BINGO, token is found */
1446 if (item->flags & A_BINGO)
1448 Assert(item->type > 0);
/* publish the token length, then reset the per-position counters */
1449 prs->lenbytetoken = prs->state->lenbytetoken;
1450 prs->lenchartoken = prs->state->lenchartoken;
1451 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1452 prs->type = item->type;
1455 /* do various actions by flags */
/* the stack flags form an else-if chain, so only the first one set is acted on */
1456 if (item->flags & A_POP)
1457 { /* pop stored state in stack */
1458 TParserPosition *ptr = prs->state->prev;
1464 else if (item->flags & A_PUSH)
1465 { /* push (store) state in stack */
1466 prs->state->pushedAtAction = item; /* remember where we push */
1467 prs->state = newTParserPosition(prs->state);
1469 else if (item->flags & A_CLEAR)
1470 { /* clear previous pushed state */
1471 TParserPosition *ptr;
1473 Assert(prs->state->prev);
1474 ptr = prs->state->prev->prev;
1475 pfree(prs->state->prev);
1476 prs->state->prev = ptr;
1478 else if (item->flags & A_CLRALL)
1479 { /* clear all previous pushed state */
1480 TParserPosition *ptr;
1482 while (prs->state->prev)
1484 ptr = prs->state->prev->prev;
1485 pfree(prs->state->prev);
1486 prs->state->prev = ptr;
1489 else if (item->flags & A_MERGE)
1490 { /* merge posinfo with current and pushed state */
1491 TParserPosition *ptr = prs->state;
1493 Assert(prs->state->prev);
1494 prs->state = prs->state->prev;
1496 prs->state->posbyte = ptr->posbyte;
1497 prs->state->poschar = ptr->poschar;
1498 prs->state->charlen = ptr->charlen;
1499 prs->state->lenbytetoken = ptr->lenbytetoken;
1500 prs->state->lenchartoken = ptr->lenchartoken;
1504 /* set new state if pointed */
/* TPS_Null as target means "stay in the current state" */
1505 if (item->tostate != TPS_Null)
1506 prs->state->state = item->tostate;
1508 /* check for go away */
/* true when a token was found, or input is exhausted and no rerun is requested */
1509 if ((item->flags & A_BINGO) ||
1510 (prs->state->posbyte >= prs->lenstr &&
1511 (item->flags & A_RERUN) == 0))
1514 /* go to beginning of loop if we should rerun or we just restore state */
1515 if (item->flags & (A_RERUN | A_POP))
/* advance one character, tracking byte and character counts separately */
1519 if (prs->state->charlen)
1521 prs->state->posbyte += prs->state->charlen;
1522 prs->state->lenbytetoken += prs->state->charlen;
1523 prs->state->poschar++;
1524 prs->state->lenchartoken++;
1528 return (item && (item->flags & A_BINGO)) ? true : false;
/*
 * prsd_lextype: return a palloc'd array of LexDescr entries, one per token
 * type 1..LASTNUM.
 *
 * Entry i-1 gets lexid i plus copies of tok_alias[i] and lex_descr[i].  One
 * extra entry with lexid 0 is appended (presumably as the end-of-list
 * marker for the caller).
 */
1532 prsd_lextype(PG_FUNCTION_ARGS)
1534 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
/* token type numbers are 1-based, the output array is 0-based */
1537 for (i = 1; i <= LASTNUM; i++)
1539 descr[i - 1].lexid = i;
1540 descr[i - 1].alias = pstrdup(tok_alias[i]);
1541 descr[i - 1].descr = pstrdup(lex_descr[i]);
1544 descr[LASTNUM].lexid = 0;
1546 PG_RETURN_POINTER(descr);
/*
 * prsd_start: create the parser state for one input.
 *
 * Argument 0 is passed to TParserInit as a char pointer and argument 1 as
 * an int32 (presumably the text and its length in bytes); the pointer
 * TParserInit returns is handed back to the caller.
 */
1550 prsd_start(PG_FUNCTION_ARGS)
1552 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
/*
 * prsd_nexttoken: report the next token of the parser passed in argument 0.
 *
 * Arguments 1 and 2 are output pointers: a char * slot and an int slot that
 * receives the token length in bytes (p->lenbytetoken).  The function
 * returns the token type stored in p->type.
 */
1556 prsd_nexttoken(PG_FUNCTION_ARGS)
1558 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1559 char **t = (char **) PG_GETARG_POINTER(1);
1560 int *tlen = (int *) PG_GETARG_POINTER(2);
1566 *tlen = p->lenbytetoken;
1568 PG_RETURN_INT32(p->type);
/*
 * prsd_end: takes the TParser pointer from argument 0 (presumably the one
 * returned by prsd_start) to finish the parse.
 */
1572 prsd_end(PG_FUNCTION_ARGS)
1574 TParser *p = (TParser *) PG_GETARG_POINTER(0);
/*
 * Token-type classes for headline generation.  Each macro tests a token
 * type number against a fixed set:
 *
 *   LEAVETOKEN, ENDPUNCTOKEN  SPACE only
 *   COMPLEXTOKEN              URL_T and the whole-hyphenated-word types
 *   TS_IDIGNORE               TAG_T, PROTOCOL, SPACE, XMLENTITY
 *   HLIDIGNORE                URL_T, TAG_T and the whole-hyphenated-word types
 *   XMLHLIDIGNORE             HLIDIGNORE without TAG_T
 *   NONWORDTOKEN              SPACE or anything in HLIDIGNORE; such tokens
 *                             are not counted as words by prsd_headline
 *   NOENDTOKEN                NONWORDTOKEN, the numeric types listed, or
 *                             TS_IDIGNORE; prsd_headline tests this on the
 *                             last word of a candidate fragment
 *
 * All arguments are parenthesized but evaluated several times, so pass
 * side-effect-free expressions only.
 */
1580 #define LEAVETOKEN(x) ( (x)==SPACE )
1581 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1582 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
1584 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
1585 #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1586 #define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1587 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
1588 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/* first word of the window being tested; hlCover points it at prs->words[*p] */
1592 HeadlineWordEntry *words;
/*
 * checkcondition_HL: callback handed to TS_execute by hlCover.
 *
 * checkval is an hlCheck describing a window of headline words.  The loop
 * looks for a word in that window whose item pointer is the query operand
 * val (presumably reporting a match when one is found).
 */
1597 checkcondition_HL(void *checkval, QueryOperand *val)
1601 for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1603 if (((hlCheck *) checkval)->words[i].item == val)
/*
 * hlCover: search prs->words for a range [*p, *q] that satisfies the query.
 *
 * The first pass walks the query items of type QI_VAL and scans forward
 * from pos for a word bound to each operand.  The second pass walks the
 * same items again, scanning backward from *q down to pos.  The resulting
 * window is wrapped in an hlCheck and evaluated with TS_execute, using
 * checkcondition_HL as the callback.  If that branch is not taken the
 * function calls itself again with the same p and q pointers, so the search
 * continues from the updated positions.
 */
1611 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1615 QueryItem *item = GETQUERY(query);
1621 for (j = 0; j < query->size; j++)
1623 if (item->type != QI_VAL)
1628 for (i = pos; i < prs->curwords; i++)
1630 if (prs->words[i].item == &item->operand)
1643 item = GETQUERY(query);
1644 for (j = 0; j < query->size; j++)
1646 if (item->type != QI_VAL)
1651 for (i = *q; i >= pos; i--)
1653 if (prs->words[i].item == &item->operand)
/* test the candidate window [*p, *q] against the whole query */
1667 ch.words = &(prs->words[*p]);
1668 ch.len = *q - *p + 1;
1669 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1674 return hlCover(prs, query, p, q);
/*
 * prsd_headline: choose the fragment of a parsed document to show as a
 * headline and mark its words.
 *
 * Argument 0 is the HeadlineParsedText to annotate (also the return value),
 * argument 1 a List of DefElem options and argument 2 the TSQuery.
 *
 * Recognized options, matched case-insensitively: MaxWords, MinWords,
 * ShortWord, StartSel, StopSel and HighlightAll.  An unknown name raises
 * "unrecognized headline parameter".
 *
 * Candidate fragments come from hlCover; each is measured in words
 * (NONWORDTOKEN types are not counted) and compared with the best so far.
 * Words of the chosen range get their selected / replace / in flags set,
 * and startsel / stopsel with their lengths are stored in prs.
 */
1682 prsd_headline(PG_FUNCTION_ARGS)
1684 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
1685 List *prsoptions = (List *) PG_GETARG_POINTER(1);
1686 TSQuery query = PG_GETARG_TSQUERY(2);
1688 /* from opt + start and end tag */
1708 prs->startsel = NULL;
1709 prs->stopsel = NULL;
/* parse the option list; numeric values go through pg_atoi */
1710 foreach(l, prsoptions)
1712 DefElem *defel = (DefElem *) lfirst(l);
1713 char *val = defGetString(defel);
1715 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1716 max_words = pg_atoi(val, sizeof(int32), 0);
1717 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1718 min_words = pg_atoi(val, sizeof(int32), 0);
1719 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1720 shortword = pg_atoi(val, sizeof(int32), 0);
1721 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
1722 prs->startsel = pstrdup(val);
1723 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
1724 prs->stopsel = pstrdup(val);
1725 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1726 highlight = (pg_strcasecmp(val, "1") == 0 ||
1727 pg_strcasecmp(val, "on") == 0 ||
1728 pg_strcasecmp(val, "true") == 0 ||
1729 pg_strcasecmp(val, "t") == 0 ||
1730 pg_strcasecmp(val, "y") == 0 ||
1731 pg_strcasecmp(val, "yes") == 0);
1734 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1735 errmsg("unrecognized headline parameter: \"%s\"",
/* sanity checks on the numeric options */
1741 if (min_words >= max_words)
1743 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1744 errmsg("MinWords should be less than MaxWords")));
1747 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1748 errmsg("MinWords should be positive")));
1751 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1752 errmsg("ShortWord should be >= 0")));
/* examine each cover [p, q] reported by hlCover */
1754 while (hlCover(prs, query, &p, &q))
1756 /* find cover len in words */
1759 for (i = p; i <= q && curlen < max_words; i++)
1761 if (!NONWORDTOKEN(prs->words[i].type))
1763 if (prs->words[i].item && !prs->words[i].repeated)
1768 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1770 /* best already found, so try one more cover */
1776 if (curlen < max_words)
1777 { /* find good end */
1778 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1782 if (!NONWORDTOKEN(prs->words[i].type))
1784 if (prs->words[i].item && !prs->words[i].repeated)
1788 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1790 if (curlen >= min_words)
1793 if (curlen < min_words && i >= prs->curwords)
1794 { /* got end of text and our cover is shorter
1796 for (i = p; i >= 0; i--)
1798 if (!NONWORDTOKEN(prs->words[i].type))
1800 if (prs->words[i].item && !prs->words[i].repeated)
1802 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1804 if (curlen >= min_words)
1807 posb = (i >= 0) ? i : 0;
1811 { /* shorter cover :((( */
1812 for (; curlen > min_words; i--)
1814 if (!NONWORDTOKEN(prs->words[i].type))
1816 if (prs->words[i].item && !prs->words[i].repeated)
1819 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1825 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
1826 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
1827 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
/* count words from the start of the text (presumably the fallback when no cover was chosen) */
1840 for (i = 0; i < prs->curwords && curlen < min_words; i++)
1842 if (!NONWORDTOKEN(prs->words[i].type))
1853 beste = prs->curwords - 1;
/* flag every word of the chosen range [bestb, beste] */
1856 for (i = bestb; i <= beste; i++)
1858 if (prs->words[i].item)
1859 prs->words[i].selected = 1;
1862 if (HLIDIGNORE(prs->words[i].type))
1863 prs->words[i].replace = 1;
1867 if (XMLHLIDIGNORE(prs->words[i].type))
1868 prs->words[i].replace = 1;
1871 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
/* "<b>" and "</b>" are the built-in selection markers */
1875 prs->startsel = pstrdup("<b>");
1877 prs->stopsel = pstrdup("</b>");
1878 prs->startsellen = strlen(prs->startsel);
1879 prs->stopsellen = strlen(prs->stopsel);
1881 PG_RETURN_POINTER(prs);