From: Matt Wilmas Date: Tue, 5 May 2009 01:35:44 +0000 (+0000) Subject: MFH: Implemented manual scanning for strings/comments, plus misc. fixes X-Git-Tag: php-5.3.0RC2~11 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=09034cf3f495f980b035922717ab7dcbd6bcc62c;p=php MFH: Implemented manual scanning for strings/comments, plus misc. fixes --- diff --git a/NEWS b/NEWS index b49bfa9a54..10c0e5b4c1 100644 --- a/NEWS +++ b/NEWS @@ -59,6 +59,8 @@ PHP NEWS - Fixed bug #47038 (Memory leak in include). (Dmitry) - Fixed bug #47021 (SoapClient stumbles over WSDL delivered with "Transfer-Encoding: chunked"). (Dmitry) +- Fixed bug #46817 (tokenizer misses last single-line comment (PHP 5.3+, with + re2c lexer)). (Matt, Shire) - Fixed bug #46108 (DateTime - Memory leak when unserializing). (Felipe) - Fixed bug #44861 (scrollable cursor don't work with pgsql). (Matteo) - Fixed bug #44409 (PDO::FETCH_SERIALIZE calls __construct()). (Matteo) diff --git a/Zend/zend_highlight.c b/Zend/zend_highlight.c index 8a885f7200..2c27cda3a9 100644 --- a/Zend/zend_highlight.c +++ b/Zend/zend_highlight.c @@ -142,14 +142,8 @@ ZEND_API void zend_highlight(zend_syntax_highlighter_ini *syntax_highlighter_ini zend_printf("<span style=\"color: %s\">", last_color); } } - switch (token_type) { - case T_END_HEREDOC: - zend_html_puts(token.value.str.val, token.value.str.len TSRMLS_CC); - break; - default: - zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC); - break; - } + + zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC); if (token.type == IS_STRING) { switch (token_type) { @@ -170,19 +164,6 @@ ZEND_API void zend_highlight(zend_syntax_highlighter_ini *syntax_highlighter_ini token.type = 0; } - /* handler for trailing comments, see bug #42767 */ - if (LANG_SCNG(yy_leng) && LANG_SCNG(yy_text) < LANG_SCNG(yy_limit)) { - if (last_color != syntax_highlighter_ini->highlight_comment) { - if (last_color != syntax_highlighter_ini->highlight_html) { - zend_printf("</span>"); - } - if 
(syntax_highlighter_ini->highlight_comment != syntax_highlighter_ini->highlight_html) { - zend_printf("<span style=\"color: %s\">", syntax_highlighter_ini->highlight_comment); - } - } - zend_html_puts(LANG_SCNG(yy_text), (LANG_SCNG(yy_limit) - LANG_SCNG(yy_text)) TSRMLS_CC); - } - if (last_color != syntax_highlighter_ini->highlight_html) { zend_printf("</span>\n"); } diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index 052c39084a..6717521ebd 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -109,6 +109,12 @@ do { \ } \ } +/* To save initial string length after scanning to first variable, CG(doc_comment_len) can be reused */ +#define SET_DOUBLE_QUOTES_SCANNED_LENGTH(len) CG(doc_comment_len) = (len) +#define GET_DOUBLE_QUOTES_SCANNED_LENGTH() CG(doc_comment_len) + +#define IS_LABEL_START(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || (c) == '_' || (c) >= 0x7F) + #define ZEND_IS_OCT(c) ((c)>='0' && (c)<='7') #define ZEND_IS_HEX(c) (((c)>='0' && (c)<='9') || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F')) @@ -835,63 +841,8 @@ LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]* WHITESPACE [ \n\r\t]+ TABS_AND_SPACES [ \t]* TOKENS [;:,.\[\]()|^&+-/*=%!~$<>?@] -ANY_CHAR [^\x00] +ANY_CHAR [^] NEWLINE ("\r"|"\n"|"\r\n") -NULL [\x00]{1} - -/* - * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character - * or a { and therefore will be taken literally. The case of literal $ before - * a variable or "${" is handled in a rule for each string type - */ -DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$"\\{]|("\\"{ANY_CHAR}))) -BACKQUOTE_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR}))) -HEREDOC_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$\n\r\\{]|("\\"[^\n\r]))) - -/* - * Usually, HEREDOC_NEWLINE will just function like a simple NEWLINE, but some - * special cases need to be handled. HEREDOC_CHARS doesn't allow a line to - * match when { or $, and/or \ is at the end. (("{"*|"$"*)"\\"?) 
handles that, - * along with cases where { or $, and/or \ is the ONLY thing on a line - * - * The other case is when a line contains a label, followed by ONLY - * { or $, and/or \ Handled by ({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\")) - */ -HEREDOC_NEWLINE ((({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))|(("{"*|"$"*)"\\"?)){NEWLINE}) - -/* - * This pattern is just used in the next 2 for matching { or literal $, and/or - * \ escape sequence immediately at the beginning of a line or after a label - */ -HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR (("{"+[^$\n\r\\{])|("{"*"\\"[^\n\r])|{HEREDOC_LITERAL_DOLLAR}) - -/* - * These 2 label-related patterns allow HEREDOC_CHARS to continue "regular" - * matching after a newline that starts with either a non-label character or a - * label that isn't followed by a newline. Like HEREDOC_CHARS, they won't match - * a variable or "{$" Matching a newline, and possibly label, up TO a variable - * or "{$", is handled in the heredoc rules - * - * The HEREDOC_LABEL_NO_NEWLINE pattern (";"[^$\n\r\\{]) handles cases where ; - * follows a label. 
[^a-zA-Z0-9_\x7f-\xff;$\n\r\\{] is needed to prevent a label - * character or ; from matching on a possible (real) ending label - */ -HEREDOC_NON_LABEL ([^a-zA-Z_\x7f-\xff$\n\r\\{]|{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}) -HEREDOC_LABEL_NO_NEWLINE ({LABEL}([^a-zA-Z0-9_\x7f-\xff;$\n\r\\{]|(";"[^$\n\r\\{])|(";"?{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}))) - -/* - * CHARS matches everything up to a variable or "{$" - * {'s are matched as long as they aren't followed by a $ - * The case of { before "{$" is handled in a rule for each string type - * - * For heredocs, matching continues across/after newlines if/when it's known - * that the next line doesn't contain a possible ending label - */ -DOUBLE_QUOTES_CHARS ("{"*([^$"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR}) -BACKQUOTE_CHARS ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR}) -HEREDOC_CHARS ("{"*([^$\n\r\\{]|("\\"[^\n\r]))|{HEREDOC_LITERAL_DOLLAR}|({HEREDOC_NEWLINE}+({HEREDOC_NON_LABEL}|{HEREDOC_LABEL_NO_NEWLINE}))) - -NOWDOC_CHARS ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA-Z0-9_\x7f-\xff;\n\r]|(";"[^\n\r]))))) /* compute yyleng before each rule */ := yyleng = YYCURSOR - SCNG(yy_text); @@ -1530,6 +1481,14 @@ NOWDOC_CHARS ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA- } "" { + YYCTYPE *bracket = zend_memrchr(yytext, '<', yyleng - (sizeof("script language=php>") - 1)); + + if (bracket != SCNG(yy_text)) { + /* Handle previously scanned HTML, as possible