From: Dmitry Stogov Date: Fri, 18 May 2007 13:12:47 +0000 (+0000) Subject: Improved compilation of heredocs and interpolated strings. (Matt) X-Git-Tag: RELEASE_1_4~139 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5dc51fecf8403d9f34d852c904f42e42d53a594f;p=php Improved compilation of heredocs and interpolated strings. (Matt) --- diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index 18644d1076..1740564b96 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -989,25 +989,25 @@ void zend_do_init_string(znode *result TSRMLS_DC) } -void zend_do_add_char(znode *result, znode *op1, znode *op2 TSRMLS_DC) -{ - zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); - - opline->opcode = ZEND_ADD_CHAR; - opline->op1 = *op1; - opline->op2 = *op2; - opline->op2.op_type = IS_CONST; - opline->extended_value = CG(literal_type); - opline->result = opline->op1; - *result = opline->result; -} - - void zend_do_add_string(znode *result, znode *op1, znode *op2 TSRMLS_DC) { - zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); + zend_op *opline; - opline->opcode = ZEND_ADD_STRING; + if (Z_UNILEN(op2->u.constant) > 1) { + opline = get_next_op(CG(active_op_array) TSRMLS_CC); + opline->opcode = ZEND_ADD_STRING; + } else if (Z_UNILEN(op2->u.constant) == 1) { + int ch = (Z_TYPE(op2->u.constant) == IS_UNICODE) ? *Z_USTRVAL(op2->u.constant) : *Z_STRVAL(op2->u.constant); + + /* Free memory and use ZEND_ADD_CHAR in case of 1 character strings */ + efree(Z_UNIVAL(op2->u.constant).v); + ZVAL_LONG(&op2->u.constant, ch); + opline = get_next_op(CG(active_op_array) TSRMLS_CC); + opline->opcode = ZEND_ADD_CHAR; + } else { /* String can be empty after a variable at the end of a heredoc */ + efree(Z_UNIVAL(op2->u.constant).v); + return; + } opline->op1 = *op1; opline->op2 = *op2; opline->op2.op_type = IS_CONST; @@ -4154,33 +4154,6 @@ void zend_do_declare_end(znode *declare_token TSRMLS_DC) } -void zend_do_end_heredoc(TSRMLS_D) -{ - int opline_num = get_next_op_number(CG(active_op_array))-1; - zend_op *opline = &CG(active_op_array)->opcodes[opline_num]; - - if (opline->opcode != ZEND_ADD_STRING) { - return; - } - - if (Z_TYPE(opline->op2.u.constant) == IS_UNICODE) { - Z_USTRVAL(opline->op2.u.constant)[(Z_USTRLEN(opline->op2.u.constant)--)-1] = 0; - if (Z_USTRLEN(opline->op2.u.constant)>0) { - if (Z_USTRVAL(opline->op2.u.constant)[Z_USTRLEN(opline->op2.u.constant)-1]=='\r') { - Z_USTRVAL(opline->op2.u.constant)[(Z_USTRLEN(opline->op2.u.constant)--)-1] = 0; - } - } - } else { - Z_STRVAL(opline->op2.u.constant)[(Z_STRLEN(opline->op2.u.constant)--)-1] = 0; - if (Z_STRLEN(opline->op2.u.constant)>0) { - if (Z_STRVAL(opline->op2.u.constant)[Z_STRLEN(opline->op2.u.constant)-1]=='\r') { - Z_STRVAL(opline->op2.u.constant)[(Z_STRLEN(opline->op2.u.constant)--)-1] = 0; - } - } - } -} - - void zend_do_exit(znode *result, znode *message TSRMLS_DC) { zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); @@ -4425,12 +4398,12 @@ int zendlex(znode *zendlval TSRMLS_DC) { int retval; -again: if (CG(increment_lineno)) { CG(zend_lineno)++; CG(increment_lineno) = 0; } +again: Z_TYPE(zendlval->u.constant) = IS_LONG; retval = lex_scan(&zendlval->u.constant TSRMLS_CC); switch (retval) { @@ -4441,8 +4414,7 @@ again: goto again; case T_CLOSE_TAG: - if (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1]=='\n' - || (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-2]=='\r' && LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1])) { + if (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1] != '>') { CG(increment_lineno) = 1; } retval = ';'; /* implicit ; */ diff --git a/Zend/zend_compile.h b/Zend/zend_compile.h index dd965f8143..d5dc4aa615 100644 --- a/Zend/zend_compile.h +++ b/Zend/zend_compile.h @@ -404,7 +404,6 @@ void zend_check_writable_variable(znode *variable); void zend_do_free(znode *op1 TSRMLS_DC); void zend_do_init_string(znode *result TSRMLS_DC); -void zend_do_add_char(znode *result, znode *op1, znode *op2 TSRMLS_DC); void zend_do_add_string(znode *result, znode *op1, znode *op2 TSRMLS_DC); void zend_do_add_variable(znode *result, znode *op1, znode *op2 TSRMLS_DC); @@ -499,8 +498,6 @@ void zend_do_declare_begin(TSRMLS_D); void zend_do_declare_stmt(znode *var, znode *val TSRMLS_DC); void zend_do_declare_end(znode *declare_token TSRMLS_DC); -void zend_do_end_heredoc(TSRMLS_D); - void zend_do_exit(znode *result, znode *message TSRMLS_DC); void zend_do_begin_silence(znode *strudel_token TSRMLS_DC); diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index 06daec9e10..0167453943 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -24,8 +24,6 @@ * LALR shift/reduce conflicts and how they are resolved: * * - 2 shift/reduce conflicts due to the dangeling elseif/else ambiguity. Solved by shift. - * - 1 shift/reduce conflict due to arrays within encapsulated strings. Solved by shift. - * - 1 shift/reduce conflict due to objects within encapsulated strings. Solved by shift. * */ @@ -49,7 +47,7 @@ %} %pure_parser -%expect 4 +%expect 2 %left T_INCLUDE T_INCLUDE_ONCE T_EVAL T_REQUIRE T_REQUIRE_ONCE %left ',' @@ -718,9 +716,9 @@ scalar: | class_constant { $$ = $1; } | common_scalar { $$ = $1; } | '"' { CG(literal_type) = UG(unicode)?IS_UNICODE:IS_STRING; } encaps_list '"' { $$ = $3; } - | T_START_HEREDOC { CG(literal_type) = UG(unicode)?IS_UNICODE:IS_STRING; } encaps_list T_END_HEREDOC { $$ = $3; zend_do_end_heredoc(TSRMLS_C); } + | T_START_HEREDOC { CG(literal_type) = UG(unicode)?IS_UNICODE:IS_STRING; } encaps_list T_END_HEREDOC { $$ = $3; } | T_BINARY_DOUBLE { CG(literal_type) = IS_STRING; } encaps_list '"' { $$ = $3; } - | T_BINARY_HEREDOC { CG(literal_type) = IS_STRING; } encaps_list T_END_HEREDOC { $$ = $3; zend_do_end_heredoc(TSRMLS_C); } + | T_BINARY_HEREDOC { CG(literal_type) = IS_STRING; } encaps_list T_END_HEREDOC { $$ = $3; } ; @@ -879,16 +877,7 @@ non_empty_array_pair_list: encaps_list: encaps_list encaps_var { zend_do_end_variable_parse(BP_VAR_R, 0 TSRMLS_CC); zend_do_add_variable(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_STRING { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_NUM_STRING { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } | encaps_list T_ENCAPSED_AND_WHITESPACE { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_CHARACTER { zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_BAD_CHARACTER { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '[' { Z_LVAL($2.u.constant) = (long) '['; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list ']' { Z_LVAL($2.u.constant) = (long) ']'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '{' { Z_LVAL($2.u.constant) = (long) '{'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '}' { Z_LVAL($2.u.constant) = (long) '}'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_OBJECT_OPERATOR { znode tmp; Z_LVAL($2.u.constant) = (long) '-'; zend_do_add_char(&tmp, &$1, &$2 TSRMLS_CC); Z_LVAL($2.u.constant) = (long) '>'; zend_do_add_char(&$$, &tmp, &$2 TSRMLS_CC); } | /* empty */ { zend_do_init_string(&$$ TSRMLS_CC); } ; diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index b992324dbb..718867bda4 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -37,8 +37,11 @@ %x ST_DOUBLE_QUOTES %x ST_BACKQUOTE %x ST_HEREDOC +%x ST_START_HEREDOC +%x ST_END_HEREDOC %x ST_LOOKING_FOR_PROPERTY %x ST_LOOKING_FOR_VARNAME +%x ST_VAR_OFFSET %x ST_COMMENT %x ST_DOC_COMMENT %x ST_ONE_LINE_COMMENT @@ -99,9 +102,7 @@ do { \ char *p = (s), *boundary = p+(l); \ \ while (p= 0) { + min_digits = 1; + max_digits = 2; + Z_USTRLEN_P(zendlval)--; + s++; + n = 1; /* already have one digit */ + codepoint = digit; + } else { + *t++ = 0x5C; /*'\\'*/ + *t++ = *s; + } + break; default: digit = zend_get_octal_digit(*s); if (digit >= 0) { @@ -1118,14 +1099,6 @@ int zend_scan_unicode_double_string(zval *zendlval TSRMLS_DC) bits = 3; n = 1; /* already have one digit */ codepoint = digit; - } else if (c == 0x78 /*'x'*/ - && (s+1) < end && (digit = zend_get_hex_digit(*(s+1))) >= 0) { - min_digits = 1; - max_digits = 2; - Z_USTRLEN_P(zendlval)--; - s++; - n = 1; /* already have one digit */ - codepoint = digit; } else { *t++ = 0x5C; /*'\\'*/ *t++ = *s; @@ -1163,26 +1136,30 @@ int zend_scan_unicode_double_string(zval *zendlval TSRMLS_DC) efree(Z_USTRVAL_P(zendlval)); return 0; } - } else { - s++; + + /* s is already incremented and not past a newline */ + continue; } } else { - *t++ = *s++; + *t++ = *s; } + + if (*s == 0x0A /*'\n'*/ || (*s == 0x0D /*'\r'*/ && (*(s+1) != 0x0A /*'\n'*/))) { + CG(zend_lineno)++; + } + s++; } *t = 0; - return T_CONSTANT_ENCAPSED_STRING; + return type; } -int zend_scan_unicode_single_string(zval *zendlval TSRMLS_DC) +static int zend_scan_unicode_single_string(zval *zendlval TSRMLS_DC) { register UChar *s, *t; UChar *end; UChar32 codepoint = 0; - HANDLE_NEWLINES(yytext, yyleng); - if (!zend_copy_scanner_string(zendlval, yytext+1, yyleng-2, IS_UNICODE, SCNG(output_conv) TSRMLS_CC)) { return 0; } @@ -1265,25 +1242,26 @@ int zend_scan_unicode_single_string(zval *zendlval TSRMLS_DC) *t++ = *s; break; } - s++; } else { - *t++ = *s++; + *t++ = *s; } + + if (*s == 0x0A /*'\n'*/ || (*s == 0x0D /*'\r'*/ && (*(s+1) != 0x0A /*'\n'*/))) { + CG(zend_lineno)++; + } + s++; } *t = 0; return T_CONSTANT_ENCAPSED_STRING; } -int zend_scan_binary_double_string(zval *zendlval, int bprefix TSRMLS_DC) +static void zend_scan_binary_escape_string(zval *zendlval, char *str, int len, char quote_type TSRMLS_DC) { register char *s, *t; char *end; - Z_STRVAL_P(zendlval) = estrndup(yytext+bprefix+1, yyleng-bprefix-2); - Z_STRLEN_P(zendlval) = yyleng-bprefix-2; - Z_TYPE_P(zendlval) = IS_STRING; - HANDLE_NEWLINES(yytext, yyleng); + ZVAL_STRINGL(zendlval, str, len, 1); /* convert escape sequences */ s = t = Z_STRVAL_P(zendlval); @@ -1307,12 +1285,37 @@ int zend_scan_binary_double_string(zval *zendlval, int bprefix TSRMLS_DC) *t++ = '\t'; Z_STRLEN_P(zendlval)--; break; + case '"': + case '`': + if (*s != quote_type) { + *t++ = '\\'; + *t++ = *s; + break; + } case '\\': case '$': - case '"': *t++ = *s; Z_STRLEN_P(zendlval)--; break; + case 'x': + case 'X': + if (ZEND_IS_HEX(*(s+1))) { + char hex_buf[3] = { 0, 0, 0 }; + + Z_STRLEN_P(zendlval)--; /* for the 'x' */ + + hex_buf[0] = *(++s); + Z_STRLEN_P(zendlval)--; + if (ZEND_IS_HEX(*(s+1))) { + hex_buf[1] = *(++s); + Z_STRLEN_P(zendlval)--; + } + *t++ = (char) strtol(hex_buf, NULL, 16); + } else { + *t++ = '\\'; + *t++ = *s; + } + break; default: /* check for an octal */ if (ZEND_IS_OCT(*s)) { @@ -1320,52 +1323,39 @@ int zend_scan_binary_double_string(zval *zendlval, int bprefix TSRMLS_DC) octal_buf[0] = *s; Z_STRLEN_P(zendlval)--; - if ((s+1)?@] -ENCAPSED_TOKENS [\[\]{}$] -ESCAPED_AND_WHITESPACE [\n\t\r #'.:;,()|^&+-/*=%!~<>?@]+ ANY_CHAR (.|[\n]) NEWLINE ("\r"|"\n"|"\r\n") +/* + * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character + * or a { and therefore will be taken literally. The case of literal $ before + * a variable or "${" is handled in a rule for each string type + */ +DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$"\\{]|("\\"{ANY_CHAR}))) +BACKQUOTE_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR}))) +HEREDOC_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$\n\r\\{]|("\\"[^\n\r]))) + +/* + * Usually, HEREDOC_NEWLINE will just function like a simple NEWLINE, but some + * special cases need to be handled. HEREDOC_CHARS doesn't allow a line to + * match when { or $, and/or \ is at the end. (("{"*|"$"*)"\\"?) handles that, + * along with cases where { or $, and/or \ is the ONLY thing on a line + * + * The other case is when a line contains a label, followed by ONLY + * { or $, and/or \ Handled by ({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\")) + */ +HEREDOC_NEWLINE ((({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))|(("{"*|"$"*)"\\"?)){NEWLINE}) + +/* + * This pattern is just used in the next 2 for matching { or literal $, and/or + * \ escape sequence immediately at the beginning of a line or after a label + */ +HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR (("{"+[^$\n\r\\{])|("{"*"\\"[^\n\r])|{HEREDOC_LITERAL_DOLLAR}) + +/* + * These 2 label-related patterns allow HEREDOC_CHARS to continue "regular" + * matching after a newline that starts with either a non-label character or a + * label that isn't followed by a newline. Like HEREDOC_CHARS, they won't match + * a variable or "{$" Matching a newline, and possibly label, up TO a variable + * or "{$", is handled in the heredoc rules + * + * The HEREDOC_LABEL_NO_NEWLINE pattern (";"[^$\n\r\\{]) handles cases where ; + * follows a label. [^a-zA-Z0-9_\x7f-\xff;$\n\r\\{] is needed to prevent a label + * character or ; from matching on a possible (real) ending label + */ +HEREDOC_NON_LABEL ([^a-zA-Z_\x7f-\xff$\n\r\\{]|{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}) +HEREDOC_LABEL_NO_NEWLINE ({LABEL}([^a-zA-Z0-9_\x7f-\xff;$\n\r\\{]|(";"[^$\n\r\\{])|(";"?{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}))) + +/* + * CHARS matches everything up to a variable or "{$" + * {'s are matched as long as they aren't followed by a $ + * The case of { before "{$" is handled in a rule for each string type + * + * For heredocs, matching continues across/after newlines if/when it's known + * that the next line doesn't contain a possible ending label + */ +DOUBLE_QUOTES_CHARS ("{"*([^$"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR}) +BACKQUOTE_CHARS ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR}) +HEREDOC_CHARS ("{"*([^$\n\r\\{]|("\\"[^\n\r]))|{HEREDOC_LITERAL_DOLLAR}|({HEREDOC_NEWLINE}+({HEREDOC_NON_LABEL}|{HEREDOC_LABEL_NO_NEWLINE}))) + %option noyylineno %option noyywrap %% @@ -1560,11 +1602,15 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_IMPLEMENTS; } -"->" { +"->" { yy_push_state(ST_LOOKING_FOR_PROPERTY TSRMLS_CC); return T_OBJECT_OPERATOR; } +"->" { + return T_OBJECT_OPERATOR; +} + {LABEL} { yy_pop_state(TSRMLS_C); if (!zend_copy_scanner_string(zendlval, yytext, yyleng, UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) { @@ -1906,7 +1952,19 @@ NEWLINE ("\r"|"\n"|"\r\n") } } -{LNUM}|{HNUM} { /* treat numbers (almost) as strings inside encapsulated strings */ +0|([1-9][0-9]*) { /* Offset could be treated as a long */ + if (yyleng < MAX_LENGTH_OF_LONG - 1 || (yyleng == MAX_LENGTH_OF_LONG - 1 && strcmp(yytext, long_min_digits) < 0)) { + Z_LVAL_P(zendlval) = strtol(yytext, NULL, 10); + Z_TYPE_P(zendlval) = IS_LONG; + } else { + if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { + return 0; + } + } + return T_NUM_STRING; +} + +{LNUM}|{HNUM} { /* Offset must be treated as a string */ if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { return 0; } @@ -2080,7 +2138,40 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_OPEN_TAG; } -"$"{LABEL} { +"$"{LABEL} { + if (!zend_copy_scanner_string(zendlval, (yytext+1), (yyleng-1), UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) { + return 0; + } + if (UG(unicode) && !zend_check_and_normalize_identifier(zendlval)) { + return 0; + } + return T_VARIABLE; +} + +%{ +/* Make sure a label character follows "->", otherwise there is no property + * and "->" will be taken literally + */ %} +"$"{LABEL}"->"[a-zA-Z_\x7f-\xff] { + yyless(yyleng - 3); + yy_push_state(ST_LOOKING_FOR_PROPERTY TSRMLS_CC); + + if (!zend_copy_scanner_string(zendlval, (yytext+1), (yyleng-1), UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) { + return 0; + } + if (UG(unicode) && !zend_check_and_normalize_identifier(zendlval)) { + return 0; + } + return T_VARIABLE; +} + +%{ +/* A [ always designates a variable offset, regardless of what follows + */ %} +"$"{LABEL}"[" { + yyless(yyleng - 1); + yy_push_state(ST_VAR_OFFSET TSRMLS_CC); + if (!zend_copy_scanner_string(zendlval, (yytext+1), (yyleng-1), UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) { return 0; } @@ -2090,6 +2181,21 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_VARIABLE; } +"]" { + yy_pop_state(TSRMLS_C); + return ']'; +} + +{TOKENS}|[{}] { + /* Only '[' can be valid, but returning other tokens will allow a more explicit parse error */ + return yytext[0]; +} + +[ \n\r\t'"`\\#] { + yyless(0); + yy_pop_state(TSRMLS_C); +} + {LABEL} { if (!zend_copy_scanner_string(zendlval, yytext, yyleng, UG(unicode)?IS_UNICODE:IS_STRING, SCNG(output_conv) TSRMLS_CC)) { return 0; @@ -2100,7 +2206,7 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_STRING; } -{LABEL} { +{LABEL} { if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { return 0; } @@ -2230,37 +2336,44 @@ NEWLINE ("\r"|"\n"|"\r\n") } -(["]([^$"\\]|("\\".))*["]) { +%{ +/* ("{"*|"$"*) handles { or $ at the end of a string (or the entire contents) + */ %} +(["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) { if (UG(unicode)) { - return zend_scan_unicode_double_string(zendlval TSRMLS_CC); + return zend_scan_unicode_escape_string(zendlval, yytext+1, yyleng-2, 0x22 /*'"'*/, T_CONSTANT_ENCAPSED_STRING TSRMLS_CC); } else { - return zend_scan_binary_double_string(zendlval, 0 TSRMLS_CC); + zend_scan_binary_escape_string(zendlval, yytext+1, yyleng-2, '"' TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; } } -(b["]([^$"\\]|("\\".))*["]) { - return zend_scan_binary_double_string(zendlval, 1 TSRMLS_CC); +(b["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) { + zend_scan_binary_escape_string(zendlval, yytext+2, yyleng-3, '"' TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; } -([']([^'\\]|("\\".))*[']) { +([']([^'\\]|("\\"{ANY_CHAR}))*[']) { if (UG(unicode)) { return zend_scan_unicode_single_string(zendlval TSRMLS_CC); } else { - return zend_scan_binary_single_string(zendlval, 0 TSRMLS_CC); + zend_scan_binary_single_string(zendlval, yytext+1, yyleng-2 TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; } } -("b'"([^'\\]|("\\".))*[']) { - return zend_scan_binary_single_string(zendlval, 1 TSRMLS_CC); +("b'"([^'\\]|("\\"{ANY_CHAR}))*[']) { + zend_scan_binary_single_string(zendlval, yytext+2, yyleng-3 TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; } ["] { BEGIN(ST_DOUBLE_QUOTES); - return '\"'; + return '"'; } b["] { @@ -2278,7 +2391,7 @@ NEWLINE ("\r"|"\n"|"\r\n") CG(heredoc_len)--; } CG(heredoc) = estrndup(s, CG(heredoc_len)); - BEGIN(ST_HEREDOC); + BEGIN(ST_START_HEREDOC); return T_BINARY_HEREDOC; } @@ -2293,7 +2406,7 @@ NEWLINE ("\r"|"\n"|"\r\n") CG(heredoc_len)--; } CG(heredoc) = estrndup(s, CG(heredoc_len)); - BEGIN(ST_HEREDOC); + BEGIN(ST_START_HEREDOC); return T_START_HEREDOC; } @@ -2304,204 +2417,180 @@ NEWLINE ("\r"|"\n"|"\r\n") } -^{LABEL}(";")?{NEWLINE} { - int label_len; +{ANY_CHAR} { + yyless(0); + BEGIN(ST_HEREDOC); +} - if (yytext[yyleng-2]=='\r') { - label_len = yyleng-2; - } else { - label_len = yyleng-1; - } +{LABEL}";"?[\n\r] { + int label_len = yyleng - 1; if (yytext[label_len-1]==';') { label_len--; } + yyless(label_len); + if (label_len==CG(heredoc_len) && !memcmp(yytext, CG(heredoc), label_len)) { - Z_STRVAL_P(zendlval) = estrndup(yytext, label_len); /* unput destroys yytext */ + Z_STRVAL_P(zendlval) = CG(heredoc); Z_STRLEN_P(zendlval) = label_len; - yyless(yyleng - (yyleng - label_len)); - efree(CG(heredoc)); CG(heredoc)=NULL; CG(heredoc_len)=0; BEGIN(ST_IN_SCRIPTING); return T_END_HEREDOC; } else { - CG(zend_lineno)++; - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - return T_STRING; + yymore(); + BEGIN(ST_HEREDOC); } } +%{ +/* Match everything up to and including a possible ending label, so if the label + * doesn't match, it's kept with the rest of the string + * + * {HEREDOC_NEWLINE}+ handles the case of more than one newline sequence that + * couldn't be matched with HEREDOC_CHARS, because of the following label + */ %} +{HEREDOC_CHARS}*{HEREDOC_NEWLINE}+{LABEL}";"?[\n\r] { + char *end = yytext + yyleng - 1; -{ESCAPED_AND_WHITESPACE} { - HANDLE_NEWLINES(yytext, yyleng); - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; + if (end[-1] == ';') { + end--; + yyleng--; } - return T_ENCAPSED_AND_WHITESPACE; -} -[`]+ { - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - return T_ENCAPSED_AND_WHITESPACE; -} + if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) { + int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */ + if (len > 0 && yytext[len - 1] == '\r' && yytext[len] == '\n') { + len--; + } -["]+ { - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - return T_ENCAPSED_AND_WHITESPACE; -} + /* Go back before last label char, to match in ST_END_HEREDOC state */ + yyless(yyleng - 2); + /* Subtract the remaining label length. yyleng must include newline + * before label, for zend_highlight/strip, tokenizer, etc. */ + yyleng -= CG(heredoc_len) - 1; -"$"[^a-zA-Z_\x7f-\xff{] { - Z_LVAL_P(zendlval) = (long) yytext[0]; - if (yyleng == 2) { - yyless(1); + CG(increment_lineno) = 1; /* For newline before label */ + BEGIN(ST_END_HEREDOC); + + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, len, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + } else { + zend_scan_binary_escape_string(zendlval, yytext, len, 0 TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; + } + } else { + /* Go back to end of label, so there's something to match again in case + * there's a variable at the beginning of the next line */ + yyless(yyleng - 1); + yymore(); } - return T_CHARACTER; } - -{ENCAPSED_TOKENS} { - Z_LVAL_P(zendlval) = (long) yytext[0]; - return yytext[0]; +{ANY_CHAR} { + Z_STRVAL_P(zendlval) = CG(heredoc); + Z_STRLEN_P(zendlval) = CG(heredoc_len); + yytext = Z_STRVAL_P(zendlval); + yyleng = Z_STRLEN_P(zendlval); + CG(heredoc) = NULL; + CG(heredoc_len) = 0; + BEGIN(ST_IN_SCRIPTING); + return T_END_HEREDOC; } + "{$" { - Z_LVAL_P(zendlval) = (long) yytext[0]; + Z_LVAL_P(zendlval) = (long) '{'; yy_push_state(ST_IN_SCRIPTING TSRMLS_CC); yyless(1); return T_CURLY_OPEN; } -"\\\"" { - Z_LVAL_P(zendlval) = (long) '"'; - return T_CHARACTER; -} - -"\\`" { - Z_LVAL_P(zendlval) = (long) '`'; - return T_CHARACTER; +{DOUBLE_QUOTES_CHARS}+ { + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x22 /*'"'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + } else { + zend_scan_binary_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; + } } -"\\"[0-7]{1,3} { - Z_LVAL_P(zendlval) = strtol(yytext+1, NULL, 8); - return T_CHARACTER; -} +%{ +/* "{"{2,}|"$"{2,} handles { before "{$" or literal $ before a variable or "${" + * (("{"+|"$"+)["]) handles { or $ at the end of a string + * + * Same for backquotes and heredocs, except the second case doesn't apply to + * heredocs. yyless(yyleng - 1) is used to correct taking one character too many + */ %} +{DOUBLE_QUOTES_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)["])) { + yyless(yyleng - 1); -"\\x"[0-9A-Fa-f]{1,2} { - Z_LVAL_P(zendlval) = strtol (yytext+2, NULL, 16); - return T_CHARACTER; + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x22 /*'"'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + } else { + zend_scan_binary_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; + } } -"\\u"[0-9A-Fa-f]{0,6} { - UChar32 codepoint; - int req_digits = (yytext[1] == 'U') ? 6 : 4; - +{BACKQUOTE_CHARS}+ { if (CG(literal_type) == IS_UNICODE) { - if (zend_digits_to_codepoint(yytext+2, yytext+yyleng, &codepoint, req_digits)) { - if (codepoint <= 0x10FFFF) { - Z_LVAL_P(zendlval) = (long) codepoint; - /* give back if we grabbed more than needed for \u case */ - if (yyleng > req_digits + 2) { - yyless(req_digits + 2); - } - return T_CHARACTER; - } else { - zend_error(E_COMPILE_WARNING,"\\U%06x is above the highest valid codepoint 0x10FFFF", codepoint); - return 0; - } - } else { - zend_error(E_COMPILE_WARNING,"\\%c escape sequence requires exactly %d hexadecimal digits", yytext[1], req_digits); - return 0; - } + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x60 /*'`'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); } else { - zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC); - return T_STRING; + zend_scan_binary_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } } +{BACKQUOTE_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[`])) { + yyless(yyleng - 1); -"\\C"("{"[A-Z0-9 -]+"}")? { - UChar32 codepoint; - - if (CG(literal_type) == IS_UNICODE && (yytext[1] == 'C')) { - /* minimum valid string is \C{.} */ - if (yyleng >= 5) { - /* safe, since we have } at the end */ - yytext[yyleng-1] = 0; - if (zend_uchar_from_name(yytext+3, &codepoint)) { - Z_LVAL_P(zendlval) = (long) codepoint; - return T_CHARACTER; - } else { - zend_error(E_COMPILE_WARNING, "Invalid Unicode character name: '%s'", yytext+3); - return 0; - } - } else { - zend_error(E_COMPILE_WARNING, "Invalid \\C{..} sequence"); - return 0; - } + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x60 /*'`'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); } else { - zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC); - return T_STRING; + zend_scan_binary_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } } -"\\{" { - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - return T_STRING; -} -"\\"{ANY_CHAR} { - switch (yytext[1]) { - case 'n': - Z_LVAL_P(zendlval) = (long) '\n'; - break; - case 't': - Z_LVAL_P(zendlval) = (long) '\t'; - break; - case 'r': - Z_LVAL_P(zendlval) = (long) '\r'; - break; - case '\\': - Z_LVAL_P(zendlval) = (long) '\\'; - break; - case '$': - Z_LVAL_P(zendlval) = (long) yytext[1]; - break; - default: - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - return T_BAD_CHARACTER; - break; +%{ +/* ({HEREDOC_NEWLINE}+({LABEL}";"?)?)? handles the possible case of newline + * sequences, possibly followed by a label, that couldn't be matched with + * HEREDOC_CHARS because of a following variable or "{$" + * + * This doesn't affect real ending labels, as they are followed by a newline, + * which will result in a longer match for the correct rule if present + */ %} +{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)? { + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + } else { + zend_scan_binary_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } - return T_CHARACTER; } +{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)?("{"{2,}|"$"{2,}) { + yyless(yyleng - 1); -["'`]+ { - if (!zend_copy_scanner_string(zendlval, yytext, yyleng, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; + if (CG(literal_type) == IS_UNICODE) { + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + } else { + zend_scan_binary_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } - return T_ENCAPSED_AND_WHITESPACE; } ["] { BEGIN(ST_IN_SCRIPTING); - return '\"'; + return '"'; } @@ -2511,10 +2600,6 @@ NEWLINE ("\r"|"\n"|"\r\n") } -<> { - return 0; -} - <> { zend_error(E_COMPILE_WARNING,"Unterminated comment starting line %d", CG(comment_start_line)); return 0; @@ -2522,6 +2607,6 @@ NEWLINE ("\r"|"\n"|"\r\n") -{ANY_CHAR} { +{ANY_CHAR} { zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE); } diff --git a/Zend/zend_vm_def.h b/Zend/zend_vm_def.h index 3e739f153c..9dd76f3a4b 100644 --- a/Zend/zend_vm_def.h +++ b/Zend/zend_vm_def.h @@ -1629,7 +1629,7 @@ ZEND_VM_HANDLER(53, ZEND_INIT_STRING, ANY, ANY) Z_STRVAL_P(tmp) = emalloc(1); Z_STRVAL_P(tmp)[0] = 0; Z_STRLEN_P(tmp) = 0; - Z_TYPE_P(tmp) = EX(opline)->extended_value; + Z_TYPE_P(tmp) = IS_STRING; } tmp->refcount = 1; tmp->is_ref = 0; @@ -1666,15 +1666,18 @@ ZEND_VM_HANDLER(56, ZEND_ADD_VAR, TMP, TMP|VAR|CV) zend_free_op free_op1, free_op2; zval *var = GET_OP2_ZVAL_PTR(BP_VAR_R); zval var_copy; - int use_copy; + int use_copy = 0; - if (opline->extended_value == IS_UNICODE) { - zend_make_unicode_zval(var, &var_copy, &use_copy); - } else { - zend_make_string_zval(var, &var_copy, &use_copy); - } - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != opline->extended_value) { + if (opline->extended_value == IS_UNICODE) { + zend_make_unicode_zval(var, &var_copy, &use_copy); + } else { + zend_make_string_zval(var, &var_copy, &use_copy); + } + + if (use_copy) { + var = &var_copy; + } } add_string_to_string(&EX_T(opline->result.u.var).tmp_var, GET_OP1_ZVAL_PTR(BP_VAR_NA), var); diff --git a/Zend/zend_vm_execute.h b/Zend/zend_vm_execute.h index f9798f60fe..3c75a34518 100644 --- a/Zend/zend_vm_execute.h +++ b/Zend/zend_vm_execute.h @@ -122,7 +122,7 @@ static int ZEND_INIT_STRING_SPEC_HANDLER(ZEND_OPCODE_HANDLER_ARGS) Z_STRVAL_P(tmp) = emalloc(1); Z_STRVAL_P(tmp)[0] = 0; Z_STRLEN_P(tmp) = 0; - Z_TYPE_P(tmp) = EX(opline)->extended_value; + Z_TYPE_P(tmp) = IS_STRING; } tmp->refcount = 1; tmp->is_ref = 0; @@ -5832,15 +5832,18 @@ static int ZEND_ADD_VAR_SPEC_TMP_TMP_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1, free_op2; zval *var = _get_zval_ptr_tmp(&opline->op2, EX(Ts), &free_op2 TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - if (opline->extended_value == IS_UNICODE) { - zend_make_unicode_zval(var, &var_copy, &use_copy); - } else { - zend_make_string_zval(var, &var_copy, &use_copy); - } - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != opline->extended_value) { + if (opline->extended_value == IS_UNICODE) { + zend_make_unicode_zval(var, &var_copy, &use_copy); + } else { + zend_make_string_zval(var, &var_copy, &use_copy); + } + + if (use_copy) { + var = &var_copy; + } } add_string_to_string(&EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), var); @@ -6280,15 +6283,18 @@ static int ZEND_ADD_VAR_SPEC_TMP_VAR_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1, free_op2; zval *var = _get_zval_ptr_var(&opline->op2, EX(Ts), &free_op2 TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - if (opline->extended_value == IS_UNICODE) { - zend_make_unicode_zval(var, &var_copy, &use_copy); - } else { - zend_make_string_zval(var, &var_copy, &use_copy); - } - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != opline->extended_value) { + if (opline->extended_value == IS_UNICODE) { + zend_make_unicode_zval(var, &var_copy, &use_copy); + } else { + zend_make_string_zval(var, &var_copy, &use_copy); + } + + if (use_copy) { + var = &var_copy; + } } add_string_to_string(&EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), var); @@ -6822,15 +6828,18 @@ static int ZEND_ADD_VAR_SPEC_TMP_CV_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1; zval *var = _get_zval_ptr_cv(&opline->op2, EX(Ts), BP_VAR_R TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - if (opline->extended_value == IS_UNICODE) { - zend_make_unicode_zval(var, &var_copy, &use_copy); - } else { - zend_make_string_zval(var, &var_copy, &use_copy); - } - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != opline->extended_value) { + if (opline->extended_value == IS_UNICODE) { + zend_make_unicode_zval(var, &var_copy, &use_copy); + } else { + zend_make_string_zval(var, &var_copy, &use_copy); + } + + if (use_copy) { + var = &var_copy; + } } add_string_to_string(&EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), var); diff --git a/ext/tokenizer/tests/001.phpt b/ext/tokenizer/tests/001.phpt index a8ab6bd523..7bac6b96e5 100644 --- a/ext/tokenizer/tests/001.phpt +++ b/ext/tokenizer/tests/001.phpt @@ -57,8 +57,6 @@ echo token_name(T_STRING_VARNAME), "\n"; echo token_name(T_VARIABLE), "\n"; echo token_name(T_NUM_STRING), "\n"; echo token_name(T_INLINE_HTML), "\n"; -echo token_name(T_CHARACTER), "\n"; -echo token_name(T_BAD_CHARACTER), "\n"; echo token_name(T_ENCAPSED_AND_WHITESPACE), "\n"; echo token_name(T_CONSTANT_ENCAPSED_STRING), "\n"; echo token_name(T_ECHO), "\n"; @@ -185,8 +183,6 @@ T_STRING_VARNAME T_VARIABLE T_NUM_STRING T_INLINE_HTML -T_CHARACTER -T_BAD_CHARACTER T_ENCAPSED_AND_WHITESPACE T_CONSTANT_ENCAPSED_STRING T_ECHO @@ -314,8 +310,6 @@ T_STRING_VARNAME T_VARIABLE T_NUM_STRING T_INLINE_HTML -T_CHARACTER -T_BAD_CHARACTER T_ENCAPSED_AND_WHITESPACE T_CONSTANT_ENCAPSED_STRING T_ECHO diff --git a/ext/tokenizer/tests/bug26463.phpt b/ext/tokenizer/tests/bug26463.phpt index c72d478403..de32769617 100644 --- a/ext/tokenizer/tests/bug26463.phpt +++ b/ext/tokenizer/tests/bug26463.phpt @@ -15,12 +15,12 @@ DDDD; ?>'; var_dump(token_get_all($str)); ?> ---EXPECT-- +--EXPECTF-- array(19) { [0]=> array(3) { [0]=> - int(370) + int(%d) [1]=> string(6) " array(3) { [0]=> - int(311) + int(%d) [1]=> string(2) "$x" [2]=> @@ -41,7 +41,7 @@ array(19) { [3]=> array(3) { [0]=> - int(374) + int(%d) [1]=> string(6) "<<
array(3) { [0]=> - int(309) + int(%d) [1]=> string(13) "jhdsjkfhjdsh " @@ -61,7 +61,7 @@ array(19) { [5]=> array(3) { [0]=> - int(375) + int(%d) [1]=> string(2) "DD" [2]=> @@ -70,7 +70,7 @@ array(19) { [6]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -82,7 +82,7 @@ array(19) { [8]=> array(3) { [0]=> - int(317) + int(%d) [1]=> string(2) """" [2]=> @@ -93,7 +93,7 @@ array(19) { [10]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -103,7 +103,7 @@ array(19) { [11]=> array(3) { [0]=> - int(311) + int(%d) [1]=> string(2) "$a" [2]=> @@ -114,7 +114,7 @@ array(19) { [13]=> array(3) { [0]=> - int(374) + int(%d) [1]=> string(8) "<< array(3) { [0]=> - int(309) + int(%d) [1]=> string(13) "jhdsjkfhjdsh " @@ -134,7 +134,7 @@ array(19) { [15]=> array(3) { [0]=> - int(375) + int(%d) [1]=> string(4) "DDDD" [2]=> @@ -145,7 +145,7 @@ array(19) { [17]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -155,7 +155,7 @@ array(19) { [18]=> array(3) { [0]=> - int(372) + int(%d) [1]=> string(2) "?>" [2]=> @@ -167,7 +167,7 @@ array(19) { [0]=> array(3) { [0]=> - int(370) + int(%d) [1]=> string(6) " array(3) { [0]=> - int(311) + int(%d) [1]=> string(2) "$x" [2]=> @@ -188,7 +188,7 @@ array(19) { [3]=> array(3) { [0]=> - int(374) + int(%d) [1]=> string(6) "<<
array(3) { [0]=> - int(309) + int(%d) [1]=> string(13) "jhdsjkfhjdsh " @@ -208,7 +208,7 @@ array(19) { [5]=> array(3) { [0]=> - int(375) + int(%d) [1]=> string(2) "DD" [2]=> @@ -217,7 +217,7 @@ array(19) { [6]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -229,7 +229,7 @@ array(19) { [8]=> array(3) { [0]=> - int(317) + int(%d) [1]=> string(2) """" [2]=> @@ -240,7 +240,7 @@ array(19) { [10]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -250,7 +250,7 @@ array(19) { [11]=> array(3) { [0]=> - int(311) + int(%d) [1]=> string(2) "$a" [2]=> @@ -261,7 +261,7 @@ array(19) { [13]=> array(3) { [0]=> - int(374) + int(%d) [1]=> string(8) "<< array(3) { [0]=> - int(309) + int(%d) [1]=> string(13) "jhdsjkfhjdsh " @@ -281,7 +281,7 @@ array(19) { [15]=> array(3) { [0]=> - int(375) + int(%d) [1]=> string(4) "DDDD" [2]=> @@ -292,7 +292,7 @@ array(19) { [17]=> array(3) { [0]=> - int(373) + int(%d) [1]=> string(1) " " @@ -302,7 +302,7 @@ array(19) { [18]=> array(3) { [0]=> - int(372) + int(%d) [1]=> string(2) "?>" [2]=> diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index 3727ec0a49..bd60a970fe 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -282,12 +282,15 @@ static void tokenize(zval *return_value TSRMLS_DC) while ((token_type = lex_scan(&token TSRMLS_CC))) { destroy = 1; switch (token_type) { + case T_CLOSE_TAG: + if (zendtext[zendleng - 1] != '>') { + CG(zend_lineno)++; + } case T_OPEN_TAG: case T_OPEN_TAG_WITH_ECHO: case T_WHITESPACE: case T_COMMENT: case T_DOC_COMMENT: - case T_CLOSE_TAG: destroy = 0; break; } @@ -297,6 +300,10 @@ static void tokenize(zval *return_value TSRMLS_DC) array_init(keyword); add_next_index_long(keyword, token_type); if (token_type == T_END_HEREDOC) { + if (CG(increment_lineno)) { + token_line = ++CG(zend_lineno); + CG(increment_lineno) = 0; + } add_next_index_stringl(keyword, Z_STRVAL(token), Z_STRLEN(token), 1); efree(Z_STRVAL(token)); } else { @@ -372,8 +379,6 @@ get_token_type_name(int token_type) case T_VARIABLE: return "T_VARIABLE"; case T_NUM_STRING: return "T_NUM_STRING"; case T_INLINE_HTML: return "T_INLINE_HTML"; - case T_CHARACTER: return "T_CHARACTER"; - case T_BAD_CHARACTER: return "T_BAD_CHARACTER"; case T_ENCAPSED_AND_WHITESPACE: return "T_ENCAPSED_AND_WHITESPACE"; case T_CONSTANT_ENCAPSED_STRING: return "T_CONSTANT_ENCAPSED_STRING"; case T_ECHO: return "T_ECHO";