From 80d2409fd8fe382ea14d6b45c528a452fa796905 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Fri, 18 May 2007 13:12:05 +0000 Subject: [PATCH] Improved compilation of heredocs and interpolated strings. (Matt, Dmitry) --- NEWS | 1 + Zend/zend_compile.c | 54 ++-- Zend/zend_compile.h | 3 - Zend/zend_language_parser.y | 16 +- Zend/zend_language_scanner.l | 551 ++++++++++++++++++++--------------- Zend/zend_vm_def.h | 11 +- Zend/zend_vm_execute.h | 33 ++- ext/tokenizer/tests/001.phpt | 4 - ext/tokenizer/tokenizer.c | 13 +- 9 files changed, 378 insertions(+), 308 deletions(-) diff --git a/NEWS b/NEWS index 24bc28fb6b..a2707fd1b6 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,7 @@ PHP NEWS ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ?? ??? 2007, PHP 5.2.3 +- Improved compilation of heredocs and interpolated strings. (Matt, Dmitry) - Optimized out a couple of per-request syscalls (Rasmus) - Upgraded SQLite 3 to version 3.3.16 (Ilia) - Added PDO::FETCH_KEY_PAIR mode that will fetch a 2 column result set into diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index a505eefae1..3fc7a6dac5 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -941,24 +941,25 @@ void zend_do_init_string(znode *result TSRMLS_DC) } -void zend_do_add_char(znode *result, znode *op1, znode *op2 TSRMLS_DC) -{ - zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); - - opline->opcode = ZEND_ADD_CHAR; - opline->op1 = *op1; - opline->op2 = *op2; - opline->op2.op_type = IS_CONST; - opline->result = opline->op1; - *result = opline->result; -} - - void zend_do_add_string(znode *result, znode *op1, znode *op2 TSRMLS_DC) { - zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); + zend_op *opline; + + if (Z_STRLEN(op2->u.constant) > 1) { + opline = get_next_op(CG(active_op_array) TSRMLS_CC); + opline->opcode = ZEND_ADD_STRING; + } else if (Z_STRLEN(op2->u.constant) == 1) { + int ch = *Z_STRVAL(op2->u.constant); - opline->opcode = ZEND_ADD_STRING; + /* Free memory and use ZEND_ADD_CHAR in case of 1 character strings */ + efree(Z_STRVAL(op2->u.constant)); + ZVAL_LONG(&op2->u.constant, ch); + opline = get_next_op(CG(active_op_array) TSRMLS_CC); + opline->opcode = ZEND_ADD_CHAR; + } else { /* String can be empty after a variable at the end of a heredoc */ + efree(Z_STRVAL(op2->u.constant)); + return; + } opline->op1 = *op1; opline->op2 = *op2; opline->op2.op_type = IS_CONST; @@ -3930,24 +3931,6 @@ void zend_do_declare_end(znode *declare_token TSRMLS_DC) } -void zend_do_end_heredoc(TSRMLS_D) -{ - int opline_num = get_next_op_number(CG(active_op_array))-1; - zend_op *opline = &CG(active_op_array)->opcodes[opline_num]; - - if (opline->opcode != ZEND_ADD_STRING) { - return; - } - - opline->op2.u.constant.value.str.val[(opline->op2.u.constant.value.str.len--)-1] = 0; - if (opline->op2.u.constant.value.str.len>0) { - if (opline->op2.u.constant.value.str.val[opline->op2.u.constant.value.str.len-1]=='\r') { - opline->op2.u.constant.value.str.val[(opline->op2.u.constant.value.str.len--)-1] = 0; - } - } -} - - void zend_do_exit(znode *result, znode *message TSRMLS_DC) { zend_op *opline = get_next_op(CG(active_op_array) TSRMLS_CC); @@ -4136,12 +4119,12 @@ int zendlex(znode *zendlval TSRMLS_DC) { int retval; -again: if (CG(increment_lineno)) { CG(zend_lineno)++; CG(increment_lineno) = 0; } +again: Z_TYPE(zendlval->u.constant) = IS_LONG; retval = lex_scan(&zendlval->u.constant TSRMLS_CC); switch (retval) { @@ -4152,8 +4135,7 @@ again: goto again; case T_CLOSE_TAG: - if (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1]=='\n' - || (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-2]=='\r' && LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1])) { + if (LANG_SCNG(yy_text)[LANG_SCNG(yy_leng)-1] != '>') { CG(increment_lineno) = 1; } retval = ';'; /* implicit ; */ diff --git a/Zend/zend_compile.h b/Zend/zend_compile.h index cd177bb0d5..0687698144 100644 --- a/Zend/zend_compile.h +++ b/Zend/zend_compile.h @@ -392,7 +392,6 @@ void zend_check_writable_variable(znode *variable); void zend_do_free(znode *op1 TSRMLS_DC); void zend_do_init_string(znode *result TSRMLS_DC); -void zend_do_add_char(znode *result, znode *op1, znode *op2 TSRMLS_DC); void zend_do_add_string(znode *result, znode *op1, znode *op2 TSRMLS_DC); void zend_do_add_variable(znode *result, znode *op1, znode *op2 TSRMLS_DC); @@ -488,8 +487,6 @@ void zend_do_declare_begin(TSRMLS_D); void zend_do_declare_stmt(znode *var, znode *val TSRMLS_DC); void zend_do_declare_end(znode *declare_token TSRMLS_DC); -void zend_do_end_heredoc(TSRMLS_D); - void zend_do_exit(znode *result, znode *message TSRMLS_DC); void zend_do_begin_silence(znode *strudel_token TSRMLS_DC); diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index 5d401e4c8e..c81120eddb 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -24,8 +24,6 @@ * LALR shift/reduce conflicts and how they are resolved: * * - 2 shift/reduce conflicts due to the dangeling elseif/else ambiguity. Solved by shift. - * - 1 shift/reduce conflict due to arrays within encapsulated strings. Solved by shift. - * - 1 shift/reduce conflict due to objects within encapsulated strings. Solved by shift. * */ @@ -49,7 +47,7 @@ %} %pure_parser -%expect 4 +%expect 2 %left T_INCLUDE T_INCLUDE_ONCE T_EVAL T_REQUIRE T_REQUIRE_ONCE %left ',' @@ -709,8 +707,7 @@ scalar: | class_constant { $$ = $1; } | common_scalar { $$ = $1; } | '"' encaps_list '"' { $$ = $2; } - | '\'' encaps_list '\'' { $$ = $2; } - | T_START_HEREDOC encaps_list T_END_HEREDOC { $$ = $2; zend_do_end_heredoc(TSRMLS_C); } + | T_START_HEREDOC encaps_list T_END_HEREDOC { $$ = $2; } ; @@ -869,16 +866,7 @@ non_empty_array_pair_list: encaps_list: encaps_list encaps_var { zend_do_end_variable_parse(BP_VAR_R, 0 TSRMLS_CC); zend_do_add_variable(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_STRING { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_NUM_STRING { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } | encaps_list T_ENCAPSED_AND_WHITESPACE { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_CHARACTER { zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_BAD_CHARACTER { zend_do_add_string(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '[' { Z_LVAL($2.u.constant) = (long) '['; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list ']' { Z_LVAL($2.u.constant) = (long) ']'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '{' { Z_LVAL($2.u.constant) = (long) '{'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list '}' { Z_LVAL($2.u.constant) = (long) '}'; zend_do_add_char(&$$, &$1, &$2 TSRMLS_CC); } - | encaps_list T_OBJECT_OPERATOR { znode tmp; Z_LVAL($2.u.constant) = (long) '-'; zend_do_add_char(&tmp, &$1, &$2 TSRMLS_CC); Z_LVAL($2.u.constant) = (long) '>'; zend_do_add_char(&$$, &tmp, &$2 TSRMLS_CC); } | /* empty */ { zend_do_init_string(&$$ TSRMLS_CC); } ; diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index ca0912323f..351d3b137f 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -35,11 +35,13 @@ %x ST_IN_SCRIPTING %x ST_DOUBLE_QUOTES -%x ST_SINGLE_QUOTE %x ST_BACKQUOTE %x ST_HEREDOC +%x ST_START_HEREDOC +%x ST_END_HEREDOC %x ST_LOOKING_FOR_PROPERTY %x ST_LOOKING_FOR_VARNAME +%x ST_VAR_OFFSET %x ST_COMMENT %x ST_DOC_COMMENT %x ST_ONE_LINE_COMMENT @@ -96,9 +98,7 @@ do { \ char *p = (s), *boundary = p+(l); \ \ while (pvalue.str.val = (char *)estrndup(yytext, yyleng); \ zendlval->value.str.len = yyleng; #endif /* ZEND_MULTIBYTE */ + +static void zend_scan_escape_string(zval *zendlval, char *str, int len, char quote_type TSRMLS_DC) +{ + register char *s, *t; + char *end; + + ZVAL_STRINGL(zendlval, str, len, 1); + + /* convert escape sequences */ + s = t = zendlval->value.str.val; + end = s+zendlval->value.str.len; + while (s=end) { + continue; + } + switch(*s) { + case 'n': + *t++ = '\n'; + zendlval->value.str.len--; + break; + case 'r': + *t++ = '\r'; + zendlval->value.str.len--; + break; + case 't': + *t++ = '\t'; + zendlval->value.str.len--; + break; + case '"': + case '`': + if (*s != quote_type) { + *t++ = '\\'; + *t++ = *s; + break; + } + case '\\': + case '$': + *t++ = *s; + zendlval->value.str.len--; + break; + case 'x': + case 'X': + if (ZEND_IS_HEX(*(s+1))) { + char hex_buf[3] = { 0, 0, 0 }; + + zendlval->value.str.len--; /* for the 'x' */ + + hex_buf[0] = *(++s); + zendlval->value.str.len--; + if (ZEND_IS_HEX(*(s+1))) { + hex_buf[1] = *(++s); + zendlval->value.str.len--; + } + *t++ = (char) strtol(hex_buf, NULL, 16); + } else { + *t++ = '\\'; + *t++ = *s; + } + break; + default: + /* check for an octal */ + if (ZEND_IS_OCT(*s)) { + char octal_buf[4] = { 0, 0, 0, 0 }; + + octal_buf[0] = *s; + zendlval->value.str.len--; + if (ZEND_IS_OCT(*(s+1))) { + octal_buf[1] = *(++s); + zendlval->value.str.len--; + if (ZEND_IS_OCT(*(s+1))) { + octal_buf[2] = *(++s); + zendlval->value.str.len--; + } + } + *t++ = (char) strtol(octal_buf, NULL, 8); + } else { + *t++ = '\\'; + *t++ = *s; + } + break; + } + } else { + *t++ = *s; + } + + if (*s == '\n' || (*s == '\r' && (*(s+1) != '\n'))) { + CG(zend_lineno)++; + } + s++; + } + *t = 0; + +#ifdef ZEND_MULTIBYTE + if (SCNG(output_filter)) { + s = zendlval->value.str.val; + SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC); + efree(s); + } +#endif /* ZEND_MULTIBYTE */ +} + %} LNUM [0-9]+ @@ -799,11 +902,61 @@ LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]* WHITESPACE [ \n\r\t]+ TABS_AND_SPACES [ \t]* TOKENS [;:,.\[\]()|^&+-/*=%!~$<>?@] -ENCAPSED_TOKENS [\[\]{}$] -ESCAPED_AND_WHITESPACE [\n\t\r #'.:;,()|^&+-/*=%!~<>?@]+ ANY_CHAR (.|[\n]) NEWLINE ("\r"|"\n"|"\r\n") +/* + * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character + * or a { and therefore will be taken literally. The case of literal $ before + * a variable or "${" is handled in a rule for each string type + */ +DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$"\\{]|("\\"{ANY_CHAR}))) +BACKQUOTE_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR}))) +HEREDOC_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$\n\r\\{]|("\\"[^\n\r]))) + +/* + * Usually, HEREDOC_NEWLINE will just function like a simple NEWLINE, but some + * special cases need to be handled. HEREDOC_CHARS doesn't allow a line to + * match when { or $, and/or \ is at the end. (("{"*|"$"*)"\\"?) handles that, + * along with cases where { or $, and/or \ is the ONLY thing on a line + * + * The other case is when a line contains a label, followed by ONLY + * { or $, and/or \ Handled by ({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\")) + */ +HEREDOC_NEWLINE ((({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))|(("{"*|"$"*)"\\"?)){NEWLINE}) + +/* + * This pattern is just used in the next 2 for matching { or literal $, and/or + * \ escape sequence immediately at the beginning of a line or after a label + */ +HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR (("{"+[^$\n\r\\{])|("{"*"\\"[^\n\r])|{HEREDOC_LITERAL_DOLLAR}) + +/* + * These 2 label-related patterns allow HEREDOC_CHARS to continue "regular" + * matching after a newline that starts with either a non-label character or a + * label that isn't followed by a newline. Like HEREDOC_CHARS, they won't match + * a variable or "{$" Matching a newline, and possibly label, up TO a variable + * or "{$", is handled in the heredoc rules + * + * The HEREDOC_LABEL_NO_NEWLINE pattern (";"[^$\n\r\\{]) handles cases where ; + * follows a label. [^a-zA-Z0-9_\x7f-\xff;$\n\r\\{] is needed to prevent a label + * character or ; from matching on a possible (real) ending label + */ +HEREDOC_NON_LABEL ([^a-zA-Z_\x7f-\xff$\n\r\\{]|{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}) +HEREDOC_LABEL_NO_NEWLINE ({LABEL}([^a-zA-Z0-9_\x7f-\xff;$\n\r\\{]|(";"[^$\n\r\\{])|(";"?{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}))) + +/* + * CHARS matches everything up to a variable or "{$" + * {'s are matched as long as they aren't followed by a $ + * The case of { before "{$" is handled in a rule for each string type + * + * For heredocs, matching continues across/after newlines if/when it's known + * that the next line doesn't contain a possible ending label + */ +DOUBLE_QUOTES_CHARS ("{"*([^$"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR}) +BACKQUOTE_CHARS ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR}) +HEREDOC_CHARS ("{"*([^$\n\r\\{]|("\\"[^\n\r]))|{HEREDOC_LITERAL_DOLLAR}|({HEREDOC_NEWLINE}+({HEREDOC_NON_LABEL}|{HEREDOC_LABEL_NO_NEWLINE}))) + %option noyylineno %option noyywrap %% @@ -948,11 +1101,15 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_IMPLEMENTS; } -"->" { +"->" { yy_push_state(ST_LOOKING_FOR_PROPERTY TSRMLS_CC); return T_OBJECT_OPERATOR; } +"->" { + return T_OBJECT_OPERATOR; +} + {LABEL} { yy_pop_state(TSRMLS_C); zend_copy_value(zendlval, yytext, yyleng); @@ -1282,7 +1439,19 @@ NEWLINE ("\r"|"\n"|"\r\n") } } -{LNUM}|{HNUM} { /* treat numbers (almost) as strings inside encapsulated strings */ +0|([1-9][0-9]*) { /* Offset could be treated as a long */ + if (yyleng < MAX_LENGTH_OF_LONG - 1 || (yyleng == MAX_LENGTH_OF_LONG - 1 && strcmp(yytext, long_min_digits) < 0)) { + zendlval->value.lval = strtol(yytext, NULL, 10); + zendlval->type = IS_LONG; + } else { + zendlval->value.str.val = (char *)estrndup(yytext, yyleng); + zendlval->value.str.len = yyleng; + zendlval->type = IS_STRING; + } + return T_NUM_STRING; +} + +{LNUM}|{HNUM} { /* Offset must be treated as a string */ zendlval->value.str.val = (char *)estrndup(yytext, yyleng); zendlval->value.str.len = yyleng; zendlval->type = IS_STRING; @@ -1445,19 +1614,51 @@ NEWLINE ("\r"|"\n"|"\r\n") return T_OPEN_TAG; } -"$"{LABEL} { +"$"{LABEL} { zend_copy_value(zendlval, (yytext+1), (yyleng-1)); zendlval->type = IS_STRING; return T_VARIABLE; } -{LABEL} { - zend_copy_value(zendlval, yytext, yyleng); +%{ +/* Make sure a label character follows "->", otherwise there is no property + * and "->" will be taken literally + */ %} +"$"{LABEL}"->"[a-zA-Z_\x7f-\xff] { + yyless(yyleng - 3); + yy_push_state(ST_LOOKING_FOR_PROPERTY TSRMLS_CC); + zend_copy_value(zendlval, (yytext+1), (yyleng-1)); zendlval->type = IS_STRING; - return T_STRING; + return T_VARIABLE; +} + +%{ +/* A [ always designates a variable offset, regardless of what follows + */ %} +"$"{LABEL}"[" { + yyless(yyleng - 1); + yy_push_state(ST_VAR_OFFSET TSRMLS_CC); + zend_copy_value(zendlval, (yytext+1), (yyleng-1)); + zendlval->type = IS_STRING; + return T_VARIABLE; +} + +"]" { + yy_pop_state(TSRMLS_C); + return ']'; +} + +{TOKENS}|[{}] { + /* Only '[' can be valid, but returning other tokens will allow a more explicit parse error */ + return yytext[0]; } -{LABEL} { +[ \n\r\t'"`\\#] { + yyless(0); + yy_pop_state(TSRMLS_C); +} + +{LABEL} { zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_STRING; @@ -1581,106 +1782,25 @@ NEWLINE ("\r"|"\n"|"\r\n") } -("b"?["]([^$"\\]|("\\".))*["]) { - register char *s, *t; - char *end; - int bprefix = (*yytext == 'b') ? 1 : 0; - - zendlval->value.str.val = estrndup(yytext+bprefix+1, yyleng-bprefix-2); - zendlval->value.str.len = yyleng-bprefix-2; - zendlval->type = IS_STRING; - HANDLE_NEWLINES(yytext, yyleng); - - /* convert escape sequences */ - s = t = zendlval->value.str.val; - end = s+zendlval->value.str.len; - while (s=end) { - continue; - } - switch(*s) { - case 'n': - *t++ = '\n'; - zendlval->value.str.len--; - break; - case 'r': - *t++ = '\r'; - zendlval->value.str.len--; - break; - case 't': - *t++ = '\t'; - zendlval->value.str.len--; - break; - case '\\': - case '$': - case '"': - *t++ = *s; - zendlval->value.str.len--; - break; - default: - /* check for an octal */ - if (ZEND_IS_OCT(*s)) { - char octal_buf[4] = { 0, 0, 0, 0 }; - - octal_buf[0] = *s; - zendlval->value.str.len--; - if ((s+1)value.str.len--; - if ((s+1)value.str.len--; - } - } - *t++ = (char) strtol(octal_buf, NULL, 8); - } else if (*s=='x' && (s+1)value.str.len--; /* for the 'x' */ - - hex_buf[0] = *(++s); - zendlval->value.str.len--; - if ((s+1)value.str.len--; - } - *t++ = (char) strtol(hex_buf, NULL, 16); - } else { - *t++ = '\\'; - *t++ = *s; - } - break; - } - s++; - } else { - *t++ = *s++; - } - } - *t = 0; - -#ifdef ZEND_MULTIBYTE - if (SCNG(output_filter)) { - s = zendlval->value.str.val; - SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC); - efree(s); - } -#endif /* ZEND_MULTIBYTE */ +%{ +/* ("{"*|"$"*) handles { or $ at the end of a string (or the entire contents) + */ %} +(b?["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) { + int bprefix = (yytext[0] != '"') ? 1 : 0; + zend_scan_escape_string(zendlval, yytext+bprefix+1, yyleng-bprefix-2, '"' TSRMLS_CC); return T_CONSTANT_ENCAPSED_STRING; } -("b"?[']([^'\\]|("\\".))*[']) { +(b?[']([^'\\]|("\\"{ANY_CHAR}))*[']) { register char *s, *t; char *end; - int bprefix = (*yytext == 'b') ? 1 : 0; + int bprefix = (yytext[0] != '\'') ? 1 : 0; zendlval->value.str.val = estrndup(yytext+bprefix+1, yyleng-bprefix-2); zendlval->value.str.len = yyleng-bprefix-2; zendlval->type = IS_STRING; - HANDLE_NEWLINES(yytext, yyleng); /* convert escape sequences */ s = t = zendlval->value.str.val; @@ -1702,10 +1822,14 @@ NEWLINE ("\r"|"\n"|"\r\n") *t++ = *s; break; } - s++; } else { - *t++ = *s++; + *t++ = *s; + } + + if (*s == '\n' || (*s == '\r' && (*(s+1) != '\n'))) { + CG(zend_lineno)++; } + s++; } *t = 0; @@ -1723,13 +1847,13 @@ NEWLINE ("\r"|"\n"|"\r\n") b?["] { BEGIN(ST_DOUBLE_QUOTES); - return '\"'; + return '"'; } -"b"?"<<<"{TABS_AND_SPACES}{LABEL}{NEWLINE} { +b?"<<<"{TABS_AND_SPACES}{LABEL}{NEWLINE} { char *s; - int bprefix = (*yytext == 'b') ? 1 : 0; + int bprefix = (yytext[0] != '<') ? 1 : 0; CG(zend_lineno)++; CG(heredoc_len) = yyleng-bprefix-3-1-(yytext[yyleng-2]=='\r'?1:0); @@ -1739,7 +1863,7 @@ NEWLINE ("\r"|"\n"|"\r\n") CG(heredoc_len)--; } CG(heredoc) = estrndup(s, CG(heredoc_len)); - BEGIN(ST_HEREDOC); + BEGIN(ST_START_HEREDOC); return T_START_HEREDOC; } @@ -1750,171 +1874,148 @@ NEWLINE ("\r"|"\n"|"\r\n") } -b?['] { - BEGIN(ST_SINGLE_QUOTE); - return '\''; +{ANY_CHAR} { + yyless(0); + BEGIN(ST_HEREDOC); } - -^{LABEL}(";")?{NEWLINE} { - int label_len; - - if (yytext[yyleng-2]=='\r') { - label_len = yyleng-2; - } else { - label_len = yyleng-1; - } +{LABEL}";"?[\n\r] { + int label_len = yyleng - 1; if (yytext[label_len-1]==';') { label_len--; } + yyless(label_len); + if (label_len==CG(heredoc_len) && !memcmp(yytext, CG(heredoc), label_len)) { - zendlval->value.str.val = estrndup(yytext, label_len); /* unput destroys yytext */ + zendlval->value.str.val = CG(heredoc); zendlval->value.str.len = label_len; - yyless(yyleng - (yyleng - label_len)); - efree(CG(heredoc)); CG(heredoc)=NULL; CG(heredoc_len)=0; BEGIN(ST_IN_SCRIPTING); return T_END_HEREDOC; } else { - CG(zend_lineno)++; - zend_copy_value(zendlval, yytext, yyleng); - zendlval->type = IS_STRING; - return T_STRING; + yymore(); + BEGIN(ST_HEREDOC); } } +%{ +/* Match everything up to and including a possible ending label, so if the label + * doesn't match, it's kept with the rest of the string + * + * {HEREDOC_NEWLINE}+ handles the case of more than one newline sequence that + * couldn't be matched with HEREDOC_CHARS, because of the following label + */ %} +{HEREDOC_CHARS}*{HEREDOC_NEWLINE}+{LABEL}";"?[\n\r] { + char *end = yytext + yyleng - 1; -{ESCAPED_AND_WHITESPACE} { - HANDLE_NEWLINES(yytext, yyleng); - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; - zendlval->type = IS_STRING; - return T_ENCAPSED_AND_WHITESPACE; -} - -([^'\\]|\\[^'\\])+ { - HANDLE_NEWLINES(yytext, yyleng); - zend_copy_value(zendlval, yytext, yyleng); - zendlval->type = IS_STRING; - return T_ENCAPSED_AND_WHITESPACE; -} - + if (end[-1] == ';') { + end--; + yyleng--; + } -[`]+ { - zend_copy_value(zendlval, yytext, yyleng); - zendlval->type = IS_STRING; - return T_ENCAPSED_AND_WHITESPACE; -} + if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) { + int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */ + if (len > 0 && yytext[len - 1] == '\r' && yytext[len] == '\n') { + len--; + } -["]+ { - zend_copy_value(zendlval, yytext, yyleng); - zendlval->type = IS_STRING; - return T_ENCAPSED_AND_WHITESPACE; -} + /* Go back before last label char, to match in ST_END_HEREDOC state */ + yyless(yyleng - 2); + /* Subtract the remaining label length. yyleng must include newline + * before label, for zend_highlight/strip, tokenizer, etc. */ + yyleng -= CG(heredoc_len) - 1; -"$"[^a-zA-Z_\x7f-\xff{] { - zendlval->value.lval = (long) yytext[0]; - if (yyleng == 2) { - yyless(1); + CG(increment_lineno) = 1; /* For newline before label */ + BEGIN(ST_END_HEREDOC); + zend_scan_escape_string(zendlval, yytext, len, 0 TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; + } else { + /* Go back to end of label, so there's something to match again in case + * there's a variable at the beginning of the next line */ + yyless(yyleng - 1); + yymore(); } - return T_CHARACTER; } - -{ENCAPSED_TOKENS} { - zendlval->value.lval = (long) yytext[0]; - return yytext[0]; +{ANY_CHAR} { + zendlval->value.str.val = CG(heredoc); + zendlval->value.str.len = CG(heredoc_len); + yytext = zendlval->value.str.val; + yyleng = zendlval->value.str.len; + CG(heredoc) = NULL; + CG(heredoc_len) = 0; + BEGIN(ST_IN_SCRIPTING); + return T_END_HEREDOC; } -"\\{" { - zendlval->value.str.val = estrndup("\\{", sizeof("\\{") - 1); - zendlval->value.str.len = sizeof("\\{") - 1; - zendlval->type = IS_STRING; - return T_STRING; -} "{$" { - zendlval->value.lval = (long) yytext[0]; + zendlval->value.lval = (long) '{'; yy_push_state(ST_IN_SCRIPTING TSRMLS_CC); yyless(1); return T_CURLY_OPEN; } -"\\'" { - zendlval->value.lval = (long) '\''; - return T_CHARACTER; +{DOUBLE_QUOTES_CHARS}+ { + zend_scan_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } -"\\\\" { - zendlval->value.lval = (long)'\\'; - return T_CHARACTER; +%{ +/* "{"{2,}|"$"{2,} handles { before "{$" or literal $ before a variable or "${" + * (("{"+|"$"+)["]) handles { or $ at the end of a string + * + * Same for backquotes and heredocs, except the second case doesn't apply to + * heredocs. yyless(yyleng - 1) is used to correct taking one character too many + */ %} +{DOUBLE_QUOTES_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)["])) { + yyless(yyleng - 1); + zend_scan_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } -"\\\"" { - zendlval->value.lval = (long) '"'; - return T_CHARACTER; -} -"\\`" { - zendlval->value.lval = (long) '`'; - return T_CHARACTER; +{BACKQUOTE_CHARS}+ { + zend_scan_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } -"\\"[0-7]{1,3} { - zendlval->value.lval = strtol(yytext+1, NULL, 8); - return T_CHARACTER; +{BACKQUOTE_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[`])) { + yyless(yyleng - 1); + zend_scan_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } -"\\x"[0-9A-Fa-f]{1,2} { - zendlval->value.lval = strtol (yytext+2, NULL, 16); - return T_CHARACTER; -} -"\\"{ANY_CHAR} { - switch (yytext[1]) { - case 'n': - zendlval->value.lval = (long) '\n'; - break; - case 't': - zendlval->value.lval = (long) '\t'; - break; - case 'r': - zendlval->value.lval = (long) '\r'; - break; - case '\\': - zendlval->value.lval = (long) '\\'; - break; - case '$': - zendlval->value.lval = (long) yytext[1]; - break; - default: - zendlval->value.str.val = estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; - zendlval->type = IS_STRING; - return T_BAD_CHARACTER; - break; - } - return T_CHARACTER; +%{ +/* ({HEREDOC_NEWLINE}+({LABEL}";"?)?)? handles the possible case of newline + * sequences, possibly followed by a label, that couldn't be matched with + * HEREDOC_CHARS because of a following variable or "{$" + * + * This doesn't affect real ending labels, as they are followed by a newline, + * which will result in a longer match for the correct rule if present + */ %} +{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)? { + zend_scan_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); + return T_ENCAPSED_AND_WHITESPACE; } - -["'`]+ { - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; - zendlval->type = IS_STRING; +{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)?("{"{2,}|"$"{2,}) { + yyless(yyleng - 1); + zend_scan_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); return T_ENCAPSED_AND_WHITESPACE; } ["] { BEGIN(ST_IN_SCRIPTING); - return '\"'; + return '"'; } @@ -1924,16 +2025,6 @@ NEWLINE ("\r"|"\n"|"\r\n") } -['] { - BEGIN(ST_IN_SCRIPTING); - return '\''; -} - - -<> { - return 0; -} - <> { zend_error(E_COMPILE_WARNING,"Unterminated comment starting line %d", CG(comment_start_line)); return 0; @@ -1941,6 +2032,6 @@ NEWLINE ("\r"|"\n"|"\r\n") -{ANY_CHAR} { +{ANY_CHAR} { zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE); } diff --git a/Zend/zend_vm_def.h b/Zend/zend_vm_def.h index 919d2cf003..7ca9377fc1 100644 --- a/Zend/zend_vm_def.h +++ b/Zend/zend_vm_def.h @@ -1619,11 +1619,14 @@ ZEND_VM_HANDLER(56, ZEND_ADD_VAR, TMP, TMP|VAR|CV) zend_free_op free_op1, free_op2; zval *var = GET_OP2_ZVAL_PTR(BP_VAR_R); zval var_copy; - int use_copy; + int use_copy = 0; - zend_make_printable_zval(var, &var_copy, &use_copy); - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != IS_STRING) { + zend_make_printable_zval(var, &var_copy, &use_copy); + + if (use_copy) { + var = &var_copy; + } } add_string_to_string( &EX_T(opline->result.u.var).tmp_var, GET_OP1_ZVAL_PTR(BP_VAR_NA), diff --git a/Zend/zend_vm_execute.h b/Zend/zend_vm_execute.h index 5cc340cea6..0307dae993 100644 --- a/Zend/zend_vm_execute.h +++ b/Zend/zend_vm_execute.h @@ -5660,11 +5660,14 @@ static int ZEND_ADD_VAR_SPEC_TMP_TMP_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1, free_op2; zval *var = _get_zval_ptr_tmp(&opline->op2, EX(Ts), &free_op2 TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - zend_make_printable_zval(var, &var_copy, &use_copy); - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != IS_STRING) { + zend_make_printable_zval(var, &var_copy, &use_copy); + + if (use_copy) { + var = &var_copy; + } } add_string_to_string( &EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), @@ -6102,11 +6105,14 @@ static int ZEND_ADD_VAR_SPEC_TMP_VAR_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1, free_op2; zval *var = _get_zval_ptr_var(&opline->op2, EX(Ts), &free_op2 TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - zend_make_printable_zval(var, &var_copy, &use_copy); - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != IS_STRING) { + zend_make_printable_zval(var, &var_copy, &use_copy); + + if (use_copy) { + var = &var_copy; + } } add_string_to_string( &EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), @@ -6637,11 +6643,14 @@ static int ZEND_ADD_VAR_SPEC_TMP_CV_HANDLER(ZEND_OPCODE_HANDLER_ARGS) zend_free_op free_op1; zval *var = _get_zval_ptr_cv(&opline->op2, EX(Ts), BP_VAR_R TSRMLS_CC); zval var_copy; - int use_copy; + int use_copy = 0; - zend_make_printable_zval(var, &var_copy, &use_copy); - if (use_copy) { - var = &var_copy; + if (Z_TYPE_P(var) != IS_STRING) { + zend_make_printable_zval(var, &var_copy, &use_copy); + + if (use_copy) { + var = &var_copy; + } } add_string_to_string( &EX_T(opline->result.u.var).tmp_var, _get_zval_ptr_tmp(&opline->op1, EX(Ts), &free_op1 TSRMLS_CC), diff --git a/ext/tokenizer/tests/001.phpt b/ext/tokenizer/tests/001.phpt index 36ed2696fa..203e3c7ddb 100644 --- a/ext/tokenizer/tests/001.phpt +++ b/ext/tokenizer/tests/001.phpt @@ -57,8 +57,6 @@ echo token_name(T_STRING_VARNAME), "\n"; echo token_name(T_VARIABLE), "\n"; echo token_name(T_NUM_STRING), "\n"; echo token_name(T_INLINE_HTML), "\n"; -echo token_name(T_CHARACTER), "\n"; -echo token_name(T_BAD_CHARACTER), "\n"; echo token_name(T_ENCAPSED_AND_WHITESPACE), "\n"; echo token_name(T_CONSTANT_ENCAPSED_STRING), "\n"; echo token_name(T_ECHO), "\n"; @@ -185,8 +183,6 @@ T_STRING_VARNAME T_VARIABLE T_NUM_STRING T_INLINE_HTML -T_CHARACTER -T_BAD_CHARACTER T_ENCAPSED_AND_WHITESPACE T_CONSTANT_ENCAPSED_STRING T_ECHO diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index 7df3439b1e..4cd5242b62 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -280,14 +280,15 @@ static void tokenize(zval *return_value TSRMLS_DC) while ((token_type = lex_scan(&token TSRMLS_CC))) { destroy = 1; switch (token_type) { - case EOF: - zendleng--; /* don't count EOF */ + case T_CLOSE_TAG: + if (zendtext[zendleng - 1] != '>') { + CG(zend_lineno)++; + } case T_OPEN_TAG: case T_OPEN_TAG_WITH_ECHO: case T_WHITESPACE: case T_COMMENT: case T_DOC_COMMENT: - case T_CLOSE_TAG: destroy = 0; break; } @@ -297,6 +298,10 @@ static void tokenize(zval *return_value TSRMLS_DC) array_init(keyword); add_next_index_long(keyword, token_type); if (token_type == T_END_HEREDOC) { + if (CG(increment_lineno)) { + token_line = ++CG(zend_lineno); + CG(increment_lineno) = 0; + } add_next_index_stringl(keyword, Z_STRVAL(token), Z_STRLEN(token), 1); efree(Z_STRVAL(token)); } else { @@ -372,8 +377,6 @@ get_token_type_name(int token_type) case T_VARIABLE: return "T_VARIABLE"; case T_NUM_STRING: return "T_NUM_STRING"; case T_INLINE_HTML: return "T_INLINE_HTML"; - case T_CHARACTER: return "T_CHARACTER"; - case T_BAD_CHARACTER: return "T_BAD_CHARACTER"; case T_ENCAPSED_AND_WHITESPACE: return "T_ENCAPSED_AND_WHITESPACE"; case T_CONSTANT_ENCAPSED_STRING: return "T_CONSTANT_ENCAPSED_STRING"; case T_ECHO: return "T_ECHO"; -- 2.40.0