From 5a09b9fb0f9aa432843673887cde40bfc8737020 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 Mar 2017 16:14:39 +0100 Subject: [PATCH] Add PhpToken class RFC: https://wiki.php.net/rfc/token_as_object Relative to the RFC, this also adds a __toString() method, as discussed on list. Closes GH-5176. --- UPGRADING | 6 + ext/tokenizer/tests/PhpToken_constructor.phpt | 46 +++ ext/tokenizer/tests/PhpToken_extension.phpt | 36 ++ .../tests/PhpToken_extension_errors.phpt | 30 ++ .../tests/PhpToken_final_constructor.phpt | 15 + ext/tokenizer/tests/PhpToken_getAll.phpt | 358 +++++++++++++++++ ext/tokenizer/tests/PhpToken_methods.phpt | 119 ++++++ ext/tokenizer/tests/PhpToken_toString.phpt | 18 + ext/tokenizer/tokenizer.c | 378 +++++++++++++++--- ext/tokenizer/tokenizer.stub.php | 16 + ext/tokenizer/tokenizer_arginfo.h | 25 ++ ext/tokenizer/tokenizer_data.c | 2 +- ext/tokenizer/tokenizer_data_gen.sh | 2 +- 13 files changed, 1001 insertions(+), 50 deletions(-) create mode 100644 ext/tokenizer/tests/PhpToken_constructor.phpt create mode 100644 ext/tokenizer/tests/PhpToken_extension.phpt create mode 100644 ext/tokenizer/tests/PhpToken_extension_errors.phpt create mode 100644 ext/tokenizer/tests/PhpToken_final_constructor.phpt create mode 100644 ext/tokenizer/tests/PhpToken_getAll.phpt create mode 100644 ext/tokenizer/tests/PhpToken_methods.phpt create mode 100644 ext/tokenizer/tests/PhpToken_toString.phpt diff --git a/UPGRADING b/UPGRADING index 4bd913396a..ab0999282d 100644 --- a/UPGRADING +++ b/UPGRADING @@ -533,6 +533,12 @@ PHP 8.0 UPGRADE NOTES 7. New Classes and Interfaces ======================================== +- Tokenizer: + . The new PhpToken class adds an object-based interface to the tokenizer. + It provides a more uniform and ergonomic representation, while being more + memory efficient and faster. + RFC: https://wiki.php.net/rfc/token_as_object + ======================================== 8. 
Removed Extensions and SAPIs ======================================== diff --git a/ext/tokenizer/tests/PhpToken_constructor.phpt b/ext/tokenizer/tests/PhpToken_constructor.phpt new file mode 100644 index 0000000000..fb167ac684 --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_constructor.phpt @@ -0,0 +1,46 @@ +--TEST-- +PhpToken constructor +--SKIPIF-- + +--FILE-- + +--EXPECT-- +object(PhpToken)#1 (4) { + ["id"]=> + int(300) + ["text"]=> + string(8) "function" + ["line"]=> + int(-1) + ["pos"]=> + int(-1) +} +object(PhpToken)#2 (4) { + ["id"]=> + int(300) + ["text"]=> + string(8) "function" + ["line"]=> + int(10) + ["pos"]=> + int(-1) +} +object(PhpToken)#1 (4) { + ["id"]=> + int(300) + ["text"]=> + string(8) "function" + ["line"]=> + int(10) + ["pos"]=> + int(100) +} diff --git a/ext/tokenizer/tests/PhpToken_extension.phpt b/ext/tokenizer/tests/PhpToken_extension.phpt new file mode 100644 index 0000000000..ef1a4f1272 --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_extension.phpt @@ -0,0 +1,36 @@ +--TEST-- +Extending the PhpToken class +--SKIPIF-- + +--FILE-- +text); + } +} + +foreach (MyPhpToken::getAll($code) as $token) { + echo $token->getLoweredText(); + + if ($token->extra !== 123) { + echo "Missing property!\n"; + } +} + +?> +--EXPECT-- + +--FILE-- +getMessage(), "\n"; +} + +abstract class MyPhpToken2 extends PhpToken { +} + +try { + var_dump(MyPhpToken2::getAll("getMessage(), "\n"; +} + +?> +--EXPECT-- +Undefined constant 'UNKNOWN' +Cannot instantiate abstract class MyPhpToken2 diff --git a/ext/tokenizer/tests/PhpToken_final_constructor.phpt b/ext/tokenizer/tests/PhpToken_final_constructor.phpt new file mode 100644 index 0000000000..7f4061dbe8 --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_final_constructor.phpt @@ -0,0 +1,15 @@ +--TEST-- +Check that the PhpToken constructor is final +--SKIPIF-- + +--FILE-- + +--EXPECTF-- +Fatal error: Cannot override final method PhpToken::__construct() in %s on line %d diff --git a/ext/tokenizer/tests/PhpToken_getAll.phpt 
b/ext/tokenizer/tests/PhpToken_getAll.phpt new file mode 100644 index 0000000000..604a979023 --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_getAll.phpt @@ -0,0 +1,358 @@ +--TEST-- +PhpToken::getAll() method +--SKIPIF-- + +--FILE-- + +--EXPECT-- +array(15) { + [0]=> + object(PhpToken)#1 (4) { + ["id"]=> + int(382) + ["text"]=> + string(6) " + int(1) + ["pos"]=> + int(0) + } + [1]=> + object(PhpToken)#2 (4) { + ["id"]=> + int(342) + ["text"]=> + string(8) "function" + ["line"]=> + int(2) + ["pos"]=> + int(6) + } + [2]=> + object(PhpToken)#3 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(2) + ["pos"]=> + int(14) + } + [3]=> + object(PhpToken)#4 (4) { + ["id"]=> + int(310) + ["text"]=> + string(3) "foo" + ["line"]=> + int(2) + ["pos"]=> + int(15) + } + [4]=> + object(PhpToken)#5 (4) { + ["id"]=> + int(40) + ["text"]=> + string(1) "(" + ["line"]=> + int(2) + ["pos"]=> + int(18) + } + [5]=> + object(PhpToken)#6 (4) { + ["id"]=> + int(41) + ["text"]=> + string(1) ")" + ["line"]=> + int(2) + ["pos"]=> + int(19) + } + [6]=> + object(PhpToken)#7 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(2) + ["pos"]=> + int(20) + } + [7]=> + object(PhpToken)#8 (4) { + ["id"]=> + int(123) + ["text"]=> + string(1) "{" + ["line"]=> + int(2) + ["pos"]=> + int(21) + } + [8]=> + object(PhpToken)#9 (4) { + ["id"]=> + int(385) + ["text"]=> + string(5) " + " + ["line"]=> + int(2) + ["pos"]=> + int(22) + } + [9]=> + object(PhpToken)#10 (4) { + ["id"]=> + int(324) + ["text"]=> + string(4) "echo" + ["line"]=> + int(3) + ["pos"]=> + int(27) + } + [10]=> + object(PhpToken)#11 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(3) + ["pos"]=> + int(31) + } + [11]=> + object(PhpToken)#12 (4) { + ["id"]=> + int(314) + ["text"]=> + string(5) ""bar"" + ["line"]=> + int(3) + ["pos"]=> + int(32) + } + [12]=> + object(PhpToken)#13 (4) { + ["id"]=> + int(59) + ["text"]=> + string(1) ";" + ["line"]=> + int(3) + ["pos"]=> + 
int(37) + } + [13]=> + object(PhpToken)#14 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " +" + ["line"]=> + int(3) + ["pos"]=> + int(38) + } + [14]=> + object(PhpToken)#15 (4) { + ["id"]=> + int(125) + ["text"]=> + string(1) "}" + ["line"]=> + int(4) + ["pos"]=> + int(39) + } +} +array(15) { + [0]=> + object(PhpToken)#15 (4) { + ["id"]=> + int(382) + ["text"]=> + string(6) " + int(1) + ["pos"]=> + int(0) + } + [1]=> + object(PhpToken)#14 (4) { + ["id"]=> + int(342) + ["text"]=> + string(8) "function" + ["line"]=> + int(2) + ["pos"]=> + int(6) + } + [2]=> + object(PhpToken)#13 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(2) + ["pos"]=> + int(14) + } + [3]=> + object(PhpToken)#12 (4) { + ["id"]=> + int(310) + ["text"]=> + string(3) "foo" + ["line"]=> + int(2) + ["pos"]=> + int(15) + } + [4]=> + object(PhpToken)#11 (4) { + ["id"]=> + int(40) + ["text"]=> + string(1) "(" + ["line"]=> + int(2) + ["pos"]=> + int(18) + } + [5]=> + object(PhpToken)#10 (4) { + ["id"]=> + int(41) + ["text"]=> + string(1) ")" + ["line"]=> + int(2) + ["pos"]=> + int(19) + } + [6]=> + object(PhpToken)#9 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(2) + ["pos"]=> + int(20) + } + [7]=> + object(PhpToken)#8 (4) { + ["id"]=> + int(123) + ["text"]=> + string(1) "{" + ["line"]=> + int(2) + ["pos"]=> + int(21) + } + [8]=> + object(PhpToken)#7 (4) { + ["id"]=> + int(385) + ["text"]=> + string(5) " + " + ["line"]=> + int(2) + ["pos"]=> + int(22) + } + [9]=> + object(PhpToken)#6 (4) { + ["id"]=> + int(324) + ["text"]=> + string(4) "echo" + ["line"]=> + int(3) + ["pos"]=> + int(27) + } + [10]=> + object(PhpToken)#5 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " " + ["line"]=> + int(3) + ["pos"]=> + int(31) + } + [11]=> + object(PhpToken)#4 (4) { + ["id"]=> + int(314) + ["text"]=> + string(5) ""bar"" + ["line"]=> + int(3) + ["pos"]=> + int(32) + } + [12]=> + object(PhpToken)#3 (4) { + ["id"]=> + int(59) + ["text"]=> + string(1) 
";" + ["line"]=> + int(3) + ["pos"]=> + int(37) + } + [13]=> + object(PhpToken)#2 (4) { + ["id"]=> + int(385) + ["text"]=> + string(1) " +" + ["line"]=> + int(3) + ["pos"]=> + int(38) + } + [14]=> + object(PhpToken)#1 (4) { + ["id"]=> + int(125) + ["text"]=> + string(1) "}" + ["line"]=> + int(4) + ["pos"]=> + int(39) + } +} diff --git a/ext/tokenizer/tests/PhpToken_methods.phpt b/ext/tokenizer/tests/PhpToken_methods.phpt new file mode 100644 index 0000000000..9429cea7ed --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_methods.phpt @@ -0,0 +1,119 @@ +--TEST-- +PhpToken instance methods +--SKIPIF-- + +--FILE-- + $token) { + printf("[%2d] %-26s %s\n", $i, $token->getTokenName(), + $token->isIgnorable() ? "ignorable" : "meaningful"); +} + +// is() variations + +echo "\nSuccess:\n"; +var_dump($tokens[4]->is(T_FUNCTION)); +var_dump($tokens[4]->is('function')); +var_dump($tokens[4]->is(['class', T_FUNCTION])); +var_dump($tokens[4]->is([T_CLASS, 'function'])); + +echo "\nFailure:\n"; +var_dump($tokens[4]->is(T_CLASS)); +var_dump($tokens[4]->is('class')); +var_dump($tokens[4]->is(['class', T_TRAIT])); +var_dump($tokens[4]->is([T_CLASS, 'trait'])); + +echo "\nError:\n"; +try { + $tokens[4]->is(3.141); +} catch (TypeError $e) { + echo $e->getMessage(), "\n"; +} +try { + $tokens[4]->is([3.141]); +} catch (TypeError $e) { + echo $e->getMessage(), "\n"; +} + +unset($tokens[4]->id); +unset($tokens[4]->text); +try { + $tokens[4]->is(T_FUNCTION); +} catch (Error $e) { + echo $e->getMessage(), "\n"; +} +try { + $tokens[4]->is('function'); +} catch (Error $e) { + echo $e->getMessage(), "\n"; +} +try { + $tokens[4]->is([T_FUNCTION]); +} catch (Error $e) { + echo $e->getMessage(), "\n"; +} +try { + $tokens[4]->is(['function']); +} catch (Error $e) { + echo $e->getMessage(), "\n"; +} + +echo "\nName of unknown token:\n"; +$token = new PhpToken(100000, "foo"); +var_dump($token->getTokenName()); + +?> +--EXPECT-- +[ 0] T_OPEN_TAG ignorable +[ 1] T_COMMENT ignorable +[ 2] T_DOC_COMMENT 
ignorable +[ 3] T_WHITESPACE ignorable +[ 4] T_FUNCTION meaningful +[ 5] T_WHITESPACE ignorable +[ 6] T_STRING meaningful +[ 7] ( meaningful +[ 8] ) meaningful +[ 9] T_WHITESPACE ignorable +[10] { meaningful +[11] T_WHITESPACE ignorable +[12] T_ECHO meaningful +[13] T_WHITESPACE ignorable +[14] T_CONSTANT_ENCAPSED_STRING meaningful +[15] ; meaningful +[16] T_WHITESPACE ignorable +[17] } meaningful + +Success: +bool(true) +bool(true) +bool(true) +bool(true) + +Failure: +bool(false) +bool(false) +bool(false) +bool(false) + +Error: +Kind must be of type int, string or array +Kind array must have elements of type int or string +Typed property PhpToken::$id must not be accessed before initialization +Typed property PhpToken::$text must not be accessed before initialization +Typed property PhpToken::$id must not be accessed before initialization +Typed property PhpToken::$text must not be accessed before initialization + +Name of unknown token: +NULL diff --git a/ext/tokenizer/tests/PhpToken_toString.phpt b/ext/tokenizer/tests/PhpToken_toString.phpt new file mode 100644 index 0000000000..17dbfa84a7 --- /dev/null +++ b/ext/tokenizer/tests/PhpToken_toString.phpt @@ -0,0 +1,18 @@ +--TEST-- +PhpToken implements __toString() +--FILE-- +__toString()); + +?> +--EXPECT-- +string(27) " +#include "zend_interfaces.h" #define zendtext LANG_SCNG(yy_text) #define zendleng LANG_SCNG(yy_leng) #define zendcursor LANG_SCNG(yy_cursor) #define zendlimit LANG_SCNG(yy_limit) -#define TOKEN_PARSE 1 +#define TOKEN_PARSE (1 << 0) + +zend_class_entry *php_token_ce; void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) { REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT); @@ -72,12 +75,237 @@ zend_module_entry tokenizer_module_entry = { ZEND_GET_MODULE(tokenizer) #endif +static zval *php_token_get_id(zval *obj) { + zval *id = OBJ_PROP_NUM(Z_OBJ_P(obj), 0); + if (Z_ISUNDEF_P(id)) { + zend_throw_error(NULL, + "Typed property PhpToken::$id must not be accessed 
before initialization"); + return NULL; + } + + ZVAL_DEREF(id); + ZEND_ASSERT(Z_TYPE_P(id) == IS_LONG); + return id; +} + +static zend_string *php_token_get_text(zval *obj) { + zval *text_zval = OBJ_PROP_NUM(Z_OBJ_P(obj), 1); + if (Z_ISUNDEF_P(text_zval)) { + zend_throw_error(NULL, + "Typed property PhpToken::$text must not be accessed before initialization"); + return NULL; + } + + ZVAL_DEREF(text_zval); + ZEND_ASSERT(Z_TYPE_P(text_zval) == IS_STRING); + return Z_STR_P(text_zval); +} + +static zend_bool tokenize_common( + zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class); + +PHP_METHOD(PhpToken, getAll) +{ + zend_string *source; + zend_long flags = 0; + zend_class_entry *token_class; + + ZEND_PARSE_PARAMETERS_START(1, 2) + Z_PARAM_STR(source) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(flags) + ZEND_PARSE_PARAMETERS_END(); + + token_class = zend_get_called_scope(execute_data); + + /* Check construction preconditions in advance, so these are not repeated for each token. 
*/ + if (token_class->ce_flags & ZEND_ACC_EXPLICIT_ABSTRACT_CLASS) { + zend_throw_error(NULL, "Cannot instantiate abstract class %s", ZSTR_VAL(token_class->name)); + RETURN_THROWS(); + } + if (zend_update_class_constants(token_class) == FAILURE) { + RETURN_THROWS(); + } + + if (!tokenize_common(return_value, source, flags, token_class)) { + RETURN_THROWS(); + } +} + +PHP_METHOD(PhpToken, __construct) +{ + zend_long id; + zend_string *text; + zend_long line = -1; + zend_long pos = -1; + zend_object *obj = Z_OBJ_P(ZEND_THIS); + + ZEND_PARSE_PARAMETERS_START(2, 4) + Z_PARAM_LONG(id) + Z_PARAM_STR(text) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(line) + Z_PARAM_LONG(pos) + ZEND_PARSE_PARAMETERS_END(); + + ZVAL_LONG(OBJ_PROP_NUM(obj, 0), id); + zval_ptr_dtor(OBJ_PROP_NUM(obj, 1)); + ZVAL_STR_COPY(OBJ_PROP_NUM(obj, 1), text); + ZVAL_LONG(OBJ_PROP_NUM(obj, 2), line); + ZVAL_LONG(OBJ_PROP_NUM(obj, 3), pos); +} + +PHP_METHOD(PhpToken, is) +{ + zval *kind; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_ZVAL(kind) + ZEND_PARSE_PARAMETERS_END(); + + if (Z_TYPE_P(kind) == IS_LONG) { + zval *id_zval = php_token_get_id(ZEND_THIS); + if (!id_zval) { + RETURN_THROWS(); + } + + RETURN_BOOL(Z_LVAL_P(id_zval) == Z_LVAL_P(kind)); + } else if (Z_TYPE_P(kind) == IS_STRING) { + zend_string *text = php_token_get_text(ZEND_THIS); + if (!text) { + RETURN_THROWS(); + } + + RETURN_BOOL(zend_string_equals(text, Z_STR_P(kind))); + } else if (Z_TYPE_P(kind) == IS_ARRAY) { + zval *id_zval = NULL, *entry; + zend_string *text = NULL; + ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(kind), entry) { + ZVAL_DEREF(entry); + if (Z_TYPE_P(entry) == IS_LONG) { + if (!id_zval) { + id_zval = php_token_get_id(ZEND_THIS); + if (!id_zval) { + RETURN_THROWS(); + } + } + if (Z_LVAL_P(id_zval) == Z_LVAL_P(entry)) { + RETURN_TRUE; + } + } else if (Z_TYPE_P(entry) == IS_STRING) { + if (!text) { + text = php_token_get_text(ZEND_THIS); + if (!text) { + RETURN_THROWS(); + } + } + if (zend_string_equals(text, Z_STR_P(entry))) { + 
RETURN_TRUE; + } + } else { + zend_type_error("Kind array must have elements of type int or string"); + RETURN_THROWS(); + } + } ZEND_HASH_FOREACH_END(); + RETURN_FALSE; + } else { + zend_type_error("Kind must be of type int, string or array"); + RETURN_THROWS(); + } +} + +PHP_METHOD(PhpToken, isIgnorable) +{ + ZEND_PARSE_PARAMETERS_NONE(); + + zval *id_zval = php_token_get_id(ZEND_THIS); + if (!id_zval) { + RETURN_THROWS(); + } + + zend_long id = Z_LVAL_P(id_zval); + RETURN_BOOL(id == T_WHITESPACE || id == T_COMMENT || id == T_DOC_COMMENT || id == T_OPEN_TAG); +} + +PHP_METHOD(PhpToken, getTokenName) +{ + ZEND_PARSE_PARAMETERS_NONE(); + + zval *id_zval = php_token_get_id(ZEND_THIS); + if (!id_zval) { + RETURN_THROWS(); + } + + /* Ids 0..255 are single-character tokens. Negative ids (possible via the public constructor) must not reach ZSTR_CHAR(), which indexes a 256-entry table. */ + if (Z_LVAL_P(id_zval) >= 0 && Z_LVAL_P(id_zval) < 256) { + RETURN_INTERNED_STR(ZSTR_CHAR(Z_LVAL_P(id_zval))); + } else { + const char *token_name = get_token_type_name(Z_LVAL_P(id_zval)); + if (!token_name) { + RETURN_NULL(); + } + + RETURN_STRING(token_name); + } +} + +PHP_METHOD(PhpToken, __toString) +{ + ZEND_PARSE_PARAMETERS_NONE(); + + zend_string *text = php_token_get_text(ZEND_THIS); + if (!text) { + RETURN_THROWS(); + } + + RETURN_STR_COPY(text); +} + +static const zend_function_entry php_token_methods[] = { + PHP_ME(PhpToken, getAll, arginfo_class_PhpToken_getAll, ZEND_ACC_PUBLIC|ZEND_ACC_STATIC) + PHP_ME(PhpToken, __construct, arginfo_class_PhpToken___construct, ZEND_ACC_PUBLIC|ZEND_ACC_FINAL) + PHP_ME(PhpToken, is, arginfo_class_PhpToken_is, ZEND_ACC_PUBLIC) + PHP_ME(PhpToken, isIgnorable, arginfo_class_PhpToken_isIgnorable, ZEND_ACC_PUBLIC) + PHP_ME(PhpToken, getTokenName, arginfo_class_PhpToken_getTokenName, ZEND_ACC_PUBLIC) + PHP_ME(PhpToken, __toString, arginfo_class_PhpToken___toString, ZEND_ACC_PUBLIC) + PHP_FE_END +}; + /* {{{ PHP_MINIT_FUNCTION */ PHP_MINIT_FUNCTION(tokenizer) { + zend_class_entry ce; + zend_string *name; + zval default_val; + ZVAL_UNDEF(&default_val); + tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU);
tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU); + + INIT_CLASS_ENTRY(ce, "PhpToken", php_token_methods); + php_token_ce = zend_register_internal_class(&ce); + zend_class_implements(php_token_ce, 1, zend_ce_stringable); + + name = zend_string_init("id", sizeof("id") - 1, 1); + zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL, + (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG)); + zend_string_release(name); + + name = zend_string_init("text", sizeof("text") - 1, 1); + zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL, + (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_STRING)); + zend_string_release(name); + + name = zend_string_init("line", sizeof("line") - 1, 1); + zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL, + (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG)); + zend_string_release(name); + + name = zend_string_init("pos", sizeof("pos") - 1, 1); + zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL, + (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG)); + zend_string_release(name); + return SUCCESS; } /* }}} */ @@ -92,29 +320,56 @@ PHP_MINFO_FUNCTION(tokenizer) } /* }}} */ -static void add_token(zval *return_value, int token_type, - unsigned char *text, size_t leng, int lineno) { - if (token_type >= 256) { - zval keyword; - array_init(&keyword); - add_next_index_long(&keyword, token_type); - if (leng == 1) { - add_next_index_str(&keyword, ZSTR_CHAR(text[0])); - } else { - add_next_index_stringl(&keyword, (char *) text, leng); +static zend_string *make_str(unsigned char *text, size_t leng, HashTable *interned_strings) { + if (leng == 1) { + return ZSTR_CHAR(text[0]); + } else if (interned_strings) { + zend_string *interned_str = zend_hash_str_find_ptr(interned_strings, (char *) text, leng); + if (interned_str) { + return zend_string_copy(interned_str); } - add_next_index_long(&keyword, lineno); - add_next_index_zval(return_value, 
&keyword); + interned_str = zend_string_init((char *) text, leng, 0); + zend_hash_add_new_ptr(interned_strings, interned_str, interned_str); + return interned_str; } else { - if (leng == 1) { - add_next_index_str(return_value, ZSTR_CHAR(text[0])); - } else { - add_next_index_stringl(return_value, (char *) text, leng); + return zend_string_init((char *) text, leng, 0); + } +} + +static void add_token( + zval *return_value, int token_type, unsigned char *text, size_t leng, int lineno, + zend_class_entry *token_class, HashTable *interned_strings) { + zval token; + if (token_class) { + zend_object *obj = zend_objects_new(token_class); + ZVAL_OBJ(&token, obj); + ZVAL_LONG(OBJ_PROP_NUM(obj, 0), token_type); + ZVAL_STR(OBJ_PROP_NUM(obj, 1), make_str(text, leng, interned_strings)); + ZVAL_LONG(OBJ_PROP_NUM(obj, 2), lineno); + ZVAL_LONG(OBJ_PROP_NUM(obj, 3), text - LANG_SCNG(yy_start)); + + /* If the class is extended with additional properties, initialize them as well. */ + if (UNEXPECTED(token_class->default_properties_count > 4)) { + zval *dst = OBJ_PROP_NUM(obj, 4); + zval *src = &token_class->default_properties_table[4]; + zval *end = token_class->default_properties_table + + token_class->default_properties_count; + for (; src < end; src++, dst++) { + ZVAL_COPY_PROP(dst, src); + } } + } else if (token_type >= 256) { + array_init(&token); + add_next_index_long(&token, token_type); + add_next_index_str(&token, make_str(text, leng, interned_strings)); + add_next_index_long(&token, lineno); + } else { + ZVAL_STR(&token, make_str(text, leng, interned_strings)); } + zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &token); } -static zend_bool tokenize(zval *return_value, zend_string *source) +static zend_bool tokenize(zval *return_value, zend_string *source, zend_class_entry *token_class) { zval source_zval; zend_lex_state original_lex_state; @@ -122,6 +377,7 @@ static zend_bool tokenize(zval *return_value, zend_string *source) int token_type; int token_line = 1;
int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */ + HashTable interned_strings; ZVAL_STR_COPY(&source_zval, source); zend_save_lexical_state(&original_lex_state); @@ -132,10 +388,13 @@ static zend_bool tokenize(zval *return_value, zend_string *source) } LANG_SCNG(yy_state) = yycINITIAL; + zend_hash_init(&interned_strings, 0, NULL, NULL, 0); array_init(return_value); while ((token_type = lex_scan(&token, NULL))) { - add_token(return_value, token_type, zendtext, zendleng, token_line); + add_token( + return_value, token_type, zendtext, zendleng, token_line, + token_class, &interned_strings); if (Z_TYPE(token) != IS_UNDEF) { zval_ptr_dtor_nogc(&token); @@ -150,8 +409,9 @@ static zend_bool tokenize(zval *return_value, zend_string *source) ) { /* fetch the rest into a T_INLINE_HTML */ if (zendcursor != zendlimit) { - add_token(return_value, T_INLINE_HTML, - zendcursor, zendlimit - zendcursor, token_line); + add_token( + return_value, T_INLINE_HTML, zendcursor, zendlimit - zendcursor, + token_line, token_class, &interned_strings); } break; } @@ -169,46 +429,56 @@ static zend_bool tokenize(zval *return_value, zend_string *source) zval_ptr_dtor_str(&source_zval); zend_restore_lexical_state(&original_lex_state); + zend_hash_destroy(&interned_strings); return 1; } +struct event_context { + zval *tokens; + zend_class_entry *token_class; +}; + void on_event(zend_php_scanner_event event, int token, int line, void *context) { - zval *token_stream = (zval *) context; + struct event_context *ctx = context; HashTable *tokens_ht; zval *token_zv; switch (event) { case ON_TOKEN: - { - if (token == END) break; - /* Special cases */ - if (token == ';' && LANG_SCNG(yy_leng) > 1) { /* ?> or ?>\n or ?>\r\n */ - token = T_CLOSE_TAG; - } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof(" 1) { /* ?> or ?>\n or ?>\r\n */ + token = T_CLOSE_TAG; + } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("tokens, token, + LANG_SCNG(yy_text), LANG_SCNG(yy_leng), line, 
ctx->token_class, NULL); break; case ON_FEEDBACK: - tokens_ht = Z_ARRVAL_P(token_stream); + tokens_ht = Z_ARRVAL_P(ctx->tokens); token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1); - if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) { + ZEND_ASSERT(token_zv); + if (Z_TYPE_P(token_zv) == IS_ARRAY) { ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token); + } else { + /* PhpToken keeps the token kind in its declared "id" property; update that one so the parser's re-classification is visible to callers. */ + zend_update_property_long(php_token_ce, token_zv, "id", sizeof("id")-1, token); } break; case ON_STOP: if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) { - add_token(token_stream, T_INLINE_HTML, LANG_SCNG(yy_cursor), - LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno)); + add_token(ctx->tokens, T_INLINE_HTML, LANG_SCNG(yy_cursor), + LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno), + ctx->token_class, NULL); } break; } } -static zend_bool tokenize_parse(zval *return_value, zend_string *source) +static zend_bool tokenize_parse( + zval *return_value, zend_string *source, zend_class_entry *token_class) { zval source_zval; zend_lex_state original_lex_state; @@ -222,14 +492,18 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source) zend_save_lexical_state(&original_lex_state); if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) { + struct event_context ctx; zval token_stream; array_init(&token_stream); + ctx.tokens = &token_stream; + ctx.token_class = token_class; + CG(ast) = NULL; CG(ast_arena) = zend_arena_create(1024 * 32); LANG_SCNG(yy_state) = yycINITIAL; LANG_SCNG(on_event) = on_event; - LANG_SCNG(on_event_context) = &token_stream; + LANG_SCNG(on_event_context) = &ctx; if((success = (zendparse() == SUCCESS))) { ZVAL_COPY_VALUE(return_value, &token_stream); @@ -250,6 +524,19 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source) return success; } +static zend_bool tokenize_common( + zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class) +{
+ if (flags & TOKEN_PARSE) { + return tokenize_parse(return_value, source, token_class); + } else { + int success = tokenize(return_value, source, token_class); + /* Normal token_get_all() should not throw. */ + zend_clear_exception(); + return success; + } +} + /* }}} */ /* {{{ proto array token_get_all(string source [, int flags]) @@ -258,7 +545,6 @@ PHP_FUNCTION(token_get_all) { zend_string *source; zend_long flags = 0; - zend_bool success; ZEND_PARSE_PARAMETERS_START(1, 2) Z_PARAM_STR(source) @@ -266,15 +552,7 @@ PHP_FUNCTION(token_get_all) Z_PARAM_LONG(flags) ZEND_PARSE_PARAMETERS_END(); - if (flags & TOKEN_PARSE) { - success = tokenize_parse(return_value, source); - } else { - success = tokenize(return_value, source); - /* Normal token_get_all() should not throw. */ - zend_clear_exception(); - } - - if (!success) { + if (!tokenize_common(return_value, source, flags, /* token_class */ NULL)) { RETURN_THROWS(); } } @@ -290,6 +568,10 @@ PHP_FUNCTION(token_name) Z_PARAM_LONG(type) ZEND_PARSE_PARAMETERS_END(); - RETVAL_STRING(get_token_type_name(type)); + const char *token_name = get_token_type_name(type); + if (!token_name) { + token_name = "UNKNOWN"; + } + RETURN_STRING(token_name); } /* }}} */ diff --git a/ext/tokenizer/tokenizer.stub.php b/ext/tokenizer/tokenizer.stub.php index 63a6c2e72c..801c1c8504 100644 --- a/ext/tokenizer/tokenizer.stub.php +++ b/ext/tokenizer/tokenizer.stub.php @@ -3,3 +3,19 @@ function token_get_all(string $source, int $flags = 0): array {} function token_name(int $token): string {} + +class PhpToken implements Stringable { + /** @return static[] */ + public static function getAll(string $code, int $flags = 0): array; + + public final function __construct(int $id, string $text, int $line = -1, int $pos = -1); + + /** @param int|string|array $kind */ + public function is($kind): bool; + + public function isIgnorable(): bool; + + public function getTokenName(): ?string; + + public function __toString(): string; +} diff --git 
a/ext/tokenizer/tokenizer_arginfo.h b/ext/tokenizer/tokenizer_arginfo.h index d777535a48..d927c8d0e6 100644 --- a/ext/tokenizer/tokenizer_arginfo.h +++ b/ext/tokenizer/tokenizer_arginfo.h @@ -8,3 +8,28 @@ ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_token_name, 0, 1, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, token, IS_LONG, 0) ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getAll, 0, 1, IS_ARRAY, 0) + ZEND_ARG_TYPE_INFO(0, code, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, flags, IS_LONG, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_class_PhpToken___construct, 0, 0, 2) + ZEND_ARG_TYPE_INFO(0, id, IS_LONG, 0) + ZEND_ARG_TYPE_INFO(0, text, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, line, IS_LONG, 0) + ZEND_ARG_TYPE_INFO(0, pos, IS_LONG, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_is, 0, 1, _IS_BOOL, 0) + ZEND_ARG_INFO(0, kind) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_isIgnorable, 0, 0, _IS_BOOL, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getTokenName, 0, 0, IS_STRING, 1) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken___toString, 0, 0, IS_STRING, 0) +ZEND_END_ARG_INFO() diff --git a/ext/tokenizer/tokenizer_data.c b/ext/tokenizer/tokenizer_data.c index 7e580dd844..3ddf89521a 100644 --- a/ext/tokenizer/tokenizer_data.c +++ b/ext/tokenizer/tokenizer_data.c @@ -306,6 +306,6 @@ char *get_token_type_name(int token_type) case T_BAD_CHARACTER: return "T_BAD_CHARACTER"; } - return "UNKNOWN"; + return NULL; } diff --git a/ext/tokenizer/tokenizer_data_gen.sh b/ext/tokenizer/tokenizer_data_gen.sh index 4d5e97ddde..1dbe77d2e7 100755 --- a/ext/tokenizer/tokenizer_data_gen.sh +++ b/ext/tokenizer/tokenizer_data_gen.sh @@ -71,7 +71,7 @@ awk ' echo ' } - return "UNKNOWN"; + return NULL; } ' >> $outfile -- 2.40.0