]> granicus.if.org Git - php/commitdiff
Add PhpToken class
author Nikita Popov <nikita.ppv@gmail.com>
Thu, 23 Mar 2017 15:14:39 +0000 (16:14 +0100)
committer Nikita Popov <nikita.ppv@gmail.com>
Thu, 26 Mar 2020 10:09:18 +0000 (11:09 +0100)
RFC: https://wiki.php.net/rfc/token_as_object

Relative to the RFC, this also adds a __toString() method,
as discussed on list.

Closes GH-5176.

13 files changed:
UPGRADING
ext/tokenizer/tests/PhpToken_constructor.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_extension.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_extension_errors.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_final_constructor.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_getAll.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_methods.phpt [new file with mode: 0644]
ext/tokenizer/tests/PhpToken_toString.phpt [new file with mode: 0644]
ext/tokenizer/tokenizer.c
ext/tokenizer/tokenizer.stub.php
ext/tokenizer/tokenizer_arginfo.h
ext/tokenizer/tokenizer_data.c
ext/tokenizer/tokenizer_data_gen.sh

index 4bd913396abf7706c12b46aabe970f15f4e35a2f..ab0999282d476db7b3135778afeab3083337bcb0 100644 (file)
--- a/UPGRADING
+++ b/UPGRADING
@@ -533,6 +533,12 @@ PHP 8.0 UPGRADE NOTES
 7. New Classes and Interfaces
 ========================================
 
+- Tokenizer:
+  . The new PhpToken class adds an object-based interface to the tokenizer.
+    It provides a more uniform and ergonomic representation, while being more
+    memory efficient and faster.
+    RFC: https://wiki.php.net/rfc/token_as_object
+
 ========================================
 8. Removed Extensions and SAPIs
 ========================================
diff --git a/ext/tokenizer/tests/PhpToken_constructor.phpt b/ext/tokenizer/tests/PhpToken_constructor.phpt
new file mode 100644 (file)
index 0000000..fb167ac
--- /dev/null
@@ -0,0 +1,46 @@
+--TEST--
+PhpToken constructor
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$token = new PhpToken(300, 'function');
+var_dump($token);
+$token = new PhpToken(300, 'function', 10);
+var_dump($token);
+$token = new PhpToken(300, 'function', 10, 100);
+var_dump($token);
+
+?>
+--EXPECT--
+object(PhpToken)#1 (4) {
+  ["id"]=>
+  int(300)
+  ["text"]=>
+  string(8) "function"
+  ["line"]=>
+  int(-1)
+  ["pos"]=>
+  int(-1)
+}
+object(PhpToken)#2 (4) {
+  ["id"]=>
+  int(300)
+  ["text"]=>
+  string(8) "function"
+  ["line"]=>
+  int(10)
+  ["pos"]=>
+  int(-1)
+}
+object(PhpToken)#1 (4) {
+  ["id"]=>
+  int(300)
+  ["text"]=>
+  string(8) "function"
+  ["line"]=>
+  int(10)
+  ["pos"]=>
+  int(100)
+}
diff --git a/ext/tokenizer/tests/PhpToken_extension.phpt b/ext/tokenizer/tests/PhpToken_extension.phpt
new file mode 100644 (file)
index 0000000..ef1a4f1
--- /dev/null
@@ -0,0 +1,36 @@
+--TEST--
+Extending the PhpToken class
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?PHP
+FUNCTION FOO() {
+    ECHO "bar";
+}
+PHP;
+
+class MyPhpToken extends PhpToken {
+    public int $extra = 123;
+
+    public function getLoweredText(): string {
+        return strtolower($this->text);
+    }
+}
+
+foreach (MyPhpToken::getAll($code) as $token) {
+    echo $token->getLoweredText();
+
+    if ($token->extra !== 123) {
+        echo "Missing property!\n";
+    }
+}
+
+?>
+--EXPECT--
+<?php
+function foo() {
+    echo "bar";
+}
diff --git a/ext/tokenizer/tests/PhpToken_extension_errors.phpt b/ext/tokenizer/tests/PhpToken_extension_errors.phpt
new file mode 100644 (file)
index 0000000..89604a9
--- /dev/null
@@ -0,0 +1,30 @@
+--TEST--
+PhpToken extensions that throw during construction
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+class MyPhpToken1 extends PhpToken {
+    public $extra = UNKNOWN;
+}
+
+try {
+    var_dump(MyPhpToken1::getAll("<?php foo"));
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+
+abstract class MyPhpToken2 extends PhpToken {
+}
+
+try {
+    var_dump(MyPhpToken2::getAll("<?php foo"));
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+
+?>
+--EXPECT--
+Undefined constant 'UNKNOWN'
+Cannot instantiate abstract class MyPhpToken2
diff --git a/ext/tokenizer/tests/PhpToken_final_constructor.phpt b/ext/tokenizer/tests/PhpToken_final_constructor.phpt
new file mode 100644 (file)
index 0000000..7f4061d
--- /dev/null
@@ -0,0 +1,15 @@
+--TEST--
+Check that the PhpToken constructor is final
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+class MyPhpToken extends PhpToken {
+    public function __construct() {
+    }
+}
+
+?>
+--EXPECTF--
+Fatal error: Cannot override final method PhpToken::__construct() in %s on line %d
diff --git a/ext/tokenizer/tests/PhpToken_getAll.phpt b/ext/tokenizer/tests/PhpToken_getAll.phpt
new file mode 100644 (file)
index 0000000..604a979
--- /dev/null
@@ -0,0 +1,358 @@
+--TEST--
+PhpToken::getAll() method
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?php
+function foo() {
+    echo "bar";
+}
+PHP;
+var_dump(PhpToken::getAll($code));
+var_dump(PhpToken::getAll($code, TOKEN_PARSE));
+
+?>
+--EXPECT--
+array(15) {
+  [0]=>
+  object(PhpToken)#1 (4) {
+    ["id"]=>
+    int(382)
+    ["text"]=>
+    string(6) "<?php
+"
+    ["line"]=>
+    int(1)
+    ["pos"]=>
+    int(0)
+  }
+  [1]=>
+  object(PhpToken)#2 (4) {
+    ["id"]=>
+    int(342)
+    ["text"]=>
+    string(8) "function"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(6)
+  }
+  [2]=>
+  object(PhpToken)#3 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(14)
+  }
+  [3]=>
+  object(PhpToken)#4 (4) {
+    ["id"]=>
+    int(310)
+    ["text"]=>
+    string(3) "foo"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(15)
+  }
+  [4]=>
+  object(PhpToken)#5 (4) {
+    ["id"]=>
+    int(40)
+    ["text"]=>
+    string(1) "("
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(18)
+  }
+  [5]=>
+  object(PhpToken)#6 (4) {
+    ["id"]=>
+    int(41)
+    ["text"]=>
+    string(1) ")"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(19)
+  }
+  [6]=>
+  object(PhpToken)#7 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(20)
+  }
+  [7]=>
+  object(PhpToken)#8 (4) {
+    ["id"]=>
+    int(123)
+    ["text"]=>
+    string(1) "{"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(21)
+  }
+  [8]=>
+  object(PhpToken)#9 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(5) "
+    "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(22)
+  }
+  [9]=>
+  object(PhpToken)#10 (4) {
+    ["id"]=>
+    int(324)
+    ["text"]=>
+    string(4) "echo"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(27)
+  }
+  [10]=>
+  object(PhpToken)#11 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(31)
+  }
+  [11]=>
+  object(PhpToken)#12 (4) {
+    ["id"]=>
+    int(314)
+    ["text"]=>
+    string(5) ""bar""
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(32)
+  }
+  [12]=>
+  object(PhpToken)#13 (4) {
+    ["id"]=>
+    int(59)
+    ["text"]=>
+    string(1) ";"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(37)
+  }
+  [13]=>
+  object(PhpToken)#14 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) "
+"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(38)
+  }
+  [14]=>
+  object(PhpToken)#15 (4) {
+    ["id"]=>
+    int(125)
+    ["text"]=>
+    string(1) "}"
+    ["line"]=>
+    int(4)
+    ["pos"]=>
+    int(39)
+  }
+}
+array(15) {
+  [0]=>
+  object(PhpToken)#15 (4) {
+    ["id"]=>
+    int(382)
+    ["text"]=>
+    string(6) "<?php
+"
+    ["line"]=>
+    int(1)
+    ["pos"]=>
+    int(0)
+  }
+  [1]=>
+  object(PhpToken)#14 (4) {
+    ["id"]=>
+    int(342)
+    ["text"]=>
+    string(8) "function"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(6)
+  }
+  [2]=>
+  object(PhpToken)#13 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(14)
+  }
+  [3]=>
+  object(PhpToken)#12 (4) {
+    ["id"]=>
+    int(310)
+    ["text"]=>
+    string(3) "foo"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(15)
+  }
+  [4]=>
+  object(PhpToken)#11 (4) {
+    ["id"]=>
+    int(40)
+    ["text"]=>
+    string(1) "("
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(18)
+  }
+  [5]=>
+  object(PhpToken)#10 (4) {
+    ["id"]=>
+    int(41)
+    ["text"]=>
+    string(1) ")"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(19)
+  }
+  [6]=>
+  object(PhpToken)#9 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(20)
+  }
+  [7]=>
+  object(PhpToken)#8 (4) {
+    ["id"]=>
+    int(123)
+    ["text"]=>
+    string(1) "{"
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(21)
+  }
+  [8]=>
+  object(PhpToken)#7 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(5) "
+    "
+    ["line"]=>
+    int(2)
+    ["pos"]=>
+    int(22)
+  }
+  [9]=>
+  object(PhpToken)#6 (4) {
+    ["id"]=>
+    int(324)
+    ["text"]=>
+    string(4) "echo"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(27)
+  }
+  [10]=>
+  object(PhpToken)#5 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) " "
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(31)
+  }
+  [11]=>
+  object(PhpToken)#4 (4) {
+    ["id"]=>
+    int(314)
+    ["text"]=>
+    string(5) ""bar""
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(32)
+  }
+  [12]=>
+  object(PhpToken)#3 (4) {
+    ["id"]=>
+    int(59)
+    ["text"]=>
+    string(1) ";"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(37)
+  }
+  [13]=>
+  object(PhpToken)#2 (4) {
+    ["id"]=>
+    int(385)
+    ["text"]=>
+    string(1) "
+"
+    ["line"]=>
+    int(3)
+    ["pos"]=>
+    int(38)
+  }
+  [14]=>
+  object(PhpToken)#1 (4) {
+    ["id"]=>
+    int(125)
+    ["text"]=>
+    string(1) "}"
+    ["line"]=>
+    int(4)
+    ["pos"]=>
+    int(39)
+  }
+}
diff --git a/ext/tokenizer/tests/PhpToken_methods.phpt b/ext/tokenizer/tests/PhpToken_methods.phpt
new file mode 100644 (file)
index 0000000..9429cea
--- /dev/null
@@ -0,0 +1,119 @@
+--TEST--
+PhpToken instance methods
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?php
+// comment
+/** comment */
+function foo() {
+    echo "bar";
+}
+PHP;
+
+// Token names and ignorability.
+$tokens = PhpToken::getAll($code);
+foreach ($tokens as $i => $token) {
+    printf("[%2d] %-26s %s\n", $i, $token->getTokenName(),
+        $token->isIgnorable() ? "ignorable" : "meaningful");
+}
+
+// is() variations
+
+echo "\nSuccess:\n";
+var_dump($tokens[4]->is(T_FUNCTION));
+var_dump($tokens[4]->is('function'));
+var_dump($tokens[4]->is(['class', T_FUNCTION]));
+var_dump($tokens[4]->is([T_CLASS, 'function']));
+
+echo "\nFailure:\n";
+var_dump($tokens[4]->is(T_CLASS));
+var_dump($tokens[4]->is('class'));
+var_dump($tokens[4]->is(['class', T_TRAIT]));
+var_dump($tokens[4]->is([T_CLASS, 'trait']));
+
+echo "\nError:\n";
+try {
+    $tokens[4]->is(3.141);
+} catch (TypeError $e) {
+    echo $e->getMessage(), "\n";
+}
+try {
+    $tokens[4]->is([3.141]);
+} catch (TypeError $e) {
+    echo $e->getMessage(), "\n";
+}
+
+unset($tokens[4]->id);
+unset($tokens[4]->text);
+try {
+    $tokens[4]->is(T_FUNCTION);
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+try {
+    $tokens[4]->is('function');
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+try {
+    $tokens[4]->is([T_FUNCTION]);
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+try {
+    $tokens[4]->is(['function']);
+} catch (Error $e) {
+    echo $e->getMessage(), "\n";
+}
+
+echo "\nName of unknown token:\n";
+$token = new PhpToken(100000, "foo");
+var_dump($token->getTokenName());
+
+?>
+--EXPECT--
+[ 0] T_OPEN_TAG                 ignorable
+[ 1] T_COMMENT                  ignorable
+[ 2] T_DOC_COMMENT              ignorable
+[ 3] T_WHITESPACE               ignorable
+[ 4] T_FUNCTION                 meaningful
+[ 5] T_WHITESPACE               ignorable
+[ 6] T_STRING                   meaningful
+[ 7] (                          meaningful
+[ 8] )                          meaningful
+[ 9] T_WHITESPACE               ignorable
+[10] {                          meaningful
+[11] T_WHITESPACE               ignorable
+[12] T_ECHO                     meaningful
+[13] T_WHITESPACE               ignorable
+[14] T_CONSTANT_ENCAPSED_STRING meaningful
+[15] ;                          meaningful
+[16] T_WHITESPACE               ignorable
+[17] }                          meaningful
+
+Success:
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+
+Failure:
+bool(false)
+bool(false)
+bool(false)
+bool(false)
+
+Error:
+Kind must be of type int, string or array
+Kind array must have elements of type int or string
+Typed property PhpToken::$id must not be accessed before initialization
+Typed property PhpToken::$text must not be accessed before initialization
+Typed property PhpToken::$id must not be accessed before initialization
+Typed property PhpToken::$text must not be accessed before initialization
+
+Name of unknown token:
+NULL
diff --git a/ext/tokenizer/tests/PhpToken_toString.phpt b/ext/tokenizer/tests/PhpToken_toString.phpt
new file mode 100644 (file)
index 0000000..17dbfa8
--- /dev/null
@@ -0,0 +1,18 @@
+--TEST--
+PhpToken implements __toString()
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$tokens = PhpToken::getAll('<?php echo "Hello ". $what;');
+var_dump(implode($tokens));
+
+var_dump($tokens[0] instanceof Stringable);
+var_dump((string) $tokens[0]);
+var_dump($tokens[0]->__toString());
+
+?>
+--EXPECT--
+string(27) "<?php echo "Hello ". $what;"
+bool(true)
+string(6) "<?php "
+string(6) "<?php "
index 1ac5275ff09a3c5510ea5425a86a0eb98e73bc0b..222c3e96a39287d206c91633b1c5fcf51a8c48f2 100644 (file)
 #include "zend_language_scanner.h"
 #include "zend_language_scanner_defs.h"
 #include <zend_language_parser.h>
+#include "zend_interfaces.h"
 
 #define zendtext   LANG_SCNG(yy_text)
 #define zendleng   LANG_SCNG(yy_leng)
 #define zendcursor LANG_SCNG(yy_cursor)
 #define zendlimit  LANG_SCNG(yy_limit)
 
-#define TOKEN_PARSE                            1
+#define TOKEN_PARSE (1 << 0)
+
+zend_class_entry *php_token_ce;
 
 void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) {
        REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT);
@@ -72,12 +75,237 @@ zend_module_entry tokenizer_module_entry = {
 ZEND_GET_MODULE(tokenizer)
 #endif
 
+/* Fetch the "id" property of a token object, read directly from property slot 0.
+ * If the slot is undefined (for example after unset($token->id)), an Error is thrown
+ * and NULL is returned. Otherwise the dereferenced zval is returned; it is asserted
+ * to hold an integer. */
+static zval *php_token_get_id(zval *obj) {
+       zval *id = OBJ_PROP_NUM(Z_OBJ_P(obj), 0);
+       if (Z_ISUNDEF_P(id)) {
+               zend_throw_error(NULL,
+                       "Typed property PhpToken::$id must not be accessed before initialization");
+               return NULL;
+       }
+
+       ZVAL_DEREF(id);
+       ZEND_ASSERT(Z_TYPE_P(id) == IS_LONG);
+       return id;
+}
+
+/* Fetch the "text" property of a token object, read directly from property slot 1.
+ * If the slot is undefined (for example after unset($token->text)), an Error is thrown
+ * and NULL is returned. Otherwise the string held by the dereferenced zval is returned
+ * without adding a reference; the zval is asserted to hold a string. */
+static zend_string *php_token_get_text(zval *obj) {
+       zval *text_zval = OBJ_PROP_NUM(Z_OBJ_P(obj), 1);
+       if (Z_ISUNDEF_P(text_zval)) {
+               zend_throw_error(NULL,
+                       "Typed property PhpToken::$text must not be accessed before initialization");
+               return NULL;
+       }
+
+       ZVAL_DEREF(text_zval);
+       ZEND_ASSERT(Z_TYPE_P(text_zval) == IS_STRING);
+       return Z_STR_P(text_zval);
+}
+
+static zend_bool tokenize_common(
+               zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class);
+
+/* PhpToken::getAll(source [, flags])
+ * Tokenizes the source string and fills return_value via tokenize_common(). The class
+ * used for the token objects is the called scope, so calling the method on a subclass
+ * produces instances of that subclass. Throws if that class is abstract, if its class
+ * constants cannot be updated, or if tokenize_common() reports failure. */
+PHP_METHOD(PhpToken, getAll)
+{
+       zend_string *source;
+       zend_long flags = 0;
+       zend_class_entry *token_class;
+
+       ZEND_PARSE_PARAMETERS_START(1, 2)
+               Z_PARAM_STR(source)
+               Z_PARAM_OPTIONAL
+               Z_PARAM_LONG(flags)
+       ZEND_PARSE_PARAMETERS_END();
+
+       token_class = zend_get_called_scope(execute_data);
+
+       /* Check construction preconditions in advance, so these are not repeated for each token. */
+       if (token_class->ce_flags & ZEND_ACC_EXPLICIT_ABSTRACT_CLASS) {
+               zend_throw_error(NULL, "Cannot instantiate abstract class %s", ZSTR_VAL(token_class->name));
+               RETURN_THROWS();
+       }
+       if (zend_update_class_constants(token_class) == FAILURE) {
+               RETURN_THROWS();
+       }
+
+       if (!tokenize_common(return_value, source, flags, token_class)) {
+               RETURN_THROWS();
+       }
+}
+
+/* PhpToken::__construct(id, text [, line [, pos]])
+ * Writes the arguments straight into property slots 0-3 (id, text, line, pos).
+ * line and pos are stored as -1 when they are not passed. */
+PHP_METHOD(PhpToken, __construct)
+{
+       zend_long id;
+       zend_string *text;
+       zend_long line = -1;
+       zend_long pos = -1;
+       zend_object *obj = Z_OBJ_P(ZEND_THIS);
+
+       ZEND_PARSE_PARAMETERS_START(2, 4)
+               Z_PARAM_LONG(id)
+               Z_PARAM_STR(text)
+               Z_PARAM_OPTIONAL
+               Z_PARAM_LONG(line)
+               Z_PARAM_LONG(pos)
+       ZEND_PARSE_PARAMETERS_END();
+
+       ZVAL_LONG(OBJ_PROP_NUM(obj, 0), id);
+       /* Release whatever the text slot currently holds before overwriting it. */
+       zval_ptr_dtor(OBJ_PROP_NUM(obj, 1));
+       ZVAL_STR_COPY(OBJ_PROP_NUM(obj, 1), text);
+       ZVAL_LONG(OBJ_PROP_NUM(obj, 2), line);
+       ZVAL_LONG(OBJ_PROP_NUM(obj, 3), pos);
+}
+
+/* PhpToken::is(kind)
+ * - int kind:    true if it equals the token id.
+ * - string kind: true if it equals the token text.
+ * - array kind:  true if any int element equals the id or any string element equals
+ *                the text; false if no element matches.
+ * Any other kind type, or any other array element type, raises a TypeError. An Error is
+ * thrown if the property needed for a comparison is uninitialized. */
+PHP_METHOD(PhpToken, is)
+{
+       zval *kind;
+
+       ZEND_PARSE_PARAMETERS_START(1, 1)
+               Z_PARAM_ZVAL(kind)
+       ZEND_PARSE_PARAMETERS_END();
+
+       if (Z_TYPE_P(kind) == IS_LONG) {
+               zval *id_zval = php_token_get_id(ZEND_THIS);
+               if (!id_zval) {
+                       RETURN_THROWS();
+               }
+
+               RETURN_BOOL(Z_LVAL_P(id_zval) == Z_LVAL_P(kind));
+       } else if (Z_TYPE_P(kind) == IS_STRING) {
+               zend_string *text = php_token_get_text(ZEND_THIS);
+               if (!text) {
+                       RETURN_THROWS();
+               }
+
+               RETURN_BOOL(zend_string_equals(text, Z_STR_P(kind)));
+       } else if (Z_TYPE_P(kind) == IS_ARRAY) {
+               /* id and text are fetched lazily: each at most once, and only when an
+                * element of the corresponding type is actually encountered. */
+               zval *id_zval = NULL, *entry;
+               zend_string *text = NULL;
+               ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(kind), entry) {
+                       ZVAL_DEREF(entry);
+                       if (Z_TYPE_P(entry) == IS_LONG) {
+                               if (!id_zval) {
+                                       id_zval = php_token_get_id(ZEND_THIS);
+                                       if (!id_zval) {
+                                               RETURN_THROWS();
+                                       }
+                               }
+                               if (Z_LVAL_P(id_zval) == Z_LVAL_P(entry)) {
+                                       RETURN_TRUE;
+                               }
+                       } else if (Z_TYPE_P(entry) == IS_STRING) {
+                               if (!text) {
+                                       text = php_token_get_text(ZEND_THIS);
+                                       if (!text) {
+                                               RETURN_THROWS();
+                                       }
+                               }
+                               if (zend_string_equals(text, Z_STR_P(entry))) {
+                                       RETURN_TRUE;
+                               }
+                       } else {
+                               zend_type_error("Kind array must have elements of type int or string");
+                               RETURN_THROWS();
+                       }
+               } ZEND_HASH_FOREACH_END();
+               RETURN_FALSE;
+       } else {
+               zend_type_error("Kind must be of type int, string or array");
+               RETURN_THROWS();
+       }
+}
+
+/* PhpToken::isIgnorable()
+ * Returns true when the token id is T_WHITESPACE, T_COMMENT, T_DOC_COMMENT or T_OPEN_TAG,
+ * false for every other id. Throws if the id property is uninitialized. */
+PHP_METHOD(PhpToken, isIgnorable)
+{
+       ZEND_PARSE_PARAMETERS_NONE();
+
+       zval *id_zval = php_token_get_id(ZEND_THIS);
+       if (!id_zval) {
+               RETURN_THROWS();
+       }
+
+       zend_long id = Z_LVAL_P(id_zval);
+       RETURN_BOOL(id == T_WHITESPACE || id == T_COMMENT || id == T_DOC_COMMENT || id == T_OPEN_TAG);
+}
+
+/* PhpToken::getTokenName()
+ * Returns the one-character string for ids in the range 0..255, the name reported by
+ * get_token_type_name() for larger ids, and NULL when the id is negative or when
+ * get_token_type_name() does not know it. Throws if the id property is uninitialized. */
+PHP_METHOD(PhpToken, getTokenName)
+{
+       ZEND_PARSE_PARAMETERS_NONE();
+
+       zval *id_zval = php_token_get_id(ZEND_THIS);
+       if (!id_zval) {
+               RETURN_THROWS();
+       }
+
+       zend_long id = Z_LVAL_P(id_zval);
+       if (id < 0) {
+               /* The id is user-controlled (constructor argument / public property). A negative
+                * value must never be used as an index into the one-character string table. */
+               RETURN_NULL();
+       } else if (id < 256) {
+               RETURN_INTERNED_STR(ZSTR_CHAR((zend_uchar) id));
+       } else {
+               const char *token_name = get_token_type_name(id);
+               if (!token_name) {
+                       RETURN_NULL();
+               }
+
+               RETURN_STRING(token_name);
+       }
+}
+
+/* PhpToken::__toString()
+ * Returns the token text. Throws if the text property is uninitialized. */
+PHP_METHOD(PhpToken, __toString)
+{
+       ZEND_PARSE_PARAMETERS_NONE();
+
+       zend_string *text = php_token_get_text(ZEND_THIS);
+       if (!text) {
+               RETURN_THROWS();
+       }
+
+       RETURN_STR_COPY(text);
+}
+
+static const zend_function_entry php_token_methods[] = {
+       PHP_ME(PhpToken, getAll, arginfo_class_PhpToken_getAll, ZEND_ACC_PUBLIC|ZEND_ACC_STATIC)
+       PHP_ME(PhpToken, __construct, arginfo_class_PhpToken___construct, ZEND_ACC_PUBLIC|ZEND_ACC_FINAL)
+       PHP_ME(PhpToken, is, arginfo_class_PhpToken_is, ZEND_ACC_PUBLIC)
+       PHP_ME(PhpToken, isIgnorable, arginfo_class_PhpToken_isIgnorable, ZEND_ACC_PUBLIC)
+       PHP_ME(PhpToken, getTokenName, arginfo_class_PhpToken_getTokenName, ZEND_ACC_PUBLIC)
+       PHP_ME(PhpToken, __toString, arginfo_class_PhpToken___toString, ZEND_ACC_PUBLIC)
+       PHP_FE_END
+};
+
 /* {{{ PHP_MINIT_FUNCTION
  */
 PHP_MINIT_FUNCTION(tokenizer)
 {
+       zend_class_entry ce;
+       zend_string *name;
+       zval default_val;
+       /* Every property below is declared with an undefined default, i.e. uninitialized. */
+       ZVAL_UNDEF(&default_val);
+
        tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU);
        tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU);
+
+       INIT_CLASS_ENTRY(ce, "PhpToken", php_token_methods);
+       php_token_ce = zend_register_internal_class(&ce);
+       zend_class_implements(php_token_ce, 1, zend_ce_stringable);
+
+       /* Declaration order matters: the rest of this file reads and writes these properties
+        * by slot index through OBJ_PROP_NUM() (id = 0, text = 1, line = 2, pos = 3). */
+       name = zend_string_init("id", sizeof("id") - 1, 1);
+       zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+               (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+       zend_string_release(name);
+
+       name = zend_string_init("text", sizeof("text") - 1, 1);
+       zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+               (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_STRING));
+       zend_string_release(name);
+
+       name = zend_string_init("line", sizeof("line") - 1, 1);
+       zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+               (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+       zend_string_release(name);
+
+       name = zend_string_init("pos", sizeof("pos") - 1, 1);
+       zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+               (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+       zend_string_release(name);
+
        return SUCCESS;
 }
 /* }}} */
@@ -92,29 +320,56 @@ PHP_MINFO_FUNCTION(tokenizer)
 }
 /* }}} */
 
-static void add_token(zval *return_value, int token_type,
-               unsigned char *text, size_t leng, int lineno) {
-       if (token_type >= 256) {
-               zval keyword;
-               array_init(&keyword);
-               add_next_index_long(&keyword, token_type);
-               if (leng == 1) {
-                       add_next_index_str(&keyword, ZSTR_CHAR(text[0]));
-               } else {
-                       add_next_index_stringl(&keyword, (char *) text, leng);
+/* Build the zend_string for a token's text.
+ * A one-byte text uses the shared ZSTR_CHAR() string. Otherwise, when an interned_strings
+ * table is supplied, identical texts are deduplicated through it: a hit returns another
+ * reference to the cached string, a miss creates the string and records it in the table.
+ * Without a table, a new string is created on every call. */
+static zend_string *make_str(unsigned char *text, size_t leng, HashTable *interned_strings) {
+       if (leng == 1) {
+               return ZSTR_CHAR(text[0]);
+       } else if (interned_strings) {
+               zend_string *interned_str = zend_hash_str_find_ptr(interned_strings, (char *) text, leng);
+               if (interned_str) {
+                       return zend_string_copy(interned_str);
                }
-               add_next_index_long(&keyword, lineno);
-               add_next_index_zval(return_value, &keyword);
+               interned_str = zend_string_init((char *) text, leng, 0);
+               zend_hash_add_new_ptr(interned_strings, interned_str, interned_str);
+               return interned_str;
        } else {
-               if (leng == 1) {
-                       add_next_index_str(return_value, ZSTR_CHAR(text[0]));
-               } else {
-                       add_next_index_stringl(return_value, (char *) text, leng);
+               return zend_string_init((char *) text, leng, 0);
+       }
+}
+
+/* Append one token to the return_value array.
+ * With a token_class, the token is an object of that class whose property slots 0-3 are
+ * filled with the token id, its text, the line number and the offset of the text relative
+ * to LANG_SCNG(yy_start). Without a token_class, the token is an [id, text, line] array
+ * for ids >= 256 and a plain string for ids below 256. */
+static void add_token(
+               zval *return_value, int token_type, unsigned char *text, size_t leng, int lineno,
+               zend_class_entry *token_class, HashTable *interned_strings) {
+       zval token;
+       if (token_class) {
+               zend_object *obj = zend_objects_new(token_class);
+               ZVAL_OBJ(&token, obj);
+               ZVAL_LONG(OBJ_PROP_NUM(obj, 0), token_type);
+               ZVAL_STR(OBJ_PROP_NUM(obj, 1), make_str(text, leng, interned_strings));
+               ZVAL_LONG(OBJ_PROP_NUM(obj, 2), lineno);
+               ZVAL_LONG(OBJ_PROP_NUM(obj, 3), text - LANG_SCNG(yy_start));
+
+               /* If the class is extended with additional properties, initialize them as well. */
+               if (UNEXPECTED(token_class->default_properties_count > 4)) {
+                       zval *dst = OBJ_PROP_NUM(obj, 4);
+                       zval *src = &token_class->default_properties_table[4];
+                       zval *end = token_class->default_properties_table
+                               + token_class->default_properties_count;
+                       for (; src < end; src++, dst++) {
+                               ZVAL_COPY_PROP(dst, src);
+                       }
                }
+       } else if (token_type >= 256) {
+               array_init(&token);
+               add_next_index_long(&token, token_type);
+               add_next_index_str(&token, make_str(text, leng, interned_strings));
+               add_next_index_long(&token, lineno);
+       } else {
+               ZVAL_STR(&token, make_str(text, leng, interned_strings));
        }
+       zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &token);
 }
 
-static zend_bool tokenize(zval *return_value, zend_string *source)
+static zend_bool tokenize(zval *return_value, zend_string *source, zend_class_entry *token_class)
 {
        zval source_zval;
        zend_lex_state original_lex_state;
@@ -122,6 +377,7 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
        int token_type;
        int token_line = 1;
        int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */
+       HashTable interned_strings;
 
        ZVAL_STR_COPY(&source_zval, source);
        zend_save_lexical_state(&original_lex_state);
@@ -132,10 +388,13 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
        }
 
        LANG_SCNG(yy_state) = yycINITIAL;
+       zend_hash_init(&interned_strings, 0, NULL, NULL, 0);
        array_init(return_value);
 
        while ((token_type = lex_scan(&token, NULL))) {
-               add_token(return_value, token_type, zendtext, zendleng, token_line);
+               add_token(
+                       return_value, token_type, zendtext, zendleng, token_line,
+                       token_class, &interned_strings);
 
                if (Z_TYPE(token) != IS_UNDEF) {
                        zval_ptr_dtor_nogc(&token);
@@ -150,8 +409,9 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
                        ) {
                                /* fetch the rest into a T_INLINE_HTML */
                                if (zendcursor != zendlimit) {
-                                       add_token(return_value, T_INLINE_HTML,
-                                               zendcursor, zendlimit - zendcursor, token_line);
+                                       add_token(
+                                               return_value, T_INLINE_HTML, zendcursor, zendlimit - zendcursor,
+                                               token_line, token_class, &interned_strings);
                                }
                                break;
                        }
@@ -169,46 +429,56 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
 
        zval_ptr_dtor_str(&source_zval);
        zend_restore_lexical_state(&original_lex_state);
+       zend_hash_destroy(&interned_strings);
 
        return 1;
 }
 
+/* Context handed to on_event() through LANG_SCNG(on_event_context). */
+struct event_context {
+       zval *tokens; /* array zval that collects the tokens */
+       zend_class_entry *token_class; /* class for token objects; NULL selects array/string tokens */
+};
+
 void on_event(zend_php_scanner_event event, int token, int line, void *context)
 {
-       zval *token_stream = (zval *) context;
+       struct event_context *ctx = context;
        HashTable *tokens_ht;
        zval *token_zv;
 
        switch (event) {
                case ON_TOKEN:
-                       {
-                               if (token == END) break;
-                               /* Special cases */
-                               if (token == ';' && LANG_SCNG(yy_leng) > 1) { /* ?> or ?>\n or ?>\r\n */
-                                       token = T_CLOSE_TAG;
-                               } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("<?=") - 1) {
-                                       token = T_OPEN_TAG_WITH_ECHO;
-                               }
-                               add_token(token_stream, token, LANG_SCNG(yy_text), LANG_SCNG(yy_leng), line);
+                       if (token == END) break;
+                       /* Special cases */
+                       if (token == ';' && LANG_SCNG(yy_leng) > 1) { /* ?> or ?>\n or ?>\r\n */
+                               token = T_CLOSE_TAG;
+                       } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("<?=") - 1) {
+                               token = T_OPEN_TAG_WITH_ECHO;
                        }
+                       add_token(ctx->tokens, token,
+                               LANG_SCNG(yy_text), LANG_SCNG(yy_leng), line, ctx->token_class, NULL);
                        break;
                case ON_FEEDBACK:
-                       tokens_ht = Z_ARRVAL_P(token_stream);
+                       tokens_ht = Z_ARRVAL_P(ctx->tokens);
                        token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1);
-                       if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) {
+                       ZEND_ASSERT(token_zv);
+                       if (Z_TYPE_P(token_zv) == IS_ARRAY) {
                                ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token);
+                       } else {
+                               zend_update_property_long(php_token_ce, token_zv, "type", sizeof("type")-1, token);
                        }
                        break;
                case ON_STOP:
                        if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) {
-                               add_token(token_stream, T_INLINE_HTML, LANG_SCNG(yy_cursor),
-                                       LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno));
+                               add_token(ctx->tokens, T_INLINE_HTML, LANG_SCNG(yy_cursor),
+                                       LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno),
+                                       ctx->token_class, NULL);
                        }
                        break;
        }
 }
 
-static zend_bool tokenize_parse(zval *return_value, zend_string *source)
+static zend_bool tokenize_parse(
+               zval *return_value, zend_string *source, zend_class_entry *token_class)
 {
        zval source_zval;
        zend_lex_state original_lex_state;
@@ -222,14 +492,18 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source)
        zend_save_lexical_state(&original_lex_state);
 
        if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) {
+               struct event_context ctx;
                zval token_stream;
                array_init(&token_stream);
 
+               ctx.tokens = &token_stream;
+               ctx.token_class = token_class;
+
                CG(ast) = NULL;
                CG(ast_arena) = zend_arena_create(1024 * 32);
                LANG_SCNG(yy_state) = yycINITIAL;
                LANG_SCNG(on_event) = on_event;
-               LANG_SCNG(on_event_context) = &token_stream;
+               LANG_SCNG(on_event_context) = &ctx;
 
                if((success = (zendparse() == SUCCESS))) {
                        ZVAL_COPY_VALUE(return_value, &token_stream);
@@ -250,6 +524,19 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source)
        return success;
 }
 
+static zend_bool tokenize_common(
+               zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class)
+{
+       if (flags & TOKEN_PARSE) {
+               return tokenize_parse(return_value, source, token_class);
+       } else {
+               int success = tokenize(return_value, source, token_class);
+               /* Normal token_get_all() should not throw. */
+               zend_clear_exception();
+               return success;
+       }
+}
+
 /* }}} */
 
 /* {{{ proto array token_get_all(string source [, int flags])
@@ -258,7 +545,6 @@ PHP_FUNCTION(token_get_all)
 {
        zend_string *source;
        zend_long flags = 0;
-       zend_bool success;
 
        ZEND_PARSE_PARAMETERS_START(1, 2)
                Z_PARAM_STR(source)
@@ -266,15 +552,7 @@ PHP_FUNCTION(token_get_all)
                Z_PARAM_LONG(flags)
        ZEND_PARSE_PARAMETERS_END();
 
-       if (flags & TOKEN_PARSE) {
-               success = tokenize_parse(return_value, source);
-       } else {
-               success = tokenize(return_value, source);
-               /* Normal token_get_all() should not throw. */
-               zend_clear_exception();
-       }
-
-       if (!success) {
+       if (!tokenize_common(return_value, source, flags, /* token_class */ NULL)) {
                RETURN_THROWS();
        }
 }
@@ -290,6 +568,10 @@ PHP_FUNCTION(token_name)
                Z_PARAM_LONG(type)
        ZEND_PARSE_PARAMETERS_END();
 
-       RETVAL_STRING(get_token_type_name(type));
+       const char *token_name = get_token_type_name(type);
+       if (!token_name) {
+               token_name = "UNKNOWN";
+       }
+       RETURN_STRING(token_name);
 }
 /* }}} */
index 63a6c2e72ca76169b0a4d76fa5af6fd048af5c65..801c1c8504ff33e8ff752de18707633c02f02dcd 100644 (file)
@@ -3,3 +3,19 @@
 function token_get_all(string $source, int $flags = 0): array {}
 
 function token_name(int $token): string {}
+
+class PhpToken implements Stringable {
+    /** @return static[] */
+    public static function getAll(string $code, int $flags = 0): array;
+
+    public final function __construct(int $id, string $text, int $line = -1, int $pos = -1);
+
+    /** @param int|string|array $kind */
+    public function is($kind): bool;
+
+    public function isIgnorable(): bool;
+
+    public function getTokenName(): ?string;
+
+    public function __toString(): string;
+}
index d777535a48d7e2ff5771cfdbb9421241401c65c1..d927c8d0e610e703657f59bf41091d09404890ee 100644 (file)
@@ -8,3 +8,28 @@ ZEND_END_ARG_INFO()
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_token_name, 0, 1, IS_STRING, 0)
        ZEND_ARG_TYPE_INFO(0, token, IS_LONG, 0)
 ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getAll, 0, 1, IS_ARRAY, 0)
+       ZEND_ARG_TYPE_INFO(0, code, IS_STRING, 0)
+       ZEND_ARG_TYPE_INFO(0, flags, IS_LONG, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_INFO_EX(arginfo_class_PhpToken___construct, 0, 0, 2)
+       ZEND_ARG_TYPE_INFO(0, id, IS_LONG, 0)
+       ZEND_ARG_TYPE_INFO(0, text, IS_STRING, 0)
+       ZEND_ARG_TYPE_INFO(0, line, IS_LONG, 0)
+       ZEND_ARG_TYPE_INFO(0, pos, IS_LONG, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_is, 0, 1, _IS_BOOL, 0)
+       ZEND_ARG_INFO(0, kind)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_isIgnorable, 0, 0, _IS_BOOL, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getTokenName, 0, 0, IS_STRING, 1)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken___toString, 0, 0, IS_STRING, 0)
+ZEND_END_ARG_INFO()
index 7e580dd84456ce52713630c9e001434a98c75a24..3ddf89521a8cd1b623a41abeefe91c34893ec43c 100644 (file)
@@ -306,6 +306,6 @@ char *get_token_type_name(int token_type)
                case T_BAD_CHARACTER: return "T_BAD_CHARACTER";
 
        }
-       return "UNKNOWN";
+       return NULL;
 }
 
index 4d5e97ddde74b39b3e19ddf684068e698349a1bf..1dbe77d2e7ebc16efcb392b103d11d2c55a29f75 100755 (executable)
@@ -71,7 +71,7 @@ awk '
 
 echo '
        }
-       return "UNKNOWN";
+       return NULL;
 }
 ' >> $outfile