From 3f76f9416ff82c10174dc382285b4b91789e278b Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sat, 14 Sep 2019 12:08:20 +0200
Subject: [PATCH] Fix double-free on invalid large octal with separators

To clean up the mess here a bit, check for invalid octal digits
with an explicit loop instead of mixing this into the string to
number conversion.

Also clean up some type usage.
---
 Zend/zend_language_scanner.l                  | 77 +++++++------------
 .../invalid_large_octal_with_underscores.phpt | 31 ++++++++
 2 files changed, 60 insertions(+), 48 deletions(-)
 create mode 100644 ext/tokenizer/tests/invalid_large_octal_with_underscores.phpt

diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l
index 84ed669dfb..86b8bb1a70 100644
--- a/Zend/zend_language_scanner.l
+++ b/Zend/zend_language_scanner.l
@@ -120,7 +120,7 @@ do {																			\
 
 BEGIN_EXTERN_C()
 
-static void strip_underscores(char *str, int *len)
+static void strip_underscores(char *str, size_t *len)
 {
 	char *src = str, *dest = str;
 	while (*src != '\0') {
@@ -1771,8 +1771,9 @@ NEWLINE ("\r"|"\n"|"\r\n")
 
 <ST_IN_SCRIPTING>{BNUM} {
 	/* The +/- 2 skips "0b" */
-	int len = yyleng - 2, contains_underscores;
+	size_t len = yyleng - 2;
 	char *end, *bin = yytext + 2;
+	zend_bool contains_underscores;
 
 	/* Skip any leading 0s */
 	while (len > 0 && (*bin == '0' || *bin == '_')) {
@@ -1811,10 +1812,25 @@ NEWLINE ("\r"|"\n"|"\r\n")
 }
 
 <ST_IN_SCRIPTING>{LNUM} {
-	int len = yyleng, contains_underscores;
+	size_t len = yyleng;
 	char *end, *lnum = yytext;
-
-	contains_underscores = (memchr(lnum, '_', len) != NULL);
+	zend_bool is_octal = lnum[0] == '0';
+	zend_bool contains_underscores = (memchr(lnum, '_', len) != NULL);
+
+	/* Digits 8 and 9 are illegal in octal literals. */
+	if (is_octal) {
+		size_t i;
+		for (i = 0; i < len; i++) {
+			if (lnum[i] == '8' || lnum[i] == '9') {
+				zend_throw_exception(zend_ce_parse_error, "Invalid numeric literal", 0);
+				ZVAL_UNDEF(zendlval);
+				if (PARSER_MODE()) {
+					RETURN_TOKEN(T_ERROR);
+				}
+				RETURN_TOKEN_WITH_VAL(T_LNUMBER);
+			}
+		}
+	}
 
 	if (contains_underscores) {
 		lnum = estrndup(lnum, len);
@@ -1824,21 +1840,8 @@ NEWLINE ("\r"|"\n"|"\r\n")
 	if (len < MAX_LENGTH_OF_LONG - 1) { /* Won't overflow */
 		errno = 0;
 		/* base must be passed explicitly for correct parse error on Windows */
-		ZVAL_LONG(zendlval, ZEND_STRTOL(lnum, &end, lnum[0] == '0' ? 8 : 10));
-		/* This isn't an assert, we need to ensure 019 isn't valid octal
-		 * Because the lexing itself doesn't do that for us
-		 */
-		if (end != lnum + len) {
-			zend_throw_exception(zend_ce_parse_error, "Invalid numeric literal", 0);
-			ZVAL_UNDEF(zendlval);
-			if (contains_underscores) {
-				efree(lnum);
-			}
-			if (PARSER_MODE()) {
-				RETURN_TOKEN(T_ERROR);
-			}
-			RETURN_TOKEN_WITH_VAL(T_LNUMBER);
-		}
+		ZVAL_LONG(zendlval, ZEND_STRTOL(lnum, &end, is_octal ? 8 : 10));
+		ZEND_ASSERT(end == lnum + len);
 	} else {
 		errno = 0;
 		ZVAL_LONG(zendlval, ZEND_STRTOL(lnum, &end, 0));
@@ -1849,35 +1852,13 @@ NEWLINE ("\r"|"\n"|"\r\n")
 			} else {
 				ZVAL_DOUBLE(zendlval, zend_strtod(lnum, (const char **)&end));
 			}
-			/* Also not an assert for the same reason */
-			if (end != lnum + len) {
-				zend_throw_exception(zend_ce_parse_error,
-					"Invalid numeric literal", 0);
-				ZVAL_UNDEF(zendlval);
-				if (contains_underscores) {
-					efree(lnum);
-				}
-				if (PARSER_MODE()) {
-					RETURN_TOKEN(T_ERROR);
-				}
-			}
+			ZEND_ASSERT(end == lnum + len);
 			if (contains_underscores) {
 				efree(lnum);
 			}
 			RETURN_TOKEN_WITH_VAL(T_DNUMBER);
 		}
-		/* Also not an assert for the same reason */
-		if (end != lnum + len) {
-			zend_throw_exception(zend_ce_parse_error, "Invalid numeric literal", 0);
-			ZVAL_UNDEF(zendlval);
-			if (contains_underscores) {
-				efree(lnum);
-			}
-			if (PARSER_MODE()) {
-				RETURN_TOKEN(T_ERROR);
-			}
-			RETURN_TOKEN_WITH_VAL(T_DNUMBER);
-		}
+		ZEND_ASSERT(end == lnum + len);
 	}
 	ZEND_ASSERT(!errno);
 	if (contains_underscores) {
@@ -1888,8 +1869,9 @@ NEWLINE ("\r"|"\n"|"\r\n")
 
 <ST_IN_SCRIPTING>{HNUM} {
 	/* The +/- 2 skips "0x" */
-	int len = yyleng - 2, contains_underscores;
+	size_t len = yyleng - 2;
 	char *end, *hex = yytext + 2;
+	zend_bool contains_underscores;
 
 	/* Skip any leading 0s */
 	while (len > 0 && (*hex == '0' || *hex == '_')) {
@@ -1954,10 +1936,9 @@ string:
 
 <ST_IN_SCRIPTING>{DNUM}|{EXPONENT_DNUM} {
 	const char *end;
-	int len = yyleng, contains_underscores;
+	size_t len = yyleng;
 	char *dnum = yytext;
-
-	contains_underscores = (memchr(dnum, '_', len) != NULL);
+	zend_bool contains_underscores = (memchr(dnum, '_', len) != NULL);
 
 	if (contains_underscores) {
 		dnum = estrndup(dnum, len);
diff --git a/ext/tokenizer/tests/invalid_large_octal_with_underscores.phpt b/ext/tokenizer/tests/invalid_large_octal_with_underscores.phpt
new file mode 100644
index 0000000000..b6d43c1ff1
--- /dev/null
+++ b/ext/tokenizer/tests/invalid_large_octal_with_underscores.phpt
@@ -0,0 +1,31 @@
+--TEST--
+Large invalid octal number with underscores
+--FILE--
+<?php
+
+var_dump(token_get_all("<?php 0_10000000000000000000009;"));
+
+?>
+--EXPECTF--
+array(3) {
+  [0]=>
+  array(3) {
+    [0]=>
+    int(%d)
+    [1]=>
+    string(6) "<?php "
+    [2]=>
+    int(1)
+  }
+  [1]=>
+  array(3) {
+    [0]=>
+    int(%d)
+    [1]=>
+    string(25) "0_10000000000000000000009"
+    [2]=>
+    int(1)
+  }
+  [2]=>
+  string(1) ";"
+}
-- 
2.40.0