granicus.if.org Git - php/commitdiff
- Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number
author Gustavo André dos Santos Lopes <cataphract@php.net>
Wed, 27 Oct 2010 18:13:25 +0000 (18:13 +0000)
committer Gustavo André dos Santos Lopes <cataphract@php.net>
Wed, 27 Oct 2010 18:13:25 +0000 (18:13 +0000)
  of reported malformed sequences). (Gustavo)
#Made a public interface for get_next_char/utf-8 in trunk to use in utf8_decode.
#In PHP 5.3, trunk's get_next_char was copied to xml.c because 5.3's
#get_next_char is different and is not prepared to recover appropriately from
#errors.

ext/standard/html.c
ext/standard/html.h
ext/xml/tests/bug49687.phpt [new file with mode: 0644]
ext/xml/xml.c

index 354e18bfec14a863fc4acb2bd8aa0706e126a74d..de763cf72a6945f47beb84cbbef5234c8929062c 100644 (file)
@@ -92,9 +92,9 @@ ZEND_EXTERN_MODULE_GLOBALS(mbstring)
 
 /* {{{ get_next_char
  */
-static unsigned int get_next_char(
+static inline unsigned int get_next_char(
                enum entity_charset charset,
-               unsigned char *str,
+               const unsigned char *str,
                size_t str_len,
                size_t *cursor,
                int *status)
@@ -352,6 +352,18 @@ static unsigned int get_next_char(
 }
 /* }}} */
 
+/* {{{ php_next_utf8_char
+ * Public (PHPAPI) wrapper around the file-local static get_next_char(), with the charset argument fixed to cs_utf_8 */
+ PHPAPI unsigned int php_next_utf8_char(
+               const unsigned char *str,   /* input bytes; forwarded unchanged */
+               size_t str_len,             /* forwarded unchanged as get_next_char's str_len */
+               size_t *cursor,             /* forwarded unchanged; NOTE(review): presumably an in/out offset advanced by get_next_char - confirm */
+               int *status)                /* forwarded unchanged; xml_utf8_decode compares it against FAILURE after the call */
+{
+       return get_next_char(cs_utf_8, str, str_len, cursor, status); /* result is returned to the caller as-is */
+}
+/* }}} */
+
 /* {{{ entity_charset determine_charset
  * returns the charset identifier based on current locale or a hint.
  * defaults to UTF-8 */
index 4915e171cb927e311aa9d13375e9920308f520bf..8d9efc406e6294523311c6f484b0734ee1a3ed96 100644 (file)
@@ -57,5 +57,6 @@ PHP_FUNCTION(get_html_translation_table);
 PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
 PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC);
 PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
+PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, int *status);
 
 #endif /* HTML_H */
diff --git a/ext/xml/tests/bug49687.phpt b/ext/xml/tests/bug49687.phpt
new file mode 100644 (file)
index 0000000..3ff19ce
--- /dev/null
@@ -0,0 +1,24 @@
+--TEST--\r
+Bug #49687 Several utf8_decode deficiencies and vulnerabilities\r
+--SKIPIF--\r
+<?php\r
+require_once("skipif.inc");\r
+if (!extension_loaded('xml')) die ("skip xml extension not available");\r
+?>\r
+--FILE--\r
+<?php\r
+\r
+$tests = array(\r
+    "\x41\xC2\x3E\x42",\r
+    "\xE3\x80\x22",\r
+    "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98",\r
+);\r
+foreach ($tests as $t) {\r
+    echo bin2hex(utf8_decode($t)), "\n";\r
+}\r
+echo "Done.\n";\r
+--EXPECT--\r
+413f3e42\r
+3f22\r
+413f3f423f433f3f\r
+Done.\r
index 72729d6de5c9540951fd6c851f60f2343a363d28..54e03d5e63eed94756ef3734984049c404c7f22f 100644 (file)
@@ -32,6 +32,7 @@
 #include "zend_variables.h"
 #include "ext/standard/php_string.h"
 #include "ext/standard/info.h"
+#include "ext/standard/html.h"
 
 #if HAVE_XML
 
@@ -662,7 +663,7 @@ PHPAPI char *xml_utf8_encode(const char *s, int len, int *newlen, const XML_Char
 /* {{{ xml_utf8_decode */
 PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding)
 {
-       int pos = len;
+       size_t pos = 0;
        char *newbuf = emalloc(len + 1);
        unsigned int c;
        char (*decoder)(unsigned short) = NULL;
@@ -681,36 +682,15 @@ PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_
                newbuf[*newlen] = '\0';
                return newbuf;
        }
-       while (pos > 0) {
-               c = (unsigned char)(*s);
-               if (c >= 0xf0) { /* four bytes encoded, 21 bits */
-                       if(pos-4 >= 0) {
-                               c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
-                       } else {
-                               c = '?';        
-                       }
-                       s += 4;
-                       pos -= 4;
-               } else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
-                       if(pos-3 >= 0) {
-                               c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
-                       } else {
-                               c = '?';
-                       }
-                       s += 3;
-                       pos -= 3;
-               } else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
-                       if(pos-2 >= 0) {
-                               c = ((s[0]&63)<<6) | (s[1]&63);
-                       } else {
-                               c = '?';
-                       }
-                       s += 2;
-                       pos -= 2;
-               } else {
-                       s++;
-                       pos--;
+
+       while (pos < (size_t)len) {
+               int status = FAILURE;
+               c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
+
+               if (status == FAILURE || c > 0xFFU) {
+                       c = '?';
                }
+
                newbuf[*newlen] = decoder ? decoder(c) : c;
                ++*newlen;
        }