granicus.if.org Git - php/commitdiff
- Fixed get_next_char(), used by htmlentities/htmlspecialchars, accepting
author Gustavo André dos Santos Lopes <cataphract@php.net>
Thu, 14 Oct 2010 19:14:06 +0000 (19:14 +0000)
committer Gustavo André dos Santos Lopes <cataphract@php.net>
Thu, 14 Oct 2010 19:14:06 +0000 (19:14 +0000)
  certain ill-formed UTF-8 sequences.

ext/standard/html.c
ext/standard/tests/strings/htmlentities-utf-2.phpt
ext/standard/tests/strings/htmlentities-utf-3.phpt [new file with mode: 0644]
ext/standard/tests/strings/htmlentities-utf.phpt

index d32246d5130b3fa7fccf365379983298dc8ee802..5d683e237bb8226c99203663607cfca0026540f9 100644 (file)
@@ -129,7 +129,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
                                        MB_WRITE(c);
                                        this_char = c;
                                        pos++;
-                               } else if (c < 0xc0) {
+                               } else if (c < 0xc2) {
                                        MB_FAILURE(pos);
                                } else if (c < 0xe0) {
                                        CHECK_LEN(pos, 2);
@@ -161,7 +161,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
                                        MB_WRITE((unsigned char)str[pos + 1]);
                                        MB_WRITE((unsigned char)str[pos + 2]);
                                        pos += 3;
-                               } else if (c < 0xf8) {
+                               } else if (c < 0xf5) {
                                        CHECK_LEN(pos, 4);
                                        if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
                                                MB_FAILURE(pos);
@@ -173,7 +173,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
                                                MB_FAILURE(pos);
                                        }
                                        this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
-                                       if (this_char < 0x10000) {
+                                       if (this_char < 0x10000 || this_char > 0x10FFFF) {
                                                MB_FAILURE(pos);
                                        }
                                        MB_WRITE((unsigned char)c);
index c5f4ac4ea6dbf4705cf9984946fff83a4f8c739c..d515dc0ff1f133d97eab3825b657e6d7759c0ad7 100755 (executable)
@@ -50,8 +50,8 @@ foreach($strings as $string) {
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(2) "79"
 %unicode|string%(2) "79"
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
+%unicode|string%(0) ""
+%unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
diff --git a/ext/standard/tests/strings/htmlentities-utf-3.phpt b/ext/standard/tests/strings/htmlentities-utf-3.phpt
new file mode 100644 (file)
index 0000000..c28917b
--- /dev/null
@@ -0,0 +1,83 @@
+--TEST--\r
+Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences\r
+--FILE--\r
+<?php\r
+\r
+/* conformance to Unicode 5.2, section 3.9, D92 */\r
+\r
+$val_ranges = array( /* one row per lead-byte range; each inner array is the inclusive (min, max) allowed for the byte at that position */\r
+       array(array(0x00, 0x7F)), /* U+0000..U+007F */\r
+       array(array(0xC2, 0xDF), array(0x80, 0xBF)), /* U+0080..U+07FF */\r
+       array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)), /* U+0800..U+0FFF; second byte starts at 0xA0 */\r
+       array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)), /* U+1000..U+CFFF */\r
+       array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)), /* U+D000..U+D7FF; second byte stops at 0x9F */\r
+       array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)), /* U+E000..U+FFFF */\r
+       array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), /* U+10000..U+3FFFF; second byte starts at 0x90 */\r
+       array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)), /* U+40000..U+FFFFF */\r
+       array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)), /* U+100000..U+10FFFF; second byte stops at 0x8F */\r
+);\r
+\r
+function is_valid($seq) { /* Reference check: returns true only if the byte string $seq matches exactly one row of $val_ranges, byte for byte. */\r
+       global $val_ranges;\r
+       $b = ord($seq[0]); /* the lead byte selects which row applies; $seq is expected to be non-empty */\r
+       foreach ($val_ranges as $l) {\r
+               if ($b >= $l[0][0] && $b <= $l[0][1]) {\r
+                       if (count($l) != strlen($seq)) { /* the row's entry count is the required sequence length */\r
+                               return false;\r
+                       }\r
+                       for ($n = 1; $n < strlen($seq); $n++) {\r
+                               if (ord($seq[$n]) < $l[$n][0] || ord($seq[$n]) > $l[$n][1]) { /* byte $n is outside the range allowed at its position */\r
+                                       return false;\r
+                               }\r
+                       }\r
+                       return true;\r
+               }\r
+       }\r
+       return false; /* lead byte matched no row (0x80-0xC1 and 0xF5-0xFF are not listed) */\r
+}\r
+\r
+function concordance($s) { /* Returns true when htmlspecialchars() and is_valid() give the same verdict for $s. */\r
+       $vhe = strlen(htmlspecialchars($s, ENT_QUOTES, "UTF-8")) > 0; /* non-empty output counts as "accepted"; presumably rejected input yields an empty string */\r
+       $v = is_valid($s);\r
+       return ($vhe === $v);\r
+}\r
+\r
+for ($b1 = 0xC0; $b1 < 0xE0; $b1++) { /* 2-byte sequences: lead bytes 0xC0..0xDF, which includes 0xC0/0xC1 that is_valid() rejects */\r
+       for ($b2 = 0x80; $b2 <= 0xBF; $b2++) { /* inclusive upper bound so trail byte 0xBF is exercised too */\r
+               $s = chr($b1).chr($b2);\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+       }\r
+}\r
+\r
+\r
+for ($b1 = 0xE0; $b1 <= 0xEF; $b1++) { /* 3-byte sequences: lead bytes 0xE0..0xEF inclusive, so 0xEF is no longer skipped */\r
+       for ($b2 = 0x80; $b2 <= 0xBF; $b2++) { /* inclusive upper bound so second byte 0xBF is exercised too */\r
+               $s = chr($b1).chr($b2)."\x80"; /* third byte at the low end of 0x80..0xBF */\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+               $s = chr($b1).chr($b2)."\xBF"; /* third byte at the high end of 0x80..0xBF */\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+       }\r
+}\r
+\r
+for ($b1 = 0xF0; $b1 <= 0xFF; $b1++) { /* 4-byte shapes: lead bytes 0xF0..0xFF inclusive; is_valid() accepts only 0xF0..0xF4 */\r
+       for ($b2 = 0x80; $b2 <= 0xBF; $b2++) { /* inclusive upper bound so second byte 0xBF is exercised too */\r
+               $s = chr($b1).chr($b2)."\x80\x80"; /* the four variants below pin bytes 3 and 4 to each end of 0x80..0xBF */\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+               $s = chr($b1).chr($b2)."\xBF\x80";\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+               $s = chr($b1).chr($b2)."\x80\xBF";\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+               $s = chr($b1).chr($b2)."\xBF\xBF";\r
+               if (!concordance($s))\r
+                       echo "Discordance for ".bin2hex($s),"\n";\r
+       }\r
+}\r
+echo "Done.\n"; /* the only output expected when every sequence is in concordance */\r
+--EXPECT--\r
+Done.\r
index 1daafc61d82b1477c146a78b57fd53c779eb375a..6a66e4df451a2c624ee7e76d3048812fa495a8be 100755 (executable)
@@ -50,8 +50,8 @@ foreach($strings as $string) {
 %unicode|string%(16) "266561637574653b"
 %unicode|string%(0) ""
 %unicode|string%(0) ""
-%unicode|string%(8) "f7bfbfbf"
-%unicode|string%(8) "f7bfbfbf"
+%unicode|string%(0) ""
+%unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""
 %unicode|string%(0) ""