granicus.if.org Git - php/commitdiff
MFH Fix bug #46944 - UTF-8 characters outside the BMP aren't encoded correctly.
author Scott MacVicar <scottmac@php.net>
Thu, 12 Feb 2009 00:15:46 +0000 (00:15 +0000)
committer Scott MacVicar <scottmac@php.net>
Thu, 12 Feb 2009 00:15:46 +0000 (00:15 +0000)
Forgot to merge this back in January

ext/json/tests/bug46944.phpt [new file with mode: 0644]
ext/json/utf8_decode.c
ext/json/utf8_to_utf16.c

diff --git a/ext/json/tests/bug46944.phpt b/ext/json/tests/bug46944.phpt
new file mode 100644 (file)
index 0000000..812a548
--- /dev/null
@@ -0,0 +1,35 @@
+--TEST--
+Bug #46944 (json_encode() doesn't handle 3 byte utf8 correctly)
+--SKIPIF--
+<?php if (!extension_loaded('json')) print 'skip'; ?>
+--FILE--
+<?php
+
+for ($i = 1; $i <= 16; $i++) {
+       $first = 0xf0|($i >> 2);
+       $second = 0x8f|($i & 3) << 4;
+       $string = sprintf("aa%c%c\xbf\xbdzz", $first, $second);
+       echo json_encode($string) . "\n";
+}
+
+
+echo "Done\n";
+?>
+--EXPECT--
+"aa\ud83f\udffdzz"
+"aa\ud87f\udffdzz"
+"aa\ud8bf\udffdzz"
+"aa\ud8ff\udffdzz"
+"aa\ud93f\udffdzz"
+"aa\ud97f\udffdzz"
+"aa\ud9bf\udffdzz"
+"aa\ud9ff\udffdzz"
+"aa\uda3f\udffdzz"
+"aa\uda7f\udffdzz"
+"aa\udabf\udffdzz"
+"aa\udaff\udffdzz"
+"aa\udb3f\udffdzz"
+"aa\udb7f\udffdzz"
+"aa\udbbf\udffdzz"
+"aa\udbff\udffdzz"
+Done
diff --git a/ext/json/utf8_decode.c b/ext/json/utf8_decode.c
index cea1f8cec8dcbc90fab8d46c402872f29e833f87..2d0422bedb6131189974212722234bf97837df95 100644 (file)
@@ -165,7 +165,7 @@ utf8_decode_next(json_utf8_decode *utf8)
 /*
     Three continuation (65536 to 1114111)
 */
-    if ((c & 0xF1) == 0xF0) {
+    if ((c & 0xF8) == 0xF0) {
         int c1 = cont(utf8);
         int c2 = cont(utf8);
         int c3 = cont(utf8);
diff --git a/ext/json/utf8_to_utf16.c b/ext/json/utf8_to_utf16.c
index 42ea9e5d8eb86fd59c278c25ae550708ec65f61d..599f0e13b48b26e57fcd755ab270375a5d413da1 100644 (file)
@@ -46,7 +46,7 @@ utf8_to_utf16(unsigned short w[], char p[], int length)
             w[the_index] = (unsigned short)c;
             the_index += 1;
         } else {
-            c &= 0xFFFF;
+            c -= 0x10000;
             w[the_index] = (unsigned short)(0xD800 | (c >> 10));
             the_index += 1;
             w[the_index] = (unsigned short)(0xDC00 | (c & 0x3FF));