granicus.if.org Git - python/commitdiff
Issue #11489: JSON decoder now accepts lone surrogates.
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 26 Nov 2013 19:25:28 +0000 (21:25 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Tue, 26 Nov 2013 19:25:28 +0000 (21:25 +0200)
Lib/json/decoder.py
Lib/test/test_json/test_scanstring.py
Misc/NEWS
Modules/_json.c

index 51c3aa7851fc8893875ad087fde4daec910a4b68..80d3420a95c8092572806580841ebf57ebda5e88 100644 (file)
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -66,6 +66,16 @@ BACKSLASH = {
     'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
 }
 
+def _decode_uXXXX(s, pos):
+    esc = s[pos + 1:pos + 5]
+    if len(esc) == 4 and esc[1] not in 'xX':
+        try:
+            return int(esc, 16)
+        except ValueError:
+            pass
+    msg = "Invalid \\uXXXX escape"
+    raise ValueError(errmsg(msg, s, pos))
+
 def py_scanstring(s, end, strict=True,
         _b=BACKSLASH, _m=STRINGCHUNK.match):
     """Scan the string s for a JSON string. End is the index of the
@@ -115,25 +125,14 @@ def py_scanstring(s, end, strict=True,
                 raise ValueError(errmsg(msg, s, end))
             end += 1
         else:
-            esc = s[end + 1:end + 5]
-            next_end = end + 5
-            if len(esc) != 4:
-                msg = "Invalid \\uXXXX escape"
-                raise ValueError(errmsg(msg, s, end))
-            uni = int(esc, 16)
-            if 0xd800 <= uni <= 0xdbff:
-                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
-                if not s[end + 5:end + 7] == '\\u':
-                    raise ValueError(errmsg(msg, s, end))
-                esc2 = s[end + 7:end + 11]
-                if len(esc2) != 4:
-                    raise ValueError(errmsg(msg, s, end))
-                uni2 = int(esc2, 16)
-                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
-                next_end += 6
+            uni = _decode_uXXXX(s, end)
+            end += 5
+            if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
+                uni2 = _decode_uXXXX(s, end + 1)
+                if 0xdc00 <= uni2 <= 0xdfff:
+                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
+                    end += 6
             char = chr(uni)
-
-            end = next_end
         _append(char)
     return ''.join(chunks), end
 
index 2e3a291358c14fc93e6ec89f7362e8d610b8e951..07f4358100da6a3a6f975c6ea2a1a2fd903ec6ab 100644 (file)
--- a/Lib/test/test_json/test_scanstring.py
+++ b/Lib/test/test_json/test_scanstring.py
@@ -5,10 +5,6 @@ from test.test_json import PyTest, CTest
 class TestScanstring:
     def test_scanstring(self):
         scanstring = self.json.decoder.scanstring
-        self.assertEqual(
-            scanstring('"z\\ud834\\udd20x"', 1, True),
-            ('z\U0001d120x', 16))
-
         self.assertEqual(
             scanstring('"z\U0001d120x"', 1, True),
             ('z\U0001d120x', 5))
@@ -89,6 +85,53 @@ class TestScanstring:
             scanstring('["Bad value", truth]', 2, True),
             ('Bad value', 12))
 
+    def test_surrogates(self):
+        scanstring = self.json.decoder.scanstring
+        def assertScan(given, expect):
+            self.assertEqual(scanstring(given, 1, True),
+                             (expect, len(given)))
+
+        assertScan('"z\\ud834\\u0079x"', 'z\ud834yx')
+        assertScan('"z\\ud834\\udd20x"', 'z\U0001d120x')
+        assertScan('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x')
+        assertScan('"z\\ud834x"', 'z\ud834x')
+        assertScan('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345')
+        assertScan('"z\\udd20x"', 'z\udd20x')
+        assertScan('"z\ud834\udd20x"', 'z\ud834\udd20x')
+        assertScan('"z\ud834\\udd20x"', 'z\ud834\udd20x')
+        assertScan('"z\ud834x"', 'z\ud834x')
+
+    def test_bad_escapes(self):
+        scanstring = self.json.decoder.scanstring
+        bad_escapes = [
+            '"\\"',
+            '"\\x"',
+            '"\\u"',
+            '"\\u0"',
+            '"\\u01"',
+            '"\\u012"',
+            '"\\uz012"',
+            '"\\u0z12"',
+            '"\\u01z2"',
+            '"\\u012z"',
+            '"\\u0x12"',
+            '"\\u0X12"',
+            '"\\ud834\\"',
+            '"\\ud834\\u"',
+            '"\\ud834\\ud"',
+            '"\\ud834\\udd"',
+            '"\\ud834\\udd2"',
+            '"\\ud834\\uzdd2"',
+            '"\\ud834\\udzd2"',
+            '"\\ud834\\uddz2"',
+            '"\\ud834\\udd2z"',
+            '"\\ud834\\u0x20"',
+            '"\\ud834\\u0X20"',
+        ]
+        for s in bad_escapes:
+            with self.assertRaises(ValueError, msg=s):
+                scanstring(s, 1, True)
+
     def test_overflow(self):
         with self.assertRaises(OverflowError):
             self.json.decoder.scanstring(b"xxx", sys.maxsize+1)
index 2e8d21c59b6385c2228a8e479b0f14e35316b152..84217e7b2188c5446f901012f82fc93bb6c31333 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -16,6 +16,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #11489: JSON decoder now accepts lone surrogates.
+
 - Issue #19545: Avoid chained exceptions while passing stray % to
   time.strptime().  Initial patch by Claudiu Popa.
 
index db45c28fe4033fd875b53675bd3531b17439faa9..916668028911cfbe0d30fca5e925b4a5506ace08 100644 (file)
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -433,17 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                 }
             }
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len &&
+                PyUnicode_READ(kind, buf, next++) == '\\' &&
+                PyUnicode_READ(kind, buf, next++) == 'u') {
                 Py_UCS4 c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                if (PyUnicode_READ(kind, buf, next++) != '\\' ||
-                    PyUnicode_READ(kind, buf, next++) != 'u') {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
                 end += 6;
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
@@ -464,15 +457,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                             goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
-                goto bail;
+                if (Py_UNICODE_IS_LOW_SURROGATE(c2))
+                    c = Py_UNICODE_JOIN_SURROGATES(c, c2);
+                else
+                    end -= 6;
             }
         }
         APPEND_OLD_CHUNK