From: Alexandre Vassalotti Date: Sat, 27 Dec 2008 07:32:41 +0000 (+0000) Subject: Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle. X-Git-Tag: v3.1a1~666 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=554d878b1c653c98b6bd043cea67a4191ba114d0;p=python Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle. Add astral character test case. --- diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index c7c89d1509..f65093ae41 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -484,13 +484,21 @@ class AbstractPickleTests(unittest.TestCase): self.assertRaises(ValueError, self.loads, buf) def test_unicode(self): - endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', '<\\>'] + endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', '<\\>', + '<\\\U00012345>'] for proto in protocols: for u in endcases: p = self.dumps(u, proto) u2 = self.loads(p) self.assertEqual(u2, u) + def test_unicode_high_plane(self): + t = '\U00012345' + for proto in protocols: + p = self.dumps(t, proto) + t2 = self.loads(p) + self.assertEqual(t2, t) + def test_bytes(self): for proto in protocols: for u in b'', b'xyz', b'xyz'*100: diff --git a/Modules/_pickle.c b/Modules/_pickle.c index a0810b99a9..6cc90b3f1e 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) static const char *hexdigits = "0123456789abcdef"; #ifdef Py_UNICODE_WIDE - repr = PyBytes_FromStringAndSize(NULL, 10 * size); + const Py_ssize_t expandsize = 10; #else - repr = PyBytes_FromStringAndSize(NULL, 6 * size); + const Py_ssize_t expandsize = 6; #endif + + if (size > PY_SSIZE_T_MAX / expandsize) + return PyErr_NoMemory(); + + repr = PyByteArray_FromStringAndSize(NULL, expandsize * size); if (repr == NULL) return NULL; if (size == 0) goto done; - p = q = PyBytes_AS_STRING(repr); + p = q = PyByteArray_AS_STRING(repr); while (size-- > 0) { Py_UNICODE ch = *s++; #ifdef Py_UNICODE_WIDE @@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) *p++ = hexdigits[ch & 15]; } else +#else + /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ + if (ch >= 0xD800 && ch < 0xDC00) { + Py_UNICODE ch2; + Py_UCS4 ucs; + + ch2 = *s++; + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigits[(ucs >> 28) & 0xf]; + *p++ = hexdigits[(ucs >> 24) & 0xf]; + *p++ = hexdigits[(ucs >> 20) & 0xf]; + *p++ = hexdigits[(ucs >> 16) & 0xf]; + *p++ = hexdigits[(ucs >> 12) & 0xf]; + *p++ = hexdigits[(ucs >> 8) & 0xf]; + *p++ = hexdigits[(ucs >> 4) & 0xf]; + *p++ = hexdigits[ucs & 0xf]; + continue; + } + /* Fall through: isolated surrogates are copied as-is */ + s--; + size++; + } #endif /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256 || ch == '\\' || ch == '\n') { @@ -1146,14 +1177,14 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) *p++ = hexdigits[(ch >> 4) & 0xf]; *p++ = hexdigits[ch & 15]; } - /* Copy everything else as-is */ + /* Copy everything else as-is */ else *p++ = (char) ch; } size = p - q; done: - result = PyBytes_FromStringAndSize(PyBytes_AS_STRING(repr), size); + result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); Py_DECREF(repr); return result; }