granicus.if.org Git - python/commitdiff
Merged revisions 67934-67935 via svnmerge from
author Alexandre Vassalotti <alexandre@peadrop.com>
Sat, 27 Dec 2008 07:16:40 +0000 (07:16 +0000)
committer Alexandre Vassalotti <alexandre@peadrop.com>
Sat, 27 Dec 2008 07:16:40 +0000 (07:16 +0000)
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r67934 | alexandre.vassalotti | 2008-12-27 02:08:47 -0500 (Sat, 27 Dec 2008) | 4 lines

  Fix issue #4730: cPickle corrupts high-unicode strings.
  Update outdated copy of PyUnicode_EncodeRawUnicodeEscape.
  Add a test case.
........
  r67935 | alexandre.vassalotti | 2008-12-27 02:13:01 -0500 (Sat, 27 Dec 2008) | 2 lines

  Add Misc/NEWS entry for r67934.
........

Lib/test/pickletester.py
Misc/NEWS
Modules/cPickle.c

index bf9bca78f98734d3a94ef31a02e0adffb55a825c..bf25245bd1a0b41c85c4a258b2262148faabcc32 100644 (file)
@@ -480,14 +480,21 @@ class AbstractPickleTests(unittest.TestCase):
 
     if have_unicode:
         def test_unicode(self):
-            endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
-                        unicode('<\n>'),  unicode('<\\>')]
+            endcases = [u'', u'<\\u>', u'<\\\\u1234>', u'<\n>',
+                        u'<\\>', u'<\\\\U00012345>']
             for proto in protocols:
                 for u in endcases:
                     p = self.dumps(u, proto)
                     u2 = self.loads(p)
                     self.assertEqual(u2, u)
 
+        def test_unicode_high_plane(self):
+            t = u'\U00012345'
+            for proto in protocols:
+                p = self.dumps(t, proto)
+                t2 = self.loads(p)
+                self.assertEqual(t2, t)
+
     def test_ints(self):
         import sys
         for proto in protocols:
index a305843c44879d0783cbf57bcc8e0b406763152a..af1c22e2a0c6d0f64a244136ed58e3c15d163bee 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -173,6 +173,9 @@ Library
 - Issue #4014: Don't claim that Python has an Alpha release status, in addition
   to claiming it is Mature.
 
+- Issue #4730: Fixed the cPickle module to handle correctly astral characters
+  when protocol 0 is used.
+
 Build
 -----
 
index f777286a6a7bca234cc869a13ec14601b0ba6f70..18baee1c395f3e129519dabbe6b8000d87cf47b3 100644 (file)
@@ -1255,41 +1255,90 @@ save_string(Picklerobject *self, PyObject *args, int doput)
 /* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
    backslash and newline characters to \uXXXX escapes. */
 static PyObject *
-modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
+modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
 {
-       PyObject *repr;
-       char *p;
-       char *q;
+    PyObject *repr;
+    char *p;
+    char *q;
 
-       static const char *hexdigit = "0123456789ABCDEF";
+    static const char *hexdigit = "0123456789abcdef";
+#ifdef Py_UNICODE_WIDE
+    const Py_ssize_t expandsize = 10;
+#else
+    const Py_ssize_t expandsize = 6;
+#endif
 
-       repr = PyString_FromStringAndSize(NULL, 6 * size);
-       if (repr == NULL)
-               return NULL;
-       if (size == 0)
-               return repr;
-
-       p = q = PyString_AS_STRING(repr);
-       while (size-- > 0) {
-               Py_UNICODE ch = *s++;
-               /* Map 16-bit characters to '\uxxxx' */
-               if (ch >= 256 || ch == '\\' || ch == '\n') {
-                       *p++ = '\\';
-                       *p++ = 'u';
-                       *p++ = hexdigit[(ch >> 12) & 0xf];
-                       *p++ = hexdigit[(ch >> 8) & 0xf];
-                       *p++ = hexdigit[(ch >> 4) & 0xf];
-                       *p++ = hexdigit[ch & 15];
-               }
-               /* Copy everything else as-is */
-               else
-                       *p++ = (char) ch;
-       }
-       *p = '\0';
-       _PyString_Resize(&repr, p - q);
+    if (size > PY_SSIZE_T_MAX / expandsize)
+        return PyErr_NoMemory();
+
+    repr = PyString_FromStringAndSize(NULL, expandsize * size);
+    if (repr == NULL)
+        return NULL;
+    if (size == 0)
        return repr;
-}
 
+    p = q = PyString_AS_STRING(repr);
+    while (size-- > 0) {
+        Py_UNICODE ch = *s++;
+#ifdef Py_UNICODE_WIDE
+       /* Map 32-bit characters to '\Uxxxxxxxx' */
+       if (ch >= 0x10000) {
+            *p++ = '\\';
+            *p++ = 'U';
+            *p++ = hexdigit[(ch >> 28) & 0xf];
+            *p++ = hexdigit[(ch >> 24) & 0xf];
+            *p++ = hexdigit[(ch >> 20) & 0xf];
+            *p++ = hexdigit[(ch >> 16) & 0xf];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        }
+        else
+#else
+       /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+       if (ch >= 0xD800 && ch < 0xDC00) {
+           Py_UNICODE ch2;
+           Py_UCS4 ucs;
+
+           ch2 = *s++;
+           size--;
+           if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+               ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+               *p++ = '\\';
+               *p++ = 'U';
+               *p++ = hexdigit[(ucs >> 28) & 0xf];
+               *p++ = hexdigit[(ucs >> 24) & 0xf];
+               *p++ = hexdigit[(ucs >> 20) & 0xf];
+               *p++ = hexdigit[(ucs >> 16) & 0xf];
+               *p++ = hexdigit[(ucs >> 12) & 0xf];
+               *p++ = hexdigit[(ucs >> 8) & 0xf];
+               *p++ = hexdigit[(ucs >> 4) & 0xf];
+               *p++ = hexdigit[ucs & 0xf];
+               continue;
+           }
+           /* Fall through: isolated surrogates are copied as-is */
+           s--;
+           size++;
+       }
+#endif
+       /* Map 16-bit characters to '\uxxxx' */
+       if (ch >= 256 || ch == '\\' || ch == '\n') {
+            *p++ = '\\';
+            *p++ = 'u';
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        }
+       /* Copy everything else as-is */
+       else
+            *p++ = (char) ch;
+    }
+    *p = '\0';
+    _PyString_Resize(&repr, p - q);
+    return repr;
+}
 
 static int
 save_unicode(Picklerobject *self, PyObject *args, int doput)