in wide builds, avoid storing high unicode characters from source code with surrogates

author Benjamin Peterson <benjamin@python.org>

Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)

committer Benjamin Peterson <benjamin@python.org>

Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)
author Benjamin Peterson <benjamin@python.org>
Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)
committer Benjamin Peterson <benjamin@python.org>
Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)
diff --git a/Lib/test/test_pep263.py b/Lib/test/test_pep263.py

index 05ca47ff43fe189778ff587327a97b6c8f31681c..587b2fcc11adaaead47ac67dc645e20809f45562 100644 (file)
--- a/Lib/test/test_pep263.py
+++ b/Lib/test/test_pep263.py
@@ -36,6 +36,14 @@ class PEP263Test(unittest.TestCase):
          exec(c, d)
          self.assertEquals(d['\xc6'], '\xc6')
  
+    def test_issue3297(self):
+        c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
+        d = {}
+        exec(c, d)
+        self.assertEqual(d['a'], d['b'])
+        self.assertEqual(len(d['a']), len(d['b']))
+        self.assertEqual(ascii(d['a']), ascii(d['b']))
+
  def test_main():
      support.run_unittest(PEP263Test)
  
diff --git a/Misc/NEWS b/Misc/NEWS

index 96320f7afa422e4622884281ead062ce10c1588f..030430a9fd89b444ffd1e5dee7b0347cea116d39 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
  Core and Builtins
  -----------------
  
+- Issue #3297: On wide unicode builds, do not split unicode characters into
+  surrogates.
+
  - Remove length limitation when constructing a complex number from a string.
  
  - Issue #1087418: Boost performance of bitwise operations for longs.
diff --git a/Python/ast.c b/Python/ast.c

index c3edea35346fa567c095d37e8123ae06cf9f3aab..c6a6417efe85008b5e30545372bd251bb517a9d4 100644 (file)
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
          u = NULL;
      } else {
          /* check for integer overflow */
-        if (len > PY_SIZE_MAX / 4)
+        if (len > PY_SIZE_MAX / 6)
              return NULL;
-        /* "\XX" may become "\u005c\uHHLL" (12 bytes) */
-        u = PyBytes_FromStringAndSize((char *)NULL, len * 4);
+        /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+           "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
+        u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
          if (u == NULL)
              return NULL;
          p = buf = PyBytes_AsString(u);
@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
                  PyObject *w;
                  char *r;
                  Py_ssize_t rn, i;
-                w = decode_utf8(c, &s, end, "utf-16-be");
+                w = decode_utf8(c, &s, end, "utf-32-be");
                  if (w == NULL) {
                      Py_DECREF(u);
                      return NULL;
                  }
                  r = PyBytes_AS_STRING(w);
                  rn = Py_SIZE(w);
-                assert(rn % 2 == 0);
-                for (i = 0; i < rn; i += 2) {
-                    sprintf(p, "\\u%02x%02x",
+                assert(rn % 4 == 0);
+                for (i = 0; i < rn; i += 4) {
+                    sprintf(p, "\\U%02x%02x%02x%02x",
                              r[i + 0] & 0xFF,
-                            r[i + 1] & 0xFF);
-                    p += 6;
+                            r[i + 1] & 0xFF,
+                            r[i + 2] & 0xFF,
+                            r[i + 3] & 0xFF);
+                    p += 10;
                  }
+                /* Should be impossible to overflow */
+                assert(p - buf <= Py_SIZE(u));
                  Py_DECREF(w);
              } else {
                  *p++ = *s++;
author	Benjamin Peterson <benjamin@python.org>
	Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)
committer	Benjamin Peterson <benjamin@python.org>
	Wed, 28 Oct 2009 21:59:39 +0000 (21:59 +0000)
Lib/test/test_pep263.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Python/ast.c		patch \| blob \| history