Merged revisions 84655 via svnmerge (Issue #9804: join surrogate pairs in ascii() and the "backslashreplace" error handler)

author Antoine Pitrou <solipsis@pitrou.net>

Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)

committer Antoine Pitrou <solipsis@pitrou.net>

Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)
author Antoine Pitrou <solipsis@pitrou.net>
Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)
committer Antoine Pitrou <solipsis@pitrou.net>
Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py

index 2a08337c7942e0a8eefbaa6aa4fad2980b1b6444..645ef9d5cf67979b943bc4849a8f189ecdee8c7a 100644 (file)
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -174,6 +174,28 @@ class BuiltinTest(unittest.TestCase):
          a = {}
          a[0] = a
          self.assertEqual(ascii(a), '{0: {...}}')
+        # Advanced checks for unicode strings
+        def _check_uni(s):
+            self.assertEqual(ascii(s), repr(s))
+        _check_uni("'")
+        _check_uni('"')
+        _check_uni('"\'')
+        _check_uni('\0')
+        _check_uni('\r\n\t .')
+        # Unprintable non-ASCII characters
+        _check_uni('\x85')
+        _check_uni('\u1fff')
+        _check_uni('\U00012fff')
+        # Lone surrogates
+        _check_uni('\ud800')
+        _check_uni('\udfff')
+        # Issue #9804: surrogates should be joined even for printable
+        # wide characters (UCS-2 builds).
+        self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
+        # All together
+        s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
+        self.assertEqual(ascii(s),
+            r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
  
      def test_neg(self):
          x = -sys.maxsize-1
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py

index 82782b5db3614788f76292c0f01d4bc7c30ef703..6105fc02fa62317c16824aa129167abc9a8e8332 100644 (file)
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase):
                  UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
              ("\\uffff", 1)
          )
-        if sys.maxunicode>0xffff:
-            self.assertEquals(
-                codecs.backslashreplace_errors(
-                    UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
-                ("\\U00010000", 1)
-            )
-            self.assertEquals(
-                codecs.backslashreplace_errors(
-                    UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
-                ("\\U0010ffff", 1)
-            )
+        # 1 on UCS-4 builds, 2 on UCS-2
+        len_wide = len("\U00010000")
+        self.assertEquals(
+            codecs.backslashreplace_errors(
+                UnicodeEncodeError("ascii", "\U00010000",
+                                   0, len_wide, "ouch")),
+            ("\\U00010000", len_wide)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(
+                UnicodeEncodeError("ascii", "\U0010ffff",
+                                   0, len_wide, "ouch")),
+            ("\\U0010ffff", len_wide)
+        )
+        # Lone surrogates (regardless of unicode width)
+        self.assertEquals(
+            codecs.backslashreplace_errors(
+                UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
+            ("\\ud800", 1)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(
+                UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
+            ("\\udfff", 1)
+        )
  
      def test_badhandlerresults(self):
          results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
diff --git a/Misc/NEWS b/Misc/NEWS

index cc18cf3e3b866d5ed423c46909a4cfc638b29623..d0a31c1eef6a5ade456bf60dba755d30fffc8424 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's New in Python 3.1.3?
  Core and Builtins
  -----------------
  
+- Issue #9804: ascii() now always represents unicode surrogate pairs as
+  a single ``\UXXXXXXXX``, regardless of whether the character is printable
+  or not.  Also, the "backslashreplace" error handler now joins surrogate
+  pairs into a single character on UCS-2 builds.
+
  - Issue #9797: pystate.c wrongly assumed that zero couldn't be a valid
    thread-local storage key.
  
diff --git a/Python/codecs.c b/Python/codecs.c

index 04487a216c2da03727dc513890dcc33c1dc4a27e..45d99291f11e7f1b09174aca23bc131e7450aa24 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
  
  PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
  {
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+    (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+     *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
      if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
          PyObject *restuple;
          PyObject *object;
@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
              else
  #endif
              if (*p >= 0x100) {
-                ressize += 1+1+4;
+                if (IS_SURROGATE_PAIR(p, startp+end)) {
+                    ressize += 1+1+8;
+                    ++p;
+                }
+                else
+                    ressize += 1+1+4;
              }
              else
                  ressize += 1+1+2;
@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
              return NULL;
          for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
              p < startp+end; ++p) {
-            Py_UNICODE c = *p;
+            Py_UCS4 c = (Py_UCS4) *p;
              *outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+            if (IS_SURROGATE_PAIR(p, startp+end)) {
+                c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+                ++p;
+            }
              if (c >= 0x00010000) {
                  *outp++ = 'U';
                  *outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
                  *outp++ = hexdigits[(c>>12)&0xf];
                  *outp++ = hexdigits[(c>>8)&0xf];
              }
-            else
-#endif
-            if (c >= 0x100) {
+            else if (c >= 0x100) {
                  *outp++ = 'u';
                  *outp++ = hexdigits[(c>>12)&0xf];
                  *outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
          wrong_exception_type(exc);
          return NULL;
      }
+#undef IS_SURROGATE_PAIR
  }
  
  /* This handler is declared static until someone demonstrates
author	Antoine Pitrou <solipsis@pitrou.net>
	Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)
committer	Antoine Pitrou <solipsis@pitrou.net>
	Thu, 9 Sep 2010 20:33:43 +0000 (20:33 +0000)
Lib/test/test_builtin.py		patch | blob | history
Lib/test/test_codeccallbacks.py		patch | blob | history
Misc/NEWS		patch | blob | history
Python/codecs.c		patch | blob | history