#1477: ur'\U0010FFFF' raised in narrow unicode builds.

author Amaury Forgeot d'Arc <amauryfa@gmail.com>

Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)

committer Amaury Forgeot d'Arc <amauryfa@gmail.com>

Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)
author Amaury Forgeot d'Arc <amauryfa@gmail.com>
Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)
committer Amaury Forgeot d'Arc <amauryfa@gmail.com>
Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index bdc7192b62c5dba616c724a53030de5d630c9a9e..24e8e772491666e2a83fcce5629859930dea86e3 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -736,12 +736,25 @@ class UnicodeTest(
          print >>out, u'def\n'
  
      def test_ucs4(self):
-        if sys.maxunicode == 0xFFFF:
-            return
          x = u'\U00100000'
          y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
          self.assertEqual(x, y)
  
+        y = r'\U00100000'
+        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+        self.assertEqual(x, y)
+        y = r'\U00010000'
+        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+        self.assertEqual(x, y)
+
+        try:
+            '\U11111111'.decode("raw-unicode-escape")
+        except UnicodeDecodeError as e:
+            self.assertEqual(e.start, 0)
+            self.assertEqual(e.end, 10)
+        else:
+            self.fail("Should have raised UnicodeDecodeError")
+
      def test_conversion(self):
          # Make sure __unicode__() works properly
          class Foo0:
diff --git a/Misc/NEWS b/Misc/NEWS

index 62803ffd8e9637ad046054e1d494033373b98abc..f7b16b4d34b270260f1fe413afb3d1015d08f1b7 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2?
  Core and builtins
  -----------------
   
+- Issue #1477: With narrow Unicode builds, the unicode escape sequence
+  \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This
+  affected raw unicode literals and the 'raw-unicode-escape' codec.  Now
+  UTF-16 surrogates are generated in this case, like normal unicode literals
+  and the 'unicode-escape' codec.
+
  - Issue #2348: add Py3k warning for file.softspace.
  
  - Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 5878f96166c7f2197d861d72c32f305f744b9dfc..4df9fd8f06ae1f7c5e9b84cc7e6b2f6a4f8f1577 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
             else
                 x += 10 + c - 'A';
         }
-#ifndef Py_UNICODE_WIDE
-        if (x > 0x10000) {
+        if (x <= 0xffff)
+                /* UCS-2 character */
+                *p++ = (Py_UNICODE) x;
+        else if (x <= 0x10ffff) {
+                /* UCS-4 character. Either store directly, or as
+                   surrogate pair. */
+#ifdef Py_UNICODE_WIDE
+                *p++ = (Py_UNICODE) x;
+#else
+                x -= 0x10000L;
+                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
+                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
+#endif
+        } else {
+            endinpos = s-starts;
+            outpos = p-PyUnicode_AS_UNICODE(v);
              if (unicode_decode_call_errorhandler(
                      errors, &errorHandler,
                      "rawunicodeescape", "\\Uxxxxxxxx out of range",
@@ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
                     (PyObject **)&v, &outpos, &p))
                     goto onError;
          }
-#endif
-       *p++ = x;
         nextByte:
         ;
      }
@@ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
              *p++ = hexdigit[ch & 15];
          }
          else
+#else
+       /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+       if (ch >= 0xD800 && ch < 0xDC00) {
+           Py_UNICODE ch2;
+           Py_UCS4 ucs;
+
+           ch2 = *s++;
+           size--;
+           if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+               ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+               *p++ = '\\';
+               *p++ = 'U';
+               *p++ = hexdigit[(ucs >> 28) & 0xf];
+               *p++ = hexdigit[(ucs >> 24) & 0xf];
+               *p++ = hexdigit[(ucs >> 20) & 0xf];
+               *p++ = hexdigit[(ucs >> 16) & 0xf];
+               *p++ = hexdigit[(ucs >> 12) & 0xf];
+               *p++ = hexdigit[(ucs >> 8) & 0xf];
+               *p++ = hexdigit[(ucs >> 4) & 0xf];
+               *p++ = hexdigit[ucs & 0xf];
+               continue;
+           }
+           /* Fall through: isolated surrogates are copied as-is */
+           s--;
+           size++;
+       }
  #endif
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
author	Amaury Forgeot d'Arc <amauryfa@gmail.com>
	Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)
committer	Amaury Forgeot d'Arc <amauryfa@gmail.com>
	Sun, 23 Mar 2008 09:55:29 +0000 (09:55 +0000)
Lib/test/test_unicode.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history