From a79e05097be27579d0ab0e38fda404b0edb670c0 Mon Sep 17 00:00:00 2001 From: Amaury Forgeot d'Arc Date: Mon, 24 Mar 2008 21:16:28 +0000 Subject: [PATCH] #1477: ur'\U0010FFFF' used to raise in narrow unicode builds. Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, like the unicode-escape codec does. Backport of r61793 and r61853 --- Lib/test/test_unicode.py | 17 +++++++++++++-- Misc/NEWS | 7 ++++++ Objects/unicodeobject.c | 46 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 4f75771458..55fb8e17f0 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -736,12 +736,25 @@ class UnicodeTest( print >>out, u'def\n' def test_ucs4(self): - if sys.maxunicode == 0xFFFF: - return x = u'\U00100000' y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") self.assertEqual(x, y) + y = r'\U00100000' + x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") + self.assertEqual(x, y) + y = r'\U00010000' + x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") + self.assertEqual(x, y) + + try: + '\U11111111'.decode("raw-unicode-escape") + except UnicodeDecodeError, e: + self.assertEqual(e.start, 0) + self.assertEqual(e.end, 10) + else: + self.fail("Should have raised UnicodeDecodeError") + def test_conversion(self): # Make sure __unicode__() works properly class Foo0: diff --git a/Misc/NEWS b/Misc/NEWS index a97fa52b26..3e95a44da9 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -11,6 +11,13 @@ What's New in Python 2.5.3? Core and builtins ----------------- + +- Issue #1477: With narrow Unicode builds, the unicode escape sequence + \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane. This + affected raw unicode literals and the 'raw-unicode-escape' codec. Now + UTF-16 surrogates are generated in this case, like normal unicode literals + and the 'unicode-escape' codec. + - Issue #2321: use pymalloc for unicode object string data to reduce memory usage in some circumstances. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c749ac51a7..e2f1ed323d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2273,8 +2273,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, else x += 10 + c - 'A'; } -#ifndef Py_UNICODE_WIDE - if (x > 0x10000) { + if (x <= 0xffff) + /* UCS-2 character */ + *p++ = (Py_UNICODE) x; + else if (x <= 0x10ffff) { + /* UCS-4 character. Either store directly, or as + surrogate pair. */ +#ifdef Py_UNICODE_WIDE + *p++ = (Py_UNICODE) x; +#else + x -= 0x10000L; + *p++ = 0xD800 + (Py_UNICODE) (x >> 10); + *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); +#endif + } else { + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); if (unicode_decode_call_errorhandler( errors, &errorHandler, "rawunicodeescape", "\\Uxxxxxxxx out of range", @@ -2282,8 +2296,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, (PyObject **)&v, &outpos, &p)) goto onError; } -#endif - *p++ = x; nextByte: ; } @@ -2337,6 +2349,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, *p++ = hexdigit[ch & 15]; } else +#else + /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ + if (ch >= 0xD800 && ch < 0xDC00) { + Py_UNICODE ch2; + Py_UCS4 ucs; + + ch2 = *s++; + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigit[(ucs >> 28) & 0xf]; + *p++ = hexdigit[(ucs >> 24) & 0xf]; + *p++ = hexdigit[(ucs >> 20) & 0xf]; + *p++ = hexdigit[(ucs >> 16) & 0xf]; + *p++ = hexdigit[(ucs >> 12) & 0xf]; + *p++ = hexdigit[(ucs >> 8) & 0xf]; + *p++ = hexdigit[(ucs >> 4) & 0xf]; + *p++ = hexdigit[ucs & 0xf]; + continue; + } + /* Fall through: isolated surrogates are copied as-is */ + s--; + size++; + } #endif /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256) { -- 2.40.0