From: Antoine Pitrou Date: Tue, 15 Nov 2011 00:49:40 +0000 (+0100) Subject: Issue #13333: The UTF-7 decoder now accepts lone surrogates X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=30402549de68a5303a5e2995dca7375d3d17966f;p=python Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder already accepts them). --- diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 399eed7e31..5c1858c9b4 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -771,10 +771,18 @@ class UnicodeTest( for (x, y) in utfTests: self.assertEqual(x.encode('utf-7'), y) - # Unpaired surrogates not supported - self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7') - - self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd\ufffd') + # Unpaired surrogates are passed through + self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-') + self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x') + self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-') + self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x') + self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801') + self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x') + self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01') + self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x') + + self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-') + self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde') # Direct encoded characters set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" diff --git a/Misc/NEWS b/Misc/NEWS index c12d53c604..8d37ca52ab 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -9,6 +9,9 @@ What's New in Python 2.7.3? Core and Builtins ----------------- +- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder + already accepts them). + - Remove Py3k warning for callable. - Issue #10519: Avoid unnecessary recursive function calls in diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4d6864d7fd..5ce879d440 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1628,21 +1628,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, *p++ = outCh; #endif surrogate = 0; + continue; } else { + *p++ = surrogate; surrogate = 0; - errmsg = "second surrogate missing"; - goto utf7Error; } } - else if (outCh >= 0xD800 && outCh <= 0xDBFF) { + if (outCh >= 0xD800 && outCh <= 0xDBFF) { /* first surrogate */ surrogate = outCh; } - else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { - errmsg = "unexpected second surrogate"; - goto utf7Error; - } else { *p++ = outCh; } @@ -1652,8 +1648,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, inShift = 0; s++; if (surrogate) { - errmsg = "second surrogate missing at end of shift sequence"; - goto utf7Error; + *p++ = surrogate; + surrogate = 0; } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) {