granicus.if.org Git - python/commitdiff
Issue #13333: The UTF-7 decoder now accepts lone surrogates
author: Antoine Pitrou <solipsis@pitrou.net>
Tue, 15 Nov 2011 00:42:21 +0000 (01:42 +0100)
committer: Antoine Pitrou <solipsis@pitrou.net>
Tue, 15 Nov 2011 00:42:21 +0000 (01:42 +0100)
(the encoder already accepts them).

Lib/test/test_unicode.py
Misc/NEWS
Objects/unicodeobject.c

index 86185e9db6e4c5c6a24eb6ee2ecbd27500762956..591a297756be114163527d9fbda29c24d570294d 100644 (file)
@@ -1091,10 +1091,18 @@ class UnicodeTest(string_tests.CommonTest,
         for (x, y) in utfTests:
             self.assertEqual(x.encode('utf-7'), y)
 
-        # Unpaired surrogates not supported
-        self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
-
-        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
+        # Unpaired surrogates are passed through
+        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
+        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
+        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
+        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
+        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
+        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
+        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
+        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
+
+        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
+        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
 
         # Issue #2242: crash on some Windows/MSVC versions
         self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
index ca8d4cb7bb5c2c1ef3197ef8c66c0827c3a61022..4fb9ff6305a135d9074696e768545442d14ae73e 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.2.3?
 Core and Builtins
 -----------------
 
+- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder
+  already accepts them).
+
 - Issue #13342: input() used to ignore sys.stdin's and sys.stdout's unicode
   error handler in interactive mode (when calling into PyOS_Readline()).
 
index 7316abfc9c350c60ae9e34a4a288315592720708..8680726275e7cb2334ff12ddef63f1f499ac5e4d 100644 (file)
@@ -2282,21 +2282,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                             *p++ = outCh;
 #endif
                             surrogate = 0;
+                            continue;
                         }
                         else {
+                            *p++ = surrogate;
                             surrogate = 0;
-                            errmsg = "second surrogate missing";
-                            goto utf7Error;
                         }
                     }
-                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                         /* first surrogate */
                         surrogate = outCh;
                     }
-                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
-                        errmsg = "unexpected second surrogate";
-                        goto utf7Error;
-                    }
                     else {
                         *p++ = outCh;
                     }
@@ -2306,8 +2302,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                 inShift = 0;
                 s++;
                 if (surrogate) {
-                    errmsg = "second surrogate missing at end of shift sequence";
-                    goto utf7Error;
+                    *p++ = surrogate;
+                    surrogate = 0;
                 }
                 if (base64bits > 0) { /* left-over bits */
                     if (base64bits >= 6) {