Issue #24848: Fixed bugs in UTF-7 decoding of malformed data.

author Serhiy Storchaka <storchaka@gmail.com>

Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)

committer Serhiy Storchaka <storchaka@gmail.com>

Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)
committer Serhiy Storchaka <storchaka@gmail.com>
Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 8b78c240415599330088074bdc108e23042521c9..a1079a1f1a01311d23056cac3b70ab7cdc3c098e 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -898,6 +898,32 @@ class CP65001Test(ReadTest, unittest.TestCase):
  class UTF7Test(ReadTest, unittest.TestCase):
      encoding = "utf-7"
  
+    def test_ascii(self):
+        # Set D (directly encoded characters)
+        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                 'abcdefghijklmnopqrstuvwxyz'
+                 '0123456789'
+                 '\'(),-./:?')
+        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
+        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
+        # Set O (optional direct characters)
+        set_o = ' !"#$%&*;<=>@[]^_`{|}'
+        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
+        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
+        # +
+        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
+        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
+        # White spaces
+        ws = ' \t\n\r'
+        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
+        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
+        # Other ASCII characters
+        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
+                                     set(set_d + set_o + '+' + ws)))
+        self.assertEqual(other_ascii.encode(self.encoding),
+                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
+                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
+
      def test_partial(self):
          self.check_partial(
              'a+-b\x00c\x80d\u0100e\U00010000f',
@@ -939,7 +965,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
  
      def test_errors(self):
          tests = [
+            (b'\xffb', '\ufffdb'),
              (b'a\xffb', 'a\ufffdb'),
+            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
              (b'a+IK', 'a\ufffd'),
              (b'a+IK-b', 'a\ufffdb'),
              (b'a+IK,b', 'a\ufffdb'),
@@ -955,6 +983,8 @@ class UTF7Test(ReadTest, unittest.TestCase):
              (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
              (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
              (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
+            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
+            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
          ]
          for raw, expected in tests:
              with self.subTest(raw=raw):
@@ -966,8 +996,36 @@ class UTF7Test(ReadTest, unittest.TestCase):
          self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
          self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
          self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
+        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
+        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
+        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
+        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
+        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
+                         b'+IKwgrNgB3KA-')
+        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
+                         '\u20ac\u20ac\U000104A0')
+        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
+                         '\u20ac\u20ac\U000104A0')
  
-    test_lone_surrogates = None
+    def test_lone_surrogates(self):
+        tests = [
+            (b'a+2AE-b', 'a\ud801b'),
+            (b'a+2AE\xffb', 'a\ufffdb'),
+            (b'a+2AE', 'a\ufffd'),
+            (b'a+2AEA-b', 'a\ufffdb'),
+            (b'a+2AH-b', 'a\ufffdb'),
+            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
+            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
+            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
+            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
+            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
+            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
+            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
+            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
+        ]
+        for raw, expected in tests:
+            with self.subTest(raw=raw):
+                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
  
  
  class UTF16ExTest(unittest.TestCase):
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index 5efbe3e42ca8e01ab54ac99f66025b3905672509..2cc1d7c3ff92a488dfff4f7f7e349dae29c5689d 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1524,7 +1524,7 @@ class UnicodeTest(string_tests.CommonTest,
          self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
  
          # Issue #2242: crash on some Windows/MSVC versions
-        self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
+        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
  
          # Direct encoded characters
          set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
@@ -1966,6 +1966,7 @@ class UnicodeTest(string_tests.CommonTest,
          self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
          self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
          self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
+        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
  
          # Error handling (unknown character names)
          self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
diff --git a/Misc/NEWS b/Misc/NEWS

index 14fa1c2838bae38dd4e565e12d472b13116cb36c..99185d181f8a698c5e131b7179d6bb1fd4c8b0d4 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ Release date: tba
  Core and Builtins
  -----------------
  
+- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
+
  - Issue #25280: Import trace messages emitted in verbose (-v) mode are no
    longer formatted twice.
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index e28bae4d2b6e6d7b058d3b43319cc49338de74e6..e9281ad4427963067cd740aeb3bcbac9e9dbab57 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
              }
              else { /* now leaving a base-64 section */
                  inShift = 0;
-                s++;
-                if (surrogate) {
-                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
-                        goto onError;
-                    surrogate = 0;
-                }
                  if (base64bits > 0) { /* left-over bits */
                      if (base64bits >= 6) {
                          /* We've seen at least one base-64 character */
+                        s++;
                          errmsg = "partial character in shift sequence";
                          goto utf7Error;
                      }
                      else {
                          /* Some bits remain; they should be zero */
                          if (base64buffer != 0) {
+                            s++;
                              errmsg = "non-zero padding bits in shift sequence";
                              goto utf7Error;
                          }
                      }
                  }
-                if (ch != '-') {
+                if (surrogate && DECODE_DIRECT(ch)) {
+                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
+                        goto onError;
+                }
+                surrogate = 0;
+                if (ch == '-') {
                      /* '-' is absorbed; other terminating
                         characters are preserved */
-                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
-                        goto onError;
+                    s++;
                  }
              }
          }
@@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
              }
              else { /* begin base64-encoded section */
                  inShift = 1;
+                surrogate = 0;
                  shiftOutStart = writer.pos;
                  base64bits = 0;
                  base64buffer = 0;
@@ -4450,6 +4451,7 @@ utf7Error:
  
      if (inShift && !consumed) { /* in shift sequence, no more to follow */
          /* if we're in an inconsistent state, that's an error */
+        inShift = 0;
          if (surrogate ||
                  (base64bits >= 6) ||
                  (base64bits > 0 && base64buffer != 0)) {
@@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
  
          if (maxchar > writer->maxchar || writer->readonly) {
              /* resize + widen */
+            maxchar = Py_MAX(maxchar, writer->maxchar);
              newbuffer = PyUnicode_New(newlen, maxchar);
              if (newbuffer == NULL)
                  return -1;
author	Serhiy Storchaka <storchaka@gmail.com>
	Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Fri, 2 Oct 2015 10:07:28 +0000 (13:07 +0300)
Lib/test/test_codecs.py		patch \| blob \| history
Lib/test/test_unicode.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history