From e12f63218603f3e15592df7fba5a484f9ff5c004 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 2 Oct 2015 13:14:53 +0300 Subject: [PATCH] Issue #24848: Fixed bugs in UTF-7 decoding of misformed data: 1. Non-ASCII bytes were accepted after shift sequence. 2. A low surrogate could be emitted in case of error in high surrogate. --- Lib/test/test_codecs.py | 59 ++++++++++++++++++++++++++++++++++++++++ Lib/test/test_unicode.py | 1 + Misc/NEWS | 2 ++ Objects/unicodeobject.c | 16 ++++++----- 4 files changed, 71 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index cf486373cc..9ae0ed0848 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -642,6 +642,32 @@ class UTF8Test(ReadTest): class UTF7Test(ReadTest): encoding = "utf-7" + def test_ascii(self): + # Set D (directly encoded characters) + set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' + '\'(),-./:?') + self.assertEqual(set_d.encode(self.encoding), set_d) + self.assertEqual(set_d.decode(self.encoding), set_d) + # Set O (optional direct characters) + set_o = ' !"#$%&*;<=>@[]^_`{|}' + self.assertEqual(set_o.encode(self.encoding), set_o) + self.assertEqual(set_o.decode(self.encoding), set_o) + # + + self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b') + self.assertEqual('a+-b'.decode(self.encoding), u'a+b') + # White spaces + ws = ' \t\n\r' + self.assertEqual(ws.encode(self.encoding), ws) + self.assertEqual(ws.decode(self.encoding), ws) + # Other ASCII characters + other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) - + set(set_d + set_o + '+' + ws))) + self.assertEqual(other_ascii.encode(self.encoding), + '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' + 'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') + def test_partial(self): self.check_partial( u"a+-b", @@ -656,7 +682,9 @@ class UTF7Test(ReadTest): def test_errors(self): tests = [ + ('\xffb', u'\ufffdb'), ('a\xffb', u'a\ufffdb'), + ('a\xff\xffb', u'a\ufffd\ufffdb'), ('a+IK', u'a\ufffd'), ('a+IK-b', u'a\ufffdb'), ('a+IK,b', u'a\ufffdb'), @@ -672,6 +700,8 @@ class UTF7Test(ReadTest): ('a+//,+IKw-b', u'a\ufffd\u20acb'), ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'), ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'), + ('a+IKw-b\xff', u'a\u20acb\ufffd'), + ('a+IKw\xffb', u'a\u20ac\ufffdb'), ] for raw, expected in tests: self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, @@ -682,6 +712,35 @@ class UTF7Test(ReadTest): self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-') self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-') self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0') + self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0') + self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-') + self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0') + self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0') + self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding), + '+IKwgrNgB3KA-') + self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding), + u'\u20ac\u20ac\U000104A0') + self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding), + u'\u20ac\u20ac\U000104A0') + + def test_lone_surrogates(self): + tests = [ + ('a+2AE-b', u'a\ud801b'), + ('a+2AE\xffb', u'a\ufffdb'), + ('a+2AE', u'a\ufffd'), + ('a+2AEA-b', u'a\ufffdb'), + ('a+2AH-b', u'a\ufffdb'), + ('a+IKzYAQ-b', u'a\u20ac\ud801b'), + ('a+IKzYAQ\xffb', u'a\u20ac\ufffdb'), + ('a+IKzYAQA-b', u'a\u20ac\ufffdb'), + ('a+IKzYAd-b', u'a\u20ac\ufffdb'), + ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'), + ('a+IKwgrNgB\xffb', u'a\u20ac\u20ac\ufffdb'), + ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'), + ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'), + ] + for raw, expected in tests: + self.assertEqual(raw.decode('utf-7', 'replace'), expected) class UTF16ExTest(unittest.TestCase): diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 625d08c897..be8f89be07 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1036,6 +1036,7 @@ class UnicodeTest( self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict') self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') + self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x') self.assertEqual(u'abcde'.decode('ascii', 'ignore'), u'abcde'.decode('ascii', errors='ignore')) self.assertEqual(u'abcde'.decode('ascii', 'replace'), diff --git a/Misc/NEWS b/Misc/NEWS index 74648703f8..0494fd17d8 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ What's New in Python 2.7.11? Core and Builtins ----------------- +- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. + - Issue #25003: os.urandom() doesn't use getentropy() on Solaris because getentropy() is blocking, whereas os.urandom() should not block. getentropy() is supported since Solaris 11.3. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 08723ac9b8..6c46263222 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1716,29 +1716,29 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, } else { /* now leaving a base-64 section */ inShift = 0; - s++; - if (surrogate) { - *p++ = surrogate; - surrogate = 0; - } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) { /* We've seen at least one base-64 character */ + s++; errmsg = "partial character in shift sequence"; goto utf7Error; } else { /* Some bits remain; they should be zero */ if (base64buffer != 0) { + s++; errmsg = "non-zero padding bits in shift sequence"; goto utf7Error; } } } - if (ch != '-') { + if (surrogate && DECODE_DIRECT(ch)) + *p++ = surrogate; + surrogate = 0; + if (ch == '-') { /* '-' is absorbed; other terminating characters are preserved */ - *p++ = ch; + s++; } } } @@ -1751,6 +1751,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, } else { /* begin base64-encoded section */ inShift = 1; + surrogate = 0; shiftOutStart = p; base64bits = 0; base64buffer = 0; @@ -1782,6 +1783,7 @@ utf7Error: if (inShift && !consumed) { /* in shift sequence, no more to follow */ /* if we're in an inconsistent state, that's an error */ + inShift = 0; if (surrogate || (base64bits >= 6) || (base64bits > 0 && base64buffer != 0)) { -- 2.50.1