From c4b82c037e95fe2fe77352e8a4a54be7b209df2a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 8 Jan 2013 23:12:00 +0200 Subject: [PATCH] Issue #11461: Fix the incremental UTF-16 decoder. Original patch by Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters. --- Lib/test/test_codecs.py | 48 ++++++++++++++++++++++++++++++++++------- Misc/NEWS | 3 +++ Objects/unicodeobject.c | 5 ++++- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index f620b4cde3..5baf225259 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -281,7 +281,7 @@ class UTF32Test(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", # first byte of BOM read u"", # second byte of BOM read @@ -303,6 +303,10 @@ class UTF32Test(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -331,7 +335,7 @@ class UTF32LETest(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", u"", @@ -349,6 +353,10 @@ class UTF32LETest(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -371,7 +379,7 @@ class UTF32BETest(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", u"", @@ -389,6 +397,10 @@ class UTF32BETest(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -439,7 +451,7 @@ class UTF16Test(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", # first byte of BOM read u"", # second byte of BOM read => byteorder known @@ -451,6 +463,10 @@ class UTF16Test(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -481,7 +497,7 @@ class UTF16LETest(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", u"\x00", @@ -491,6 +507,10 @@ class UTF16LETest(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -514,7 +534,7 @@ class UTF16BETest(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", [ u"", u"\x00", @@ -524,6 +544,10 @@ class UTF16BETest(ReadTest): u"\x00\xff\u0100", u"\x00\xff\u0100", u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff", + u"\x00\xff\u0100\uffff\U00010000", ] ) @@ -547,7 +571,7 @@ class UTF8Test(ReadTest): def test_partial(self): self.check_partial( - u"\x00\xff\u07ff\u0800\uffff", + u"\x00\xff\u07ff\u0800\uffff\U00010000", [ u"\x00", u"\x00", @@ -560,6 +584,10 @@ class UTF8Test(ReadTest): u"\x00\xff\u07ff\u0800", u"\x00\xff\u07ff\u0800", u"\x00\xff\u07ff\u0800\uffff", + u"\x00\xff\u07ff\u0800\uffff", + u"\x00\xff\u07ff\u0800\uffff", + u"\x00\xff\u07ff\u0800\uffff", + u"\x00\xff\u07ff\u0800\uffff\U00010000", ] ) @@ -619,7 +647,7 @@ class UTF8SigTest(ReadTest): def test_partial(self): self.check_partial( - u"\ufeff\x00\xff\u07ff\u0800\uffff", + u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", [ u"", u"", @@ -638,6 +666,10 @@ class UTF8SigTest(ReadTest): u"\ufeff\x00\xff\u07ff\u0800", u"\ufeff\x00\xff\u07ff\u0800", u"\ufeff\x00\xff\u07ff\u0800\uffff", + u"\ufeff\x00\xff\u07ff\u0800\uffff", + u"\ufeff\x00\xff\u07ff\u0800\uffff", + u"\ufeff\x00\xff\u07ff\u0800\uffff", + u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", ] ) diff --git a/Misc/NEWS b/Misc/NEWS index dd26444bd9..8f5a1e34df 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -9,6 +9,9 @@ What's New in Python 2.7.4 Core and Builtins ----------------- +- Issue #11461: Fix the incremental UTF-16 decoder. Original patch by + Amaury Forgeot d'Arc. + - Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB. - Issue #15516: Fix a bug in PyString_FromFormat where it failed to properly diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7713b5497e..1c6e55d1bf 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2565,8 +2565,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s, /* UTF-16 code pair: */ if (e - q < 2) { + q -= 2; + if (consumed) + break; errmsg = "unexpected end of data"; - startinpos = (((const char *)q)-2)-starts; + startinpos = ((const char *)q)-starts; endinpos = ((const char *)e)-starts; goto utf16Error; } -- 2.50.1