From ab2eb0ee84ceb4b8f28653559248adb43a9502de Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sat, 5 Jun 2010 19:21:32 +0000 Subject: [PATCH] Add a NEWS entry for r81758 and clarify a comment. --- Lib/test/test_unicode.py | 6 +++--- Misc/NEWS | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 3171379089..4550be850a 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -670,9 +670,9 @@ class UnicodeTest( ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8') def test_issue8271(self): - # Issue #8271: when a byte sequence is invalid, only the start byte - # and all the valid continuation bytes should be replaced by U+FFFD, - # not the number of bytes specified by the start byte. + # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, + # only the start byte and the continuation byte(s) are now considered + # invalid, instead of the number of bytes specified by the start byte. # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, # table 3-8, Row 2) for more information about the algorithm used. FFFD = u'\ufffd' diff --git a/Misc/NEWS b/Misc/NEWS index 1d5cc023c1..f22512b470 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,14 @@ What's New in Python 2.7 Release Candidate 1? Core and Builtins ----------------- +- Issue #8271: during the decoding of an invalid UTF-8 byte sequence, only the + start byte and the continuation byte(s) are now considered invalid, instead + of the number of bytes specified by the start byte. + E.g.: '\xf1\x80AB'.decode('utf-8', 'replace') now returns u'\ufffdAB' and + replaces with U+FFFD only the start byte ('\xf1') and the continuation byte + ('\x80') even if '\xf1' is the start byte of a 4-byte sequence. + Previous versions returned a single u'\ufffd'. + - Issue #8627: Remove bogus "Overriding __cmp__ blocks inheritance of __hash__ in 3.x" warning. 
Also fix "XXX undetected error" that arises from the "Overriding __eq__ blocks inheritance ..." warning -- 2.50.0