From: Ezio Melotti Date: Sat, 1 Feb 2014 19:20:22 +0000 (+0200) Subject: #20288: fix handling of invalid numeric charrefs in HTMLParser. X-Git-Tag: v2.7.8~65 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5a88853bdc1074e62441c7558502bd989c39f056;p=python #20288: fix handling of invalid numeric charrefs in HTMLParser. --- diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 5a55e264af..3f97830a9a 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -195,9 +195,9 @@ class HTMLParser(markupbase.ParserBase): i = self.updatepos(i, k) continue else: - if ";" in rawdata[i:]: #bail by consuming &# - self.handle_data(rawdata[0:2]) - i = self.updatepos(i, 2) + if ";" in rawdata[i:]: # bail by consuming '&#' + self.handle_data(rawdata[i:i+2]) + i = self.updatepos(i, i+2) break elif startswith('&', i): match = entityref.match(rawdata, i) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 6a0e461829..cde2bd23b7 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -394,6 +394,12 @@ text ("data", "&#bad;"), ("endtag", "p"), ]) + # add the [] as a workaround to avoid buffering (see #20288) + self._run_check(["
<div>&#bad;</div>
"], [ + ("starttag", "div", []), + ("data", "&#bad;"), + ("endtag", "div"), + ]) def test_unescape_function(self): parser = HTMLParser.HTMLParser() diff --git a/Misc/NEWS b/Misc/NEWS index 67423709d9..ec12fbef9a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -38,6 +38,8 @@ Core and Builtins Library ------- +- Issue #20288: fix handling of invalid numeric charrefs in HTMLParser. + - Issue #19456: ntpath.join() now joins relative paths correctly when a drive is present.