From: Victor Stinner Date: Mon, 24 May 2010 21:33:24 +0000 (+0000) Subject: Issue #6662: Fix parsing of malformatted charref (&#bad;) X-Git-Tag: v2.7rc1~95 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=554a3b82e40573846f893ffdfff230e1d908af57;p=python Issue #6662: Fix parsing of malformatted charref (&#bad;) --- diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 2cbc2ecbc7..7cee47a7c5 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -175,6 +175,9 @@ class HTMLParser(markupbase.ParserBase): i = self.updatepos(i, k) continue else: + if ";" in rawdata[i:]: #bail by consuming &# + self.handle_data(rawdata[0:2]) + i = self.updatepos(i, 2) break elif startswith('&', i): match = entityref.match(rawdata, i) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 810af6c8cb..c45cf00ece 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -313,6 +313,13 @@ DOCTYPE html [ ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) ]) + def test_malformatted_charref(self): + self._run_check("

<p>&#bad;</p>

", [ + ("starttag", "p", []), + ("data", "&#bad;"), + ("endtag", "p"), + ]) + def test_main(): test_support.run_unittest(HTMLParserTestCase) diff --git a/Misc/NEWS b/Misc/NEWS index b094126889..e2f0f0ccf8 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -29,6 +29,8 @@ C-API Library ------- +- Issue #6662: Fix parsing of malformatted charref (&#bad;) + - Issue #8016: Add the CP858 codec. - Issue #3924: Ignore cookies with invalid "version" field in cookielib.