#20288: fix handling of invalid numeric charrefs in HTMLParser.

author Ezio Melotti <ezio.melotti@gmail.com>

Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 2d3bef351b0ad0fe2285166908dec2ba4f9070ca..63fe77425bdc45457243c373984c6666fe8ce57b 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -228,9 +228,9 @@ class HTMLParser(_markupbase.ParserBase):
                      i = self.updatepos(i, k)
                      continue
                  else:
-                    if ";" in rawdata[i:]: #bail by consuming &#
-                        self.handle_data(rawdata[0:2])
-                        i = self.updatepos(i, 2)
+                    if ";" in rawdata[i:]:  # bail by consuming &#
+                        self.handle_data(rawdata[i:i+2])
+                        i = self.updatepos(i, i+2)
                      break
              elif startswith('&', i):
                  match = entityref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index c977a9dd4d7111a3e1794bbadf007f5b73aeb390..11d9c9ce8b8c1a4c60250f57226d1d19f38eca33 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -151,6 +151,12 @@ text
              ("data", "&#bad;"),
              ("endtag", "p"),
          ])
+        # add the [] as a workaround to avoid buffering (see #20288)
+        self._run_check(["<div>&#bad;</div>"], [
+            ("starttag", "div", []),
+            ("data", "&#bad;"),
+            ("endtag", "div"),
+        ])
  
      def test_unclosed_entityref(self):
          self._run_check("&entityref foo", [
diff --git a/Misc/NEWS b/Misc/NEWS

index 5a84af86cd779ef2ecd8fb8cd7d44182818f1a7c..d2efc232a2d50027b2735247fcc9ff164c80c4d0 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,8 @@ Core and Builtins
  Library
  -------
  
+- Issue #20288: fix handling of invalid numeric charrefs in HTMLParser.
+
  - Issue #20424: Python implementation of io.StringIO now supports lone surrogates.
  
  - Issue #19456: ntpath.join() now joins relative paths correctly when a drive
author	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 1 Feb 2014 19:21:01 +0000 (21:21 +0200)
Lib/html/parser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history