#13987: HTMLParser is now able to handle EOFs in the middle of a construct.

author Ezio Melotti <ezio.melotti@gmail.com>

Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py

index f230c5f163f8f380f0c4c034c5f9bf796865f9c7..d2268d02cd0d447d117ba3a24d14a005be5e7c5b 100644 (file)
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -170,9 +170,16 @@ class HTMLParser(markupbase.ParserBase):
                  else:
                      break
                  if k < 0:
-                    if end:
-                        self.error("EOF in middle of construct")
-                    break
+                    if not end:
+                        break
+                    k = rawdata.find('>', i + 1)
+                    if k < 0:
+                        k = rawdata.find('<', i + 1)
+                        if k < 0:
+                            k = i + 1
+                    else:
+                        k += 1
+                    self.handle_data(rawdata[i:k])
                  i = self.updatepos(i, k)
              elif startswith("&#", i):
                  match = charref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 66675127850d1858dc0b788ba90192b34670eb20..ba775abdac1bf6818a8548fd66eab1039a0352dd 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -204,16 +204,16 @@ text
      def test_starttag_junk_chars(self):
          self._run_check("</>", [])
          self._run_check("</$>", [('comment', '$')])
-        self._parse_error("</")
-        self._parse_error("</a")
+        self._run_check("</", [('data', '</')])
+        self._run_check("</a", [('data', '</a')])
          self._parse_error("<a<a>")
          self._run_check("</a<a>", [('endtag', 'a<a')])
-        self._parse_error("<!")
-        self._parse_error("<a")
-        self._parse_error("<a foo='bar'")
-        self._parse_error("<a foo='bar")
-        self._parse_error("<a foo='>'")
-        self._parse_error("<a foo='>")
+        self._run_check("<!", [('data', '<!')])
+        self._run_check("<a", [('data', '<a')])
+        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
+        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
+        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
+        self._run_check("<a foo='>", [('data', "<a foo='>")])
  
      def test_valid_doctypes(self):
          # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
diff --git a/Misc/NEWS b/Misc/NEWS

index ba09480640a59ad9c5bb4879c1906d8442d791bf..825752134aad5827a3bd676d8b86eddd32b36a12 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -93,6 +93,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
+  construct.
+
  - Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
    Patch by Suman Saha.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 15 Feb 2012 10:44:23 +0000 (12:44 +0200)
Lib/HTMLParser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history