#13987: HTMLParser is now able to handle malformed start tags.

author Ezio Melotti <ezio.melotti@gmail.com>

Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py

index d2268d02cd0d447d117ba3a24d14a005be5e7c5b..5081a62562c0758a964d8f0a42b41f6f657a99c5 100644 (file)
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -315,8 +315,8 @@ class HTMLParser(markupbase.ParserBase):
                           - self.__starttag_text.rfind("\n")
              else:
                  offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
          if end.endswith('/>'):
              # XHTML-style empty tag: <span attr="value" />
              self.handle_startendtag(tag, attrs)
@@ -353,8 +353,10 @@ class HTMLParser(markupbase.ParserBase):
                  # end of input in or before attribute value, or we have the
                  # '/' from a '/>' ending
                  return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
          raise AssertionError("we should not get here!")
  
      # Internal -- parse endtag, return end or -1 if incomplete
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index ba775abdac1bf6818a8548fd66eab1039a0352dd..8136bca3e28ee49665fee244e60505a20bb7bc21 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -206,7 +206,8 @@ text
          self._run_check("</$>", [('comment', '$')])
          self._run_check("</", [('data', '</')])
          self._run_check("</a", [('data', '</a')])
-        self._parse_error("<a<a>")
+        # XXX this might be wrong
+        self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
          self._run_check("</a<a>", [('endtag', 'a<a')])
          self._run_check("<!", [('data', '<!')])
          self._run_check("<a", [('data', '<a')])
diff --git a/Misc/NEWS b/Misc/NEWS

index 825752134aad5827a3bd676d8b86eddd32b36a12..7fc224ab6c413d095051a0ce7c16df38971c12a0 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,7 +94,7 @@ Library
  -------
  
  - Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
-  construct.
+  construct and malformed start tags.
  
  - Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
    Patch by Suman Saha.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Wed, 15 Feb 2012 11:19:10 +0000 (13:19 +0200)
Lib/HTMLParser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history