From 65d36dab4d915eb9fada52b867301b546e840fae Mon Sep 17 00:00:00 2001
From: Ezio Melotti <ezio.melotti@gmail.com>
Date: Wed, 15 Feb 2012 13:19:10 +0200
Subject: [PATCH] #13987: HTMLParser is now able to handle malformed start
 tags.

---
 Lib/HTMLParser.py           | 10 ++++++----
 Lib/test/test_htmlparser.py |  3 ++-
 Misc/NEWS                   |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index d2268d02cd..5081a62562 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -315,8 +315,8 @@ class HTMLParser(markupbase.ParserBase):
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
@@ -353,8 +353,10 @@ class HTMLParser(markupbase.ParserBase):
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
         raise AssertionError("we should not get here!")
 
     # Internal -- parse endtag, return end or -1 if incomplete
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index ba775abdac..8136bca3e2 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -206,7 +206,8 @@ text
         self._run_check("</$>", [('comment', '$')])
         self._run_check("</", [('data', '</')])
         self._run_check("</a", [('data', '</a')])
-        self._parse_error("<a<a>")
+        # XXX might be wrong; cf. '</a<a>' below, parsed as one tag 'a<a'
+        self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
         self._run_check("</a<a>", [('endtag', 'a<a')])
         self._run_check("<!", [('data', '<!')])
         self._run_check("<a", [('data', '<a')])
diff --git a/Misc/NEWS b/Misc/NEWS
index 825752134a..7fc224ab6c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -94,7 +94,7 @@ Library
 -------
 
 - Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
-  construct.
+  construct and malformed start tags.
 
 - Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
   Patch by Suman Saha.
-- 
2.50.1