#1486713: Add a tolerant mode to HTMLParser.

author R. David Murray <rdmurray@bitdance.com>

Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)

committer R. David Murray <rdmurray@bitdance.com>

Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)
author R. David Murray <rdmurray@bitdance.com>
Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)
committer R. David Murray <rdmurray@bitdance.com>
Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst

index 2bc655510b9c37386f08dcbfb33c3ae72c020284..743d183fb8791c138221533e772ca449daa7412e 100644 (file)
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -12,9 +12,13 @@
  This module defines a class :class:`HTMLParser` which serves as the basis for
  parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
  
-.. class:: HTMLParser()
+.. class:: HTMLParser(strict=True)
  
-   The :class:`HTMLParser` class is instantiated without arguments.
+   Create a parser instance.  If *strict* is ``True`` (the default), invalid
+   HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_.  If
+   *strict* is ``False``, the parser uses heuristics to make a best guess at
+   the intention of any invalid HTML it encounters, similar to the way most
+   browsers do.
  
     An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags
     begin and end.  The :class:`HTMLParser` class is meant to be overridden by the
@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the
     Encountered a html end tag
  
  
+.. rubric:: Footnotes
+
+.. [#] For backward compatibility reasons *strict* mode does not raise
+       exceptions for all non-compliant HTML.  That is, some invalid HTML
+       is tolerated even in *strict* mode.
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index c2c7f6bf5da5fcab99caf17e19a7a7daafed0c83..8d275ab315858f4c49b8ef593e9bb66dd41c0637 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -24,10 +24,14 @@ starttagopen = re.compile('<[a-zA-Z]')
  piclose = re.compile('>')
  commentclose = re.compile(r'--\s*>')
  tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
+# Note, the strict one of this pair isn't really strict, but we can't
+# make it correctly strict without breaking backward compatibility.
  attrfind = re.compile(
      r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
      r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
-
+attrfind_tolerant = re.compile(
+    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
  locatestarttagend = re.compile(r"""
    <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
    (?:\s+                             # whitespace before attribute name
@@ -42,6 +46,21 @@ locatestarttagend = re.compile(r"""
     )*
    \s*                                # trailing whitespace
  """, re.VERBOSE)
+locatestarttagend_tolerant = re.compile(r"""
+  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
+  (?:\s*                             # optional whitespace before attribute name
+    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
+      (?:\s*=\s*                     # value indicator
+        (?:'[^']*'                   # LITA-enclosed value
+          |\"[^\"]*\"                # LIT-enclosed value
+          |[^'\">\s]+                # bare value
+         )
+         (?:\s*,)*                   # possibly followed by a comma
+       )?
+     )
+   )*
+  \s*                                # trailing whitespace
+""", re.VERBOSE)
  endendtag = re.compile('>')
  endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  
@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):
  
      CDATA_CONTENT_ELEMENTS = ("script", "style")
  
+    def __init__(self, strict=True):
+        """Initialize and reset this instance.
  
-    def __init__(self):
-        """Initialize and reset this instance."""
+        If strict is set to True (the default), errors are raised when invalid
+        HTML is encountered.  If set to False, an attempt is instead made to
+        continue parsing, making "best guesses" about the intended meaning, in
+        a fashion similar to what browsers typically do.
+        """
+        self.strict = strict
          self.reset()
  
      def reset(self):
@@ -160,9 +185,18 @@ class HTMLParser(_markupbase.ParserBase):
                  else:
                      break
                  if k < 0:
-                    if end:
+                    if not end:
+                        break
+                    if self.strict:
                          self.error("EOF in middle of construct")
-                    break
+                    k = rawdata.find('>', i + 1)
+                    if k < 0:
+                        k = rawdata.find('<', i + 1)
+                        if k < 0:
+                            k = i + 1
+                    else:
+                        k += 1
+                    self.handle_data(rawdata[i:k])
                  i = self.updatepos(i, k)
              elif startswith("&#", i):
                  match = charref.match(rawdata, i)
@@ -193,7 +227,12 @@ class HTMLParser(_markupbase.ParserBase):
                  if match:
                      # match.group() will contain at least 2 chars
                      if end and match.group() == rawdata[i:]:
-                        self.error("EOF in middle of entity or char ref")
+                        if self.strict:
+                            self.error("EOF in middle of entity or char ref")
+                        else:
+                            if k <= i:
+                                k = n
+                            i = self.updatepos(i, i + 1)
                      # incomplete
                      break
                  elif (i + 1) < n:
@@ -240,7 +279,10 @@ class HTMLParser(_markupbase.ParserBase):
          self.lasttag = tag = rawdata[i+1:k].lower()
  
          while k < endpos:
-            m = attrfind.match(rawdata, k)
+            if self.strict:
+                m = attrfind.match(rawdata, k)
+            else:
+                m = attrfind_tolerant.search(rawdata, k)
              if not m:
                  break
              attrname, rest, attrvalue = m.group(1, 2, 3)
@@ -262,8 +304,11 @@ class HTMLParser(_markupbase.ParserBase):
                           - self.__starttag_text.rfind("\n")
              else:
                  offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            if self.strict:
+                self.error("junk characters in start tag: %r"
+                           % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
          if end.endswith('/>'):
              # XHTML-style empty tag: <span attr="value" />
              self.handle_startendtag(tag, attrs)
@@ -277,7 +322,10 @@ class HTMLParser(_markupbase.ParserBase):
      # or -1 if incomplete.
      def check_for_whole_start_tag(self, i):
          rawdata = self.rawdata
-        m = locatestarttagend.match(rawdata, i)
+        if self.strict:
+            m = locatestarttagend.match(rawdata, i)
+        else:
+            m = locatestarttagend_tolerant.match(rawdata, i)
          if m:
              j = m.end()
              next = rawdata[j:j+1]
@@ -290,8 +338,13 @@ class HTMLParser(_markupbase.ParserBase):
                      # buffer boundary
                      return -1
                  # else bogus input
-                self.updatepos(i, j + 1)
-                self.error("malformed empty start tag")
+                if self.strict:
+                    self.updatepos(i, j + 1)
+                    self.error("malformed empty start tag")
+                if j > i:
+                    return j
+                else:
+                    return i + 1
              if next == "":
                  # end of input
                  return -1
@@ -300,8 +353,13 @@ class HTMLParser(_markupbase.ParserBase):
                  # end of input in or before attribute value, or we have the
                  # '/' from a '/>' ending
                  return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if self.strict:
+                self.updatepos(i, j)
+                self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
          raise AssertionError("we should not get here!")
  
      # Internal -- parse endtag, return end or -1 if incomplete
@@ -314,7 +372,15 @@ class HTMLParser(_markupbase.ParserBase):
          j = match.end()
          match = endtagfind.match(rawdata, i) # </ + tag + >
          if not match:
-            self.error("bad end tag: %r" % (rawdata[i:j],))
+            if self.strict:
+                self.error("bad end tag: %r" % (rawdata[i:j],))
+            k = rawdata.find('<', i + 1, j)
+            if k > i:
+                j = k
+            if j <= i:
+                j = i + 1
+            self.handle_data(rawdata[i:j])
+            return j
          tag = match.group(1)
          self.handle_endtag(tag.lower())
          self.clear_cdata_mode()
@@ -358,7 +424,8 @@ class HTMLParser(_markupbase.ParserBase):
          pass
  
      def unknown_decl(self, data):
-        self.error("unknown declaration: %r" % (data,))
+        if self.strict:
+            self.error("unknown declaration: %r" % (data,))
  
      # Internal -- helper to remove special character quoting
      entitydefs = None
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index e982218dba11fe3e1f2ac12faf7840c4bbed2e04..beaf6b63a239ca5892bd47b200ecab34ee8ef3a6 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -8,10 +8,10 @@ from test import support
  
  class EventCollector(html.parser.HTMLParser):
  
-    def __init__(self):
+    def __init__(self, *args, **kw):
          self.events = []
          self.append = self.events.append
-        html.parser.HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self, *args, **kw)
  
      def get_events(self):
          # Normalize the list of events so that buffer artefacts don't
@@ -72,8 +72,10 @@ class EventCollectorExtra(EventCollector):
  
  class TestCaseBase(unittest.TestCase):
  
-    def _run_check(self, source, expected_events, collector=EventCollector):
-        parser = collector()
+    def _run_check(self, source, expected_events, collector=None):
+        if collector is None:
+            collector = EventCollector()
+        parser = collector
          for s in source:
              parser.feed(s)
          parser.close()
@@ -84,7 +86,7 @@ class TestCaseBase(unittest.TestCase):
                        "\nReceived:\n" + pprint.pformat(events))
  
      def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra)
+        self._run_check(source, events, EventCollectorExtra())
  
      def _parse_error(self, source):
          def parse(source=source):
@@ -321,8 +323,42 @@ DOCTYPE html [
                  ])
  
  
+class HTMLParserTolerantTestCase(TestCaseBase):
+
+    def setUp(self):
+        self.collector = EventCollector(strict=False)
+
+    def test_tolerant_parsing(self):
+        self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
+                        '<img src="URL><//img></html</html>', [
+                             ('data', '<html '),
+                             ('starttag', 'html', []),
+                             ('data', 'te>>xt'),
+                             ('entityref', 'a'),
+                             ('data', '<<bc'),
+                             ('endtag', 'a'),
+                             ('endtag', 'html'),
+                             ('data', '\n<img src="URL><//img></html'),
+                             ('endtag', 'html')],
+                        collector = self.collector)
+
+    def test_comma_between_attributes(self):
+        self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
+                        'method="post">', [
+                            ('starttag', 'form',
+                                [('action', '/xxx.php?a=1&b=2&amp'),
+                                 ('method', 'post')])],
+                        collector = self.collector)
+
+    def test_weird_chars_in_unquoted_attribute_values(self):
+        self._run_check('<form action=bogus|&#()value>', [
+                            ('starttag', 'form',
+                                [('action', 'bogus|&#()value')])],
+                        collector = self.collector)
+
+
  def test_main():
-    support.run_unittest(HTMLParserTestCase)
+    support.run_unittest(HTMLParserTestCase, HTMLParserTolerantTestCase)
  
  
  if __name__ == "__main__":
diff --git a/Misc/NEWS b/Misc/NEWS

index 434ed234a6153e4ce648a73aeb3bf2834eb77368..7c5d8d1caf456a0aa8fb84e7b956b4a0060e2434 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -58,6 +58,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #1486713: HTMLParser now has an optional tolerant mode where it
+  tries to guess at the correct parsing of invalid HTML.
+
  - Issue #10554: Add context manager support to subprocess.Popen objects.
  
  - Issue #8989: email.utils.make_msgid now has a domain parameter that can
author	R. David Murray <rdmurray@bitdance.com>
	Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)
committer	R. David Murray <rdmurray@bitdance.com>
	Fri, 3 Dec 2010 04:06:39 +0000 (04:06 +0000)
Doc/library/html.parser.rst		patch \| blob \| history
Lib/html/parser.py		patch \| blob \| history
Lib/test/test_htmlparser.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history