#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.

author Ezio Melotti <ezio.melotti@gmail.com>

Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst

index f3c36ec886719b8e2352f13df349e385e4148d47..4715185fcc7666c59399fb321ad774a8e4584626 100644 (file)
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -16,13 +16,14 @@
  This module defines a class :class:`HTMLParser` which serves as the basis for
  parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
  
-.. class:: HTMLParser(strict=True)
+.. class:: HTMLParser(strict=False)
  
-   Create a parser instance.  If *strict* is ``True`` (the default), invalid
-   HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_.  If
-   *strict* is ``False``, the parser uses heuristics to make a best guess at
-   the intention of any invalid HTML it encounters, similar to the way most
-   browsers do.  Using ``strict=False`` is advised.
+   Create a parser instance.  If *strict* is ``False`` (the default), the parser
+   will accept and parse invalid markup.  If *strict* is ``True`` the parser
+   will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
+   it's not able to parse the markup.
+   The use of ``strict=True`` is discouraged and the *strict* argument is
+   deprecated.
  
     An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
     when start tags, end tags, text, comments, and other markup elements are
@@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
  
     .. versionchanged:: 3.2 *strict* keyword added
  
+   .. deprecated-removed:: 3.3 3.5
+      The *strict* argument and the strict mode have been deprecated.
+      The parser is now able to accept and parse invalid markup too.
+
  An exception is defined as well:
  
  
@@ -46,6 +51,10 @@ An exception is defined as well:
     detected, and :attr:`offset` is the number of characters into the line at
     which the construct starts.
  
+   .. deprecated-removed:: 3.3 3.5
+      This exception has been deprecated because it's never raised by the parser
+      (when the default non-strict mode is used).
+
  
  Example HTML Parser Application
  -------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index de504ab54409314352461fa39517e7ab411a0be6..494cf24fd879eea47c77cff0a01dee53b631285c 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -10,6 +10,7 @@
  
  import _markupbase
  import re
+import warnings
  
  # Regular expressions used for parsing
  
@@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
  
      CDATA_CONTENT_ELEMENTS = ("script", "style")
  
-    def __init__(self, strict=True):
+    def __init__(self, strict=False):
          """Initialize and reset this instance.
  
-        If strict is set to True (the default), errors are raised when invalid
-        HTML is encountered.  If set to False, an attempt is instead made to
-        continue parsing, making "best guesses" about the intended meaning, in
-        a fashion similar to what browsers typically do.
+        If strict is set to False (the default) the parser will parse invalid
+        markup, otherwise it will raise an error.  Note that the strict mode
+        is deprecated.
          """
+        if strict:
+            warnings.warn("The strict mode is deprecated.",
+                          DeprecationWarning, stacklevel=2)
          self.strict = strict
          self.reset()
  
@@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
      # See also parse_declaration in _markupbase
      def parse_html_declaration(self, i):
          rawdata = self.rawdata
-        if rawdata[i:i+2] != '<!':
-            self.error('unexpected call to parse_html_declaration()')
+        assert rawdata[i:i+2] == '<!', ('unexpected call to '
+                                        'parse_html_declaration()')
          if rawdata[i:i+4] == '<!--':
              # this case is actually already handled in goahead()
              return self.parse_comment(i)
@@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
      # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
      def parse_bogus_comment(self, i, report=1):
          rawdata = self.rawdata
-        if rawdata[i:i+2] not in ('<!', '</'):
-            self.error('unexpected call to parse_comment()')
+        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
+                                                'parse_comment()')
          pos = rawdata.find('>', i+2)
          if pos == -1:
              return -1
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index c4f80cca30e25d363448386226dccd838f83ef01..64a4f5dfb4547887224b8bec271d3df5b5206206 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
  class HTMLParserStrictTestCase(TestCaseBase):
  
      def get_collector(self):
-        return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True)
  
      def test_processing_instruction_only(self):
          self._run_check("<?processing instruction>", [
@@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
  class AttributesStrictTestCase(TestCaseBase):
  
      def get_collector(self):
-        return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True)
  
      def test_attr_syntax(self):
          output = [
diff --git a/Misc/NEWS b/Misc/NEWS

index 0325058e39c9891f3b201921811473e6ce727e4b..d76aeebb2066d3632e6accb2aa5141f1a4abc1c0 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -43,6 +43,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
+  are deprecated now that the parser is able to parse invalid markup.
+
  - Issue #3665: \u and \U escapes are now supported in unicode regular
    expressions.  Patch by Serhiy Storchaka.
  
@@ -78,7 +81,7 @@ Library
  - Issue #9527: datetime.astimezone() method will now supply a class
    timezone instance corresponding to the system local timezone when
    called with no arguments.
-  
+
  - Issue #14653: email.utils.mktime_tz() no longer relies on system
    mktime() when timezone offest is supplied.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 23 Jun 2012 13:27:51 +0000 (15:27 +0200)
Doc/library/html.parser.rst		patch \| blob \| history
Lib/html/parser.py		patch \| blob \| history
Lib/test/test_htmlparser.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history