#13633: Added a new convert_charrefs keyword arg to HTMLParser that, when True, automatically converts all character references.

author Ezio Melotti <ezio.melotti@gmail.com>

Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst

index 0ea964457c2bc1494499ec935c8373b2f59c2749..44b7d6ea6d282bb9ce0c608c19782f79826a2803 100644 (file)
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -16,14 +16,21 @@
  This module defines a class :class:`HTMLParser` which serves as the basis for
  parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
  
-.. class:: HTMLParser(strict=False)
+.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
  
-   Create a parser instance.  If *strict* is ``False`` (the default), the parser
-   will accept and parse invalid markup.  If *strict* is ``True`` the parser
-   will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
-   it's not able to parse the markup.
-   The use of ``strict=True`` is discouraged and the *strict* argument is
-   deprecated.
+   Create a parser instance.
+
+   If *convert_charrefs* is ``True`` (default: ``False``), all character
+   references (except the ones in ``script``/``style`` elements) are
+   automatically converted to the corresponding Unicode characters.
+   The use of ``convert_charrefs=True`` is encouraged and will become
+   the default in Python 3.5.
+
+   If *strict* is ``False`` (the default), the parser will accept and parse
+   invalid markup.  If *strict* is ``True`` the parser will raise an
+   :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
+   able to parse the markup.  The use of ``strict=True`` is discouraged and
+   the *strict* argument is deprecated.
  
     An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
     when start tags, end tags, text, comments, and other markup elements are
@@ -34,12 +41,15 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
     handler for elements which are closed implicitly by closing an outer element.
  
     .. versionchanged:: 3.2
-      *strict* keyword added.
+      *strict* argument added.
  
     .. deprecated-removed:: 3.3 3.5
        The *strict* argument and the strict mode have been deprecated.
        The parser is now able to accept and parse invalid markup too.
  
+   .. versionchanged:: 3.4
+      *convert_charrefs* keyword argument added.
+
  An exception is defined as well:
  
  
@@ -181,7 +191,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
  
     This method is called to process a named character reference of the form
     ``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
-   (e.g. ``'gt'``).
+   (e.g. ``'gt'``).  This method is never called if *convert_charrefs* is
+   ``True``.
  
  
  .. method:: HTMLParser.handle_charref(name)
@@ -189,7 +200,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
     This method is called to process decimal and hexadecimal numeric character
     references of the form ``&#NNN;`` and ``&#xNNN;``.  For example, the decimal
     equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
-   in this case the method will receive ``'62'`` or ``'x3E'``.
+   in this case the method will receive ``'62'`` or ``'x3E'``.  This method
+   is never called if *convert_charrefs* is ``True``.
  
  
  .. method:: HTMLParser.handle_comment(data)
@@ -324,7 +336,8 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
     Num ent  : >
  
  Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
-:meth:`~HTMLParser.handle_data` might be called more than once::
+:meth:`~HTMLParser.handle_data` might be called more than once
+(unless *convert_charrefs* is set to ``True``)::
  
     >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
     ...     parser.feed(chunk)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index a228e8ed370f218904932dcd26297d42780e4a14..12c28b8339ffa25711dd59a6bc89c627a4fca4b9 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -97,7 +97,7 @@ class HTMLParseError(Exception):
          return result
  
  
-_strict_sentinel = object()
+_default_sentinel = object()
  
  class HTMLParser(_markupbase.ParserBase):
      """Find tags and other markup and call handler functions.
@@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase):
      self.handle_startendtag(); end tags by self.handle_endtag().  The
      data between tags is passed from the parser to the derived class
      by calling self.handle_data() with the data as argument (the data
-    may be split up in arbitrary chunks).  Entity references are
-    passed by calling self.handle_entityref() with the entity
-    reference as the argument.  Numeric character references are
-    passed to self.handle_charref() with the string containing the
-    reference as the argument.
+    may be split up in arbitrary chunks).  If convert_charrefs is
+    True the character references are converted automatically to the
+    corresponding Unicode character (and self.handle_data() is no
+    longer split in chunks), otherwise they are passed by calling
+    self.handle_entityref() or self.handle_charref() with the string
+    containing respectively the named or numeric reference as the
+    argument.
      """
  
      CDATA_CONTENT_ELEMENTS = ("script", "style")
  
-    def __init__(self, strict=_strict_sentinel):
+    def __init__(self, strict=_default_sentinel, *,
+                 convert_charrefs=_default_sentinel):
          """Initialize and reset this instance.
  
+        If convert_charrefs is True (default: False), all character references
+        are automatically converted to the corresponding Unicode characters.
          If strict is set to False (the default) the parser will parse invalid
          markup, otherwise it will raise an error.  Note that the strict mode
          and argument are deprecated.
          """
-        if strict is not _strict_sentinel:
+        if strict is not _default_sentinel:
              warnings.warn("The strict argument and mode are deprecated.",
                            DeprecationWarning, stacklevel=2)
          else:
              strict = False  # default
          self.strict = strict
+        if convert_charrefs is _default_sentinel:
+            convert_charrefs = False  # default
+            warnings.warn("The value of convert_charrefs will become True in "
+                          "3.5. You are encouraged to set the value explicitly.",
+                          DeprecationWarning, stacklevel=2)
+        self.convert_charrefs = convert_charrefs
          self.reset()
  
      def reset(self):
@@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase):
          i = 0
          n = len(rawdata)
          while i < n:
-            match = self.interesting.search(rawdata, i) # < or &
-            if match:
-                j = match.start()
+            if self.convert_charrefs and not self.cdata_elem:
+                j = rawdata.find('<', i)
+                if j < 0:
+                    if not end:
+                        break  # wait till we get all the text
+                    j = n
              else:
-                if self.cdata_elem:
-                    break
-                j = n
-            if i < j: self.handle_data(rawdata[i:j])
+                match = self.interesting.search(rawdata, i)  # < or &
+                if match:
+                    j = match.start()
+                else:
+                    if self.cdata_elem:
+                        break
+                    j = n
+            if i < j:
+                if self.convert_charrefs and not self.cdata_elem:
+                    self.handle_data(unescape(rawdata[i:j]))
+                else:
+                    self.handle_data(rawdata[i:j])
              i = self.updatepos(i, j)
              if i == n: break
              startswith = rawdata.startswith
@@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase):
                              k = i + 1
                      else:
                          k += 1
-                    self.handle_data(rawdata[i:k])
+                    if self.convert_charrefs and not self.cdata_elem:
+                        self.handle_data(unescape(rawdata[i:k]))
+                    else:
+                        self.handle_data(rawdata[i:k])
                  i = self.updatepos(i, k)
              elif startswith("&#", i):
                  match = charref.match(rawdata, i)
@@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase):
                  assert 0, "interesting.search() lied"
          # end while
          if end and i < n and not self.cdata_elem:
-            self.handle_data(rawdata[i:n])
+            if self.convert_charrefs and not self.cdata_elem:
+                self.handle_data(unescape(rawdata[i:n]))
+            else:
+                self.handle_data(rawdata[i:n])
              i = self.updatepos(i, n)
          self.rawdata = rawdata[i:]
  
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 509b3cdcbe1d5d50a31fdf057c5e94dea45e0b80..1a480c818721278b57d783ee39df965381078f79 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector):
          self.append(("starttag_text", self.get_starttag_text()))
  
  
+class EventCollectorCharrefs(EventCollector):
+
+    def get_events(self):
+        return self.events
+
+    def handle_charref(self, data):
+        self.fail('This should never be called with convert_charrefs=True')
+
+    def handle_entityref(self, data):
+        self.fail('This should never be called with convert_charrefs=True')
+
+
  class TestCaseBase(unittest.TestCase):
  
      def get_collector(self):
@@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase):
          parser.close()
          events = parser.get_events()
          if events != expected_events:
-            self.fail("received events did not match expected events\n"
-                      "Expected:\n" + pprint.pformat(expected_events) +
+            self.fail("received events did not match expected events" +
+                      "\nSource:\n" + repr(source) +
+                      "\nExpected:\n" + pprint.pformat(expected_events) +
                        "\nReceived:\n" + pprint.pformat(events))
  
      def _run_check_extra(self, source, events):
-        self._run_check(source, events, EventCollectorExtra())
+        self._run_check(source, events,
+                        EventCollectorExtra(convert_charrefs=False))
  
      def _parse_error(self, source):
          def parse(source=source):
@@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase):
  
      def get_collector(self):
          with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+            return EventCollector(strict=True, convert_charrefs=False)
  
      def test_processing_instruction_only(self):
          self._run_check("<?processing instruction>", [
@@ -335,7 +349,7 @@ text
              self._run_check(s, [("starttag", element_lower, []),
                                  ("data", content),
                                  ("endtag", element_lower)],
-                            collector=Collector())
+                            collector=Collector(convert_charrefs=False))
  
      def test_comments(self):
          html = ("<!-- I'm a valid comment -->"
@@ -363,13 +377,53 @@ text
                      ('comment', '[if lte IE 7]>pretty?<![endif]')]
          self._run_check(html, expected)
  
+    def test_convert_charrefs(self):
+        collector = lambda: EventCollectorCharrefs(convert_charrefs=True)
+        self.assertTrue(collector().convert_charrefs)
+        charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
+        # check charrefs in the middle of the text/attributes
+        expected = [('starttag', 'a', [('href', 'foo"zar')]),
+                    ('data', 'a"z'), ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+                            expected, collector=collector())
+        # check charrefs at the beginning/end of the text/attributes
+        expected = [('data', '"'),
+                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+                    ('data', '"'), ('endtag', 'a'), ('data', '"')]
+        for charref in charrefs:
+            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+                            '{0}</a>{0}'.format(charref),
+                            expected, collector=collector())
+        # check charrefs in <script>/<style> elements
+        for charref in charrefs:
+            text = 'X'.join([charref]*3)
+            expected = [('data', '"'),
+                        ('starttag', 'script', []), ('data', text),
+                        ('endtag', 'script'), ('data', '"'),
+                        ('starttag', 'style', []), ('data', text),
+                        ('endtag', 'style'), ('data', '"')]
+            self._run_check('{1}<script>{0}</script>{1}'
+                            '<style>{0}</style>{1}'.format(text, charref),
+                            expected, collector=collector())
+        # check truncated charrefs at the end of the file
+        html = '&quo &# &#x'
+        for x in range(1, len(html)):
+            self._run_check(html[:x], [('data', html[:x])],
+                            collector=collector())
+        # check a string with no charrefs
+        self._run_check('no charrefs here', [('data', 'no charrefs here')],
+                        collector=collector())
+
  
  class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
  
      def get_collector(self):
-        return EventCollector()
+        return EventCollector(convert_charrefs=False)
  
      def test_deprecation_warnings(self):
+        with self.assertWarns(DeprecationWarning):
+            EventCollector()  # convert_charrefs not passed explicitly
          with self.assertWarns(DeprecationWarning):
              EventCollector(strict=True)
          with self.assertWarns(DeprecationWarning):
@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase):
  
      def get_collector(self):
          with support.check_warnings(("", DeprecationWarning), quite=False):
-            return EventCollector(strict=True)
+            return EventCollector(strict=True, convert_charrefs=False)
  
      def test_attr_syntax(self):
          output = [
@@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase):
  class AttributesTolerantTestCase(AttributesStrictTestCase):
  
      def get_collector(self):
-        return EventCollector()
+        return EventCollector(convert_charrefs=False)
  
      def test_attr_funky_names2(self):
          self._run_check(
diff --git a/Misc/NEWS b/Misc/NEWS

index 3d5cd7e9a632000447d28170be5f26aaee944f34..336c3f4332a8271f0a28744521b4a1cae24293d1 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -132,6 +132,9 @@ Library
  - Issue #19449: in csv's writerow, handle non-string keys when generating the
    error message that certain keys are not in the 'fieldnames' list.
  
+- Issue #13633: Added a new convert_charrefs keyword arg to HTMLParser that,
+  when True, automatically converts all character references.
+
  - Issue #2927: Added the unescape() function to the html module.
  
  - Issue #8402: Added the escape() function to the glob module.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Sat, 23 Nov 2013 17:52:05 +0000 (19:52 +0200)
Doc/library/html.parser.rst		patch \| blob \| history
Lib/html/parser.py		patch \| blob \| history
Lib/test/test_htmlparser.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history