#23144: Make sure that HTMLParser.feed() returns all the data, even when convert_charrefs is True.

author Ezio Melotti <ezio.melotti@gmail.com>

Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)

committer Ezio Melotti <ezio.melotti@gmail.com>

Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)
author Ezio Melotti <ezio.melotti@gmail.com>
Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)
committer Ezio Melotti <ezio.melotti@gmail.com>
Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index a650d5eeded76200ff413aec76b1022cfffde5f6..9ae31b9128d96f587621512aea5d0bcbecd94675 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -198,7 +198,15 @@ class HTMLParser(_markupbase.ParserBase):
              if self.convert_charrefs and not self.cdata_elem:
                  j = rawdata.find('<', i)
                  if j < 0:
-                    if not end:
+                    # if we can't find the next <, either we are at the end
+                    # or there's more text incoming.  If the latter is True,
+                    # we can't pass the text to handle_data in case we have
+                    # a charref cut in half at end.  Try to determine if
+                    # this is the case before proceeding by looking for an
+                    # & near the end and see if it's followed by a space or ;.
+                    amppos = rawdata.rfind('&', max(i, n-34))
+                    if (amppos >= 0 and
+                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                          break  # wait till we get all the text
                      j = n
              else:
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 2d771a2a974c572cf05c1dbeeb42c9ec68c54a18..144f820af29c2262d1b883a25f9deec2a9b40615 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -72,9 +72,6 @@ class EventCollectorExtra(EventCollector):
  
  class EventCollectorCharrefs(EventCollector):
  
-    def get_events(self):
-        return self.events
-
      def handle_charref(self, data):
          self.fail('This should never be called with convert_charrefs=True')
  
@@ -685,6 +682,18 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
          ]
          self._run_check(html, expected)
  
+    def test_convert_charrefs_dropped_text(self):
+        # #23144: make sure that all the events are triggered when
+        # convert_charrefs is True, even if we don't call .close()
+        parser = EventCollector(convert_charrefs=True)
+        # before the fix, bar & baz was missing
+        parser.feed("foo <a>link</a> bar &amp; baz")
+        self.assertEqual(
+            parser.get_events(),
+            [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
+             ('endtag', 'a'), ('data', ' bar & baz')]
+        )
+
  
  class AttributesStrictTestCase(TestCaseBase):
  
diff --git a/Misc/NEWS b/Misc/NEWS

index dd175fead46e030d58893938c2bb7a1795a7d149..7a3c22c4832f01b86fddff47b0d47b6370c0df29 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1,4 +1,4 @@
-+++++++++++
++++++++++++
  Python News
  +++++++++++
  
@@ -81,6 +81,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #23144: Make sure that HTMLParser.feed() returns all the data, even
+  when convert_charrefs is True.
+
  - Issue #16180: Exit pdb if file has syntax error, instead of trapping user
    in an infinite loop.  Patch by Xavier de Gaye.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Sun, 6 Sep 2015 18:38:06 +0000 (21:38 +0300)
Lib/html/parser.py		patch \| blob \| history
Lib/test/test_htmlparser.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history