#13358: HTMLParser now calls handle_data only once for each CDATA.

author Ezio Melotti <ezio.melotti@gmail.com>

Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 662e85575a4219df97e876e094adc963f9127f20..dd9c2e14862eae4a3df3ec4df9c22586ba9b49dc 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -14,7 +14,6 @@ import re
  # Regular expressions used for parsing
  
  interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
  incomplete = re.compile('&[a-zA-Z#]')
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
          return self.__starttag_text
  
      def set_cdata_mode(self, elem):
-        self.interesting = interesting_cdata
          self.cdata_elem = elem.lower()
+        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
  
      def clear_cdata_mode(self):
          self.interesting = interesting_normal
@@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
              if match:
                  j = match.start()
              else:
+                if self.cdata_elem:
+                    break
                  j = n
              if i < j: self.handle_data(rawdata[i:j])
              i = self.updatepos(i, j)
@@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
              else:
                  assert 0, "interesting.search() lied"
          # end while
-        if end and i < n:
+        if end and i < n and not self.cdata_elem:
              self.handle_data(rawdata[i:n])
              i = self.updatepos(i, n)
          self.rawdata = rawdata[i:]
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 1ce4594a44a65145ee12b2fb6561fab1969eed9e..87b5060611d3bfe9213c109bba7262357efcb6ea 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -301,7 +301,27 @@ DOCTYPE html [
                                      ("data", content),
                                      ("endtag", element_lower)])
  
-
+    def test_cdata_with_closing_tags(self):
+        # see issue #13358
+        # make sure that HTMLParser calls handle_data only once for each CDATA.
+        # The normal event collector normalizes the events in get_events,
+        # so we override it to return the original list of events.
+        class Collector(EventCollector):
+            def get_events(self):
+                return self.events
+
+        content = """<!-- not a comment --> &not-an-entity-ref;
+                  <a href="" /> </p><p> <span></span></style>
+                  '</script' + '>'"""
+        for element in [' script', 'script ', ' script ',
+                        '\nscript', 'script\n', '\nscript\n']:
+            element_lower = element.lower().strip()
+            s = '<script>{content}</{element}>'.format(element=element,
+                                                       content=content)
+            self._run_check(s, [("starttag", element_lower, []),
+                                ("data", content),
+                                ("endtag", element_lower)],
+                            collector=Collector())
  
  class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
  
diff --git a/Misc/NEWS b/Misc/NEWS

index 8a82b099ff85d09e1b0344a0da5128e071c8ed85..513984d6dee11f394ab26d752d3ae55ee74c0174 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -76,6 +76,8 @@ Core and Builtins
  Library
  -------
  
+- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
+
  - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
    node when it is the only child of an element.  Initial patch by Dan
    Kenigsberg.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Fri, 18 Nov 2011 16:01:49 +0000 (18:01 +0200)
Lib/html/parser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history