Deal more appropriately with bare ampersands and pointy brackets; this ...

author Fred Drake <fdrake@acm.org>

Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)

committer Fred Drake <fdrake@acm.org>

Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)
author Fred Drake <fdrake@acm.org>
Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)
committer Fred Drake <fdrake@acm.org>
Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py

index 39a5d8262ff1bb5cbe4e5f6fd6499e655daef2e7..954ce2647f570a9c9a3cd13e29cc817cf59bf536 100644 (file)
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -15,7 +15,8 @@ import string
  
  interesting_normal = re.compile('[&<]')
  interesting_cdata = re.compile(r'<(/|\Z)')
-incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
+incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
+                        '|#([0-9]*|[xX][0-9a-fA-F]*))?')
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -185,11 +186,8 @@ class HTMLParser:
                  elif declopen.match(rawdata, i): # <!
                      k = self.parse_declaration(i)
                  else:
-                    if i < n-1:
-                        raise HTMLParseError(
-                            "invalid '<' construct: %s" % `rawdata[i:i+2]`,
-                            self.getpos())
-                    k = -1
+                    self.handle_data("<")
+                    k = i + 1
                  if k < 0:
                      if end:
                          raise HTMLParseError("EOF in middle of construct",
@@ -203,7 +201,7 @@ class HTMLParser:
                      self.handle_charref(name)
                      k = match.end()
                      if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                      i = self.updatepos(i, k)
                      continue
                  match = entityref.match(rawdata, i)
@@ -212,17 +210,19 @@ class HTMLParser:
                      self.handle_entityref(name)
                      k = match.end()
                      if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                      i = self.updatepos(i, k)
                      continue
-                if incomplete.match(rawdata, i):
-                    if end:
+                match = incomplete.match(rawdata, i)
+                if match:
+                    rest = rawdata[i:]
+                    if end and rest != "&" and match.group() == rest:
                          raise HTMLParseError(
                              "EOF in middle of entity or char ref",
                              self.getpos())
                      return -1 # incomplete
-                raise HTMLParseError("'&' not part of entity or char ref",
-                                     self.getpos())
+                self.handle_data("&")
+                i = self.updatepos(i, i + 1)
              else:
                  assert 0, "interesting.search() lied"
          # end while
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index e0e212c27a5453d9913ce10225c05227ee78ebaa..bb6e0b0fe528dee83653f88b8344cb68633e4eb9 100755 (executable)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -1,6 +1,7 @@
  """Tests for HTMLParser.py."""
  
  import HTMLParser
+import pprint
  import sys
  import test_support
  import unittest
@@ -83,9 +84,10 @@ class TestCaseBase(unittest.TestCase):
          for c in self.epilogue:
              parser.feed(c)
          parser.close()
-        self.assert_(parser.get_events() ==
-                     self.initial_events + events + self.final_events,
-                     parser.get_events())
+        events = parser.get_events()
+        self.assertEqual(events,
+                         self.initial_events + events + self.final_events,
+                         "got events:\n" + pprint.pformat(events))
  
      def _run_check_extra(self, source, events):
          self._run_check(source, events, EventCollectorExtra)
@@ -137,6 +139,18 @@ text
      ("data", "\n"),
      ])
  
+    def test_doctype_decl(self):
+        inside = """\
+DOCTYPE html [
+  <!ELEMENT html - O EMPTY>
+  <!ATTLIST html
+      version CDATA #IMPLIED '4.0'>
+  <!-- comment -->
+]"""
+        self._run_check("<!%s>" % inside, [
+            ("decl", inside),
+            ])
+
      def test_bad_nesting(self):
          # Strangely, this *is* supposed to test that overlapping
          # elements are allowed.  HTMLParser is more geared toward
@@ -148,6 +162,16 @@ text
              ("endtag", "b"),
              ])
  
+    def test_bare_ampersands(self):
+        self._run_check("this text & contains & ampersands &", [
+            ("data", "this text & contains & ampersands &"),
+            ])
+
+    def test_bare_pointy_brackets(self):
+        self._run_check("this < text > contains < bare>pointy< brackets", [
+            ("data", "this < text > contains < bare>pointy< brackets"),
+            ])
+
      def test_attr_syntax(self):
          output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
@@ -199,16 +223,12 @@ text
          self._run_check(["<a b='>'", ">"], output)
  
      def test_starttag_junk_chars(self):
-        self._parse_error("<")
-        self._parse_error("<>")
          self._parse_error("</>")
          self._parse_error("</$>")
          self._parse_error("</")
          self._parse_error("</a")
          self._parse_error("<a<a>")
          self._parse_error("</a<a>")
-        self._parse_error("<$")
-        self._parse_error("<$>")
          self._parse_error("<!")
          self._parse_error("<a $>")
          self._parse_error("<a")
author	Fred Drake <fdrake@acm.org>
	Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)
committer	Fred Drake <fdrake@acm.org>
	Mon, 20 Aug 2001 21:24:19 +0000 (21:24 +0000)
Lib/HTMLParser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history