]> granicus.if.org Git - python/commitdiff
- SF bug #853506: IP6 address parsing in sgmllib
author: Fred Drake <fdrake@acm.org>
Fri, 23 Jun 2006 06:03:45 +0000 (06:03 +0000)
committer: Fred Drake <fdrake@acm.org>
Fri, 23 Jun 2006 06:03:45 +0000 (06:03 +0000)
  ('[' and ']' were not accepted in unquoted attribute values)

- cleaned up tests of character and entity reference decoding so the
  tests cover the documented relationships among handle_charref,
  handle_entityref, convert_charref, convert_codepoint, and
  convert_entityref, without bringing up Unicode issues that sgmllib
  cannot be involved in

Lib/sgmllib.py
Lib/test/test_sgmllib.py

index 194396bb5d45483b2eb3467f54920f2057dde751..3ab57c23071707b0a23c29e58f35afaa09306f37 100644 (file)
@@ -33,7 +33,7 @@ endbracket = re.compile('[<>]')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
 
 
 class SGMLParseError(RuntimeError):
@@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase):
 
     def handle_charref(self, name):
         """Handle character reference, no need to override."""
-        replacement = convert_charref(name)
+        replacement = self.convert_charref(name)
         if replacement is None:
             self.unknown_charref(name)
         else:
-            self.handle_data(convert_charref(name))
+            self.handle_data(replacement)
 
     # Definition of entities -- derived classes may override
     entitydefs = \
index 31b54de384525f5f076f3632657b35cfbe9861f2..076df37efb2d3487367e99f0e7af234da80c1222 100644 (file)
@@ -1,4 +1,6 @@
+import htmlentitydefs
 import pprint
+import re
 import sgmllib
 import unittest
 from test import test_support
@@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector):
 
 
 class HTMLEntityCollector(EventCollector):
-    import re, htmlentitydefs
+
     entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
         '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
 
     def convert_charref(self, name):
         self.append(("charref", "convert", name))
-        if name.startswith('x'):
-            return unichr(int(name[1:],16))
-        else:
-            return unichr(int(name))
+        if name[0] != "x":
+            return EventCollector.convert_charref(self, name)
+
+    def convert_codepoint(self, codepoint):
+        self.append(("codepoint", "convert", codepoint))
+        EventCollector.convert_codepoint(self, codepoint)
 
     def convert_entityref(self, name):
         self.append(("entityref", "convert", name))
-        return unichr(self.htmlentitydefs.name2codepoint[name])
+        return EventCollector.convert_entityref(self, name)
+
+    # These record that they were called, then pass the call along
+    # to the default implementation so that its actions can be
+    # recorded.
+
+    def handle_charref(self, data):
+        self.append(("charref", data))
+        sgmllib.SGMLParser.handle_charref(self, data)
+
+    def handle_entityref(self, data):
+        self.append(("entityref", data))
+        sgmllib.SGMLParser.handle_entityref(self, data)
 
 
 class SGMLParserTestCase(unittest.TestCase):
@@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
                                 ])])
 
     def test_convert_overrides(self):
+        # This checks that the character and entity reference
+        # conversion helpers are called at the documented times.  No
+        # attempt is made to really change what the parser accepts.
+        #
         self.collector = HTMLEntityCollector
-        self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
+        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
+                           '&foobar;&#42;'), [
             ('entityref', 'convert', 'ldquo'),
             ('charref', 'convert', 'x201d'),
-            ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
+            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
             ('data', 'foo'),
             ('endtag', 'a'),
+            ('entityref', 'foobar'),
+            ('entityref', 'convert', 'foobar'),
+            ('charref', '42'),
+            ('charref', 'convert', '42'),
+            ('codepoint', 'convert', 42),
             ])
 
     def test_attr_funky_names(self):
@@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
             ])
 
+    def test_attr_value_ip6_url(self):
+        # http://www.python.org/sf/853506
+        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
+                           "<a href=http://[1080::8:800:200C:417A]/>"), [
+            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+            ])
+
     def test_illegal_declarations(self):
         s = 'abc<!spacer type="block" height="25">def'
         self.check_events(s, [