granicus.if.org Git - python/commitdiff
#7311: fix HTMLParser to accept non-ASCII attribute values.
author: Ezio Melotti <none@none>
Tue, 5 Apr 2011 17:40:52 +0000 (20:40 +0300)
committer: Ezio Melotti <none@none>
Tue, 5 Apr 2011 17:40:52 +0000 (20:40 +0300)
Lib/HTMLParser.py
Lib/test/test_htmlparser.py
Misc/NEWS

index 4fdc09aa7631ec2f153f2e969fbbc00fdf1f5a0e..e0189011d115e59cb50f26be0bccc849ca74be34 100644 (file)
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
index 717585ca5b9be5c58740cd4c2d31a9d4774c7967..0620d0bdc2a95cbd88f9044ae80c9aeff117935e 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -208,6 +208,23 @@ DOCTYPE html [
             ("starttag", "a", [("href", "mailto:xyz@example.com")]),
             ])
 
+    def test_attr_nonascii(self):
+        # see issue 7311
+        self._run_check(u"<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+            ("starttag", "img", [("src", "/foo/bar.png"),
+                                 ("alt", u"\u4e2d\u6587")]),
+            ])
+        self._run_check(u"<a title='\u30c6\u30b9\u30c8' "
+                        u"href='\u30c6\u30b9\u30c8.html'>", [
+            ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
+                               ("href", u"\u30c6\u30b9\u30c8.html")]),
+            ])
+        self._run_check(u'<a title="\u30c6\u30b9\u30c8" '
+                        u'href="\u30c6\u30b9\u30c8.html">', [
+            ("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
+                               ("href", u"\u30c6\u30b9\u30c8.html")]),
+            ])
+
     def test_attr_entity_replacement(self):
         self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
             ("starttag", "a", [("b", "&><\"'")]),
index 2c444a98b7fceb4d63ee33c0ef5953b694965186..fe4605fafac3cd9667240f2e9844cdc5724a7ccc 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #7311: fix HTMLParser to accept non-ASCII attribute values.
+
 - Issue #10963: Ensure that subprocess.communicate() never raises EPIPE.
 
 - Issue #11662: Make urllib and urllib2 ignore redirections if the