#7311: fix html.parser to accept non-ASCII attribute values.

author Ezio Melotti <none@none>

Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)

committer Ezio Melotti <none@none>

Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)
author Ezio Melotti <none@none>
Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)
committer Ezio Melotti <none@none>
Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

index 21ebbc3eaf078989ab9a1bfb6dd20746a333fe9a..a3586ebf516d1aaa047acfd6acf44f25abd3f308 100644 (file)
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  # make it correctly strict without breaking backward compatibility.
  attrfind = re.compile(
      r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
  attrfind_tolerant = re.compile(
      r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
      r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 5ecd016084dbf3a05f74615118258f9e2205043d..637ab01f126223db1274547ba79b1969ccfa2478 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -217,6 +217,23 @@ DOCTYPE html [
              ("starttag", "a", [("href", "mailto:xyz@example.com")]),
              ])
  
+    def test_attr_nonascii(self):
+        # see issue 7311
+        self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+            ("starttag", "img", [("src", "/foo/bar.png"),
+                                 ("alt", "\u4e2d\u6587")]),
+            ])
+        self._run_check("<a title='\u30c6\u30b9\u30c8' "
+                        "href='\u30c6\u30b9\u30c8.html'>", [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+        self._run_check('<a title="\u30c6\u30b9\u30c8" '
+                        'href="\u30c6\u30b9\u30c8.html">', [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+
      def test_attr_entity_replacement(self):
          self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
              ("starttag", "a", [("b", "&><\"'")]),
diff --git a/Misc/NEWS b/Misc/NEWS

index 2ae7eaf657ab9265ce6c169985a03b2e8e6e1d71..80d46ed117af8c6e91a6f1ea3ef1855688027ec7 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -49,6 +49,8 @@ Core and Builtins
  Library
  -------
  
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
  - Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
    subparts with an 8bit CTE into unicode instead of preserving the bytes.
author	Ezio Melotti <none@none>
	Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)
committer	Ezio Melotti <none@none>
	Thu, 7 Apr 2011 19:03:31 +0000 (22:03 +0300)
Lib/html/parser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history