# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
("starttag", "a", [("href", "mailto:xyz@example.com")]),
])
+ def test_attr_nonascii(self):
+ # see issue 7311
+ self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+ ("starttag", "img", [("src", "/foo/bar.png"),
+ ("alt", "\u4e2d\u6587")]),
+ ])
+ self._run_check("<a title='\u30c6\u30b9\u30c8' "
+ "href='\u30c6\u30b9\u30c8.html'>", [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+ self._run_check('<a title="\u30c6\u30b9\u30c8" '
+ 'href="\u30c6\u30b9\u30c8.html">', [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+
def test_attr_entity_replacement(self):
self._run_check("""<a b='&><"''>""", [
("starttag", "a", [("b", "&><\"'")]),