HTMLParser is now able to handle slashes in the start tag.

author Ezio Melotti <ezio.melotti@gmail.com>

Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)

committer Ezio Melotti <ezio.melotti@gmail.com>

Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)
author Ezio Melotti <ezio.melotti@gmail.com>
Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)
committer Ezio Melotti <ezio.melotti@gmail.com>
Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py

index 5081a62562c0758a964d8f0a42b41f6f657a99c5..d4e14d438769bd71469c63838c0098865234f509 100644 (file)
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -28,19 +28,19 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
  
  attrfind = re.compile(
-    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
-    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
  
  locatestarttagend = re.compile(r"""
    <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
-    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
+  (?:[\s/]*                          # optional whitespace before attribute name
+    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
        (?:\s*=+\s*                    # value indicator
          (?:'[^']*'                   # LITA-enclosed value
            |"[^"]*"                   # LIT-enclosed value
            |(?!['"])[^>\s]*           # bare value
           )
-       )?\s*
+       )?(?:\s|/(?!>))*
       )*
     )?
    \s*                                # trailing whitespace
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py

index 8136bca3e28ee49665fee244e60505a20bb7bc21..41f43408d83525f3bfd6f0a35f9f30980e7b0bdb 100644 (file)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -240,6 +240,27 @@ text
              self._run_check("<!DOCTYPE %s>" % dtd,
                              [('decl', 'DOCTYPE ' + dtd)])
  
+    def test_slashes_in_starttag(self):
+        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
+        html = ('<img width=902 height=250px '
+                'src="/sites/default/files/images/homepage/foo.jpg" '
+                '/*what am I doing here*/ />')
+        expected = [(
+            'startendtag', 'img',
+            [('width', '902'), ('height', '250px'),
+             ('src', '/sites/default/files/images/homepage/foo.jpg'),
+             ('*what', None), ('am', None), ('i', None),
+             ('doing', None), ('here*', None)]
+        )]
+        self._run_check(html, expected)
+        html = ('<a / /foo/ / /=/ / /bar/ / />'
+                '<a / /foo/ / /=/ / /bar/ / >')
+        expected = [
+            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
+            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
+        ]
+        self._run_check(html, expected)
+
      def test_declaration_junk_chars(self):
          self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
  
diff --git a/Misc/NEWS b/Misc/NEWS

index 1c049876abb91e7b68ccfe7873bd30ad609ce195..45f22b1447d4ba449d7935e1ed0d754499ce4d63 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -98,6 +98,8 @@ Core and Builtins
  Library
  -------
  
+- HTMLParser is now able to handle slashes in the start tag.
+
  - Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
    SimpleXMLRPCServer upon malformed POST request.
author	Ezio Melotti <ezio.melotti@gmail.com>
	Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)
committer	Ezio Melotti <ezio.melotti@gmail.com>
	Tue, 21 Feb 2012 07:22:16 +0000 (09:22 +0200)
Lib/HTMLParser.py		patch | blob | history
Lib/test/test_htmlparser.py		patch | blob | history
Misc/NEWS		patch | blob | history