granicus.if.org Git - python/commitdiff
patch #1462498: handle entityrefs in attribute values.
author Georg Brandl <georg@python.org>
Sat, 1 Apr 2006 08:35:18 +0000 (08:35 +0000)
committer Georg Brandl <georg@python.org>
Sat, 1 Apr 2006 08:35:18 +0000 (08:35 +0000)
Doc/lib/libsgmllib.tex
Lib/sgmllib.py
Lib/test/test_sgmllib.py
Misc/NEWS

index 27bf0b0ff3d4869888187e721e7f92bbe397565c..592c1916318ce528ea4fc505a6dcc49f552a91aa 100644 (file)
@@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which
 should be used to support semantic interpretation of the start tag.
 The \var{attributes} argument is a list of \code{(\var{name},
 \var{value})} pairs containing the attributes found inside the tag's
-\code{<>} brackets.  The \var{name} has been translated to lower case
-and double quotes and backslashes in the \var{value} have been interpreted.
+\code{<>} brackets.  The \var{name} has been translated to lower case.
+Double quotes and backslashes in the \var{value} have been interpreted,
+as well as known entity and character references.
 For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
 method would be called as \samp{unknown_starttag('a', [('href',
 'http://www.cwi.nl/')])}.  The base implementation simply calls
 \var{method} with \var{attributes} as the only argument.
+\versionadded[Handling of entity and character references within
+              attribute values]{2.5}
 \end{methoddesc}
 
 \begin{methoddesc}{handle_endtag}{tag, method}
index 08e365bdef99e20e60c480095823ed8e1c72832e..784dbe1b8a402e4b20927079ce62d074b19b1d4e 100644 (file)
@@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase):
             attrname, rest, attrvalue = match.group(1, 2, 3)
             if not rest:
                 attrvalue = attrname
-            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
-                 attrvalue[:1] == '"' == attrvalue[-1:]:
-                attrvalue = attrvalue[1:-1]
+            else:
+                if (attrvalue[:1] == "'" == attrvalue[-1:] or 
+                    attrvalue[:1] == '"' == attrvalue[-1:]):
+                    # strip quotes
+                    attrvalue = attrvalue[1:-1]
+                l = 0
+                new_attrvalue = ''
+                while l < len(attrvalue):
+                    av_match = entityref.match(attrvalue, l)
+                    if (av_match and av_match.group(1) in self.entitydefs and
+                        attrvalue[av_match.end(1)] == ';'):
+                        # only substitute entityrefs ending in ';' since
+                        # otherwise we may break <a href='?p=x&q=y'>
+                        # which is very common
+                        new_attrvalue += self.entitydefs[av_match.group(1)]
+                        l = av_match.end(0)
+                        continue
+                    ch_match = charref.match(attrvalue, l)
+                    if ch_match:
+                        try:
+                            char = chr(int(ch_match.group(1)))
+                            new_attrvalue += char
+                            l = ch_match.end(0)
+                            continue
+                        except ValueError:
+                            # invalid character reference, don't substitute
+                            pass
+                    # all other cases
+                    new_attrvalue += attrvalue[l]
+                    l += 1
+                attrvalue = new_attrvalue
             attrs.append((attrname.lower(), attrvalue))
             k = match.end(0)
         if rawdata[j] == '>':
index bc25bd0195cf9bc2e1166377cecbc0f901a38006..8e8b02f8770a761ebf813f0a2aff44773dc8fcab 100644 (file)
@@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             ("starttag", "e", [("a", "rgb(1,2,3)")]),
             ])
 
+    def test_attr_values_entities(self):
+        """Substitution of entities and charrefs in attribute values"""
+        # SF bug #1452246
+        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
+                                f="&xxx;" g='&#32;&#33;' h='&#500;' i='x?a=b&c=d;'>""",
+            [("starttag", "a", [("b", "<"),
+                                ("c", "<>"),
+                                ("d", "&lt->"),
+                                ("e", "< "),
+                                ("f", "&xxx;"),
+                                ("g", " !"),
+                                ("h", "&#500;"),
+                                ("i", "x?a=b&c=d;"), ])])
+
     def test_attr_funky_names(self):
         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
index 0ecea2e4d28f65477de48a1c4cf8a2b033e098d7..260d38f18b22c6b074323bbdce3b9f46ab192ff3 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -489,6 +489,9 @@ Extension Modules
 Library
 -------
 
+- Patch #1462498: sgmllib now handles entity and character references
+  in attribute values.
+
 - Added the sqlite3 package. This is based on pysqlite2.1.3, and provides
   a DB-API interface in the standard library. You'll need sqlite 3.2.2 or
   later to build this - if you have an earlier version, the C extension