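Fixed a problem where xmllib didn't handle a tag whose quoted attribute value contains a '>' character (the example tag was lost when this page was extracted).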

author Guido van Rossum <guido@python.org>

Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)

committer Guido van Rossum <guido@python.org>

Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)
author Guido van Rossum <guido@python.org>
Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)
committer Guido van Rossum <guido@python.org>
Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)
diff --git a/Lib/xmllib.py b/Lib/xmllib.py

index 355714fbbb8f2e7ab91b9b1b30a5382a1d7a063d..06dc373c8e65f9f4919f4f2eb49919de4668e076 100644 (file)
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -9,9 +9,10 @@ version = '0.1'
  
  # Regular expressions used for parsing
  
-_S = '[ \t\r\n]+'
-_opS = '[ \t\r\n]*'
-_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
+_S = '[ \t\r\n]+'                       # white space
+_opS = '[ \t\r\n]*'                     # optional white space
+_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
+_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
  illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
  interesting = re.compile('[]&<]')
  
@@ -22,17 +23,25 @@ charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
  space = re.compile(_S + '$')
  newline = re.compile('\n')
  
+attrfind = re.compile(
+    _S + '(?P<name>' + _Name + ')'
+    '(' + _opS + '=' + _opS +
+    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
  starttagopen = re.compile('<' + _Name)
-endtagopen = re.compile('</')
  starttagend = re.compile(_opS + '(?P<slash>/?)>')
+starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
+                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
+                      starttagend.pattern)
+endtagopen = re.compile('</')
  endbracket = re.compile(_opS + '>')
+endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
  tagfind = re.compile(_Name)
  cdataopen = re.compile(r'<!\[CDATA\[')
  cdataclose = re.compile(r'\]\]>')
  # this matches one of the following:
  # SYSTEM SystemLiteral
  # PUBLIC PubidLiteral SystemLiteral
-_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
+_SystemLiteral = '(?P<%s>'+_QStr+')'
  _PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                          "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
  _ExternalId = '(?:SYSTEM|' \
@@ -41,7 +50,7 @@ _ExternalId = '(?:SYSTEM|' \
  doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                       '(?:'+_S+_ExternalId+')?'+_opS)
  xmldecl = re.compile('<\?xml'+_S+
-                     'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
+                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                       '(?:'+_S+'encoding'+_opS+'='+_opS+
                          "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                          '"[A-Za-z][-A-Za-z0-9._]*"))?'
@@ -53,10 +62,6 @@ procclose = re.compile(_opS + r'\?>')
  commentopen = re.compile('<!--')
  commentclose = re.compile('-->')
  doubledash = re.compile('--')
-attrfind = re.compile(
-    _S + '(?P<name>' + _Name + ')'
-    '(' + _opS + '=' + _opS +
-    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
  attrtrans = string.maketrans(' \r\n\t', '    ')
  
  
@@ -173,7 +178,7 @@ class XMLParser:
                      self.syntax_error('illegal data at start of file')
                  self.__at_start = 0
                  data = rawdata[i:j]
-                if not self.stack and not space.match(data):
+                if not self.stack and space.match(data) is None:
                      self.syntax_error('data not in content')
                  if illegal.search(data):
                      self.syntax_error('illegal character in content')
@@ -327,7 +332,7 @@ class XMLParser:
          if rawdata[i:i+4] <> '<!--':
              raise RuntimeError, 'unexpected call to handle_comment'
          res = commentclose.search(rawdata, i+4)
-        if not res:
+        if res is None:
              return -1
          if doubledash.search(rawdata, i+4, res.start(0)):
              self.syntax_error("`--' inside comment")
@@ -365,7 +370,7 @@ class XMLParser:
                      pass
                  elif level <= 0 and c == ']':
                      res = endbracket.match(rawdata, k+1)
-                    if not res:
+                    if res is None:
                          return -1
                      self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                      return res.end(0)
@@ -376,10 +381,10 @@ class XMLParser:
                      if level < 0:
                          self.syntax_error("bogus `>' in DOCTYPE")
                  k = k+1
-        res = endbracket.search(rawdata, k)
-        if not res:
+        res = endbracketfind.match(rawdata, k)
+        if res is None:
              return -1
-        if res.start(0) != k:
+        if endbracket.match(rawdata, k) is None:
              self.syntax_error('garbage in DOCTYPE')
          self.handle_doctype(name, pubid, syslit, None)
          return res.end(0)
@@ -390,7 +395,7 @@ class XMLParser:
          if rawdata[i:i+9] <> '<![CDATA[':
              raise RuntimeError, 'unexpected call to parse_cdata'
          res = cdataclose.search(rawdata, i+9)
-        if not res:
+        if res is None:
              return -1
          if illegal.search(rawdata, i+9, res.start(0)):
              self.syntax_error('illegal character in CDATA')
@@ -404,13 +409,13 @@ class XMLParser:
      def parse_proc(self, i):
          rawdata = self.rawdata
          end = procclose.search(rawdata, i)
-        if not end:
+        if end is None:
              return -1
          j = end.start(0)
          if illegal.search(rawdata, i+2, j):
              self.syntax_error('illegal character in processing instruction')
          res = tagfind.match(rawdata, i+2)
-        if not res:
+        if res is None:
              raise RuntimeError, 'unexpected call to parse_proc'
          k = res.end(0)
          name = res.group(0)
@@ -420,9 +425,9 @@ class XMLParser:
          return end.end(0)
  
      # Internal -- parse attributes between i and j
-    def parse_attributes(self, tag, k, j, attributes = None):
+    def parse_attributes(self, tag, i, j, attributes = None):
          rawdata = self.rawdata
-        # Now parse the data between k and j into a tag and attrs
+        # Now parse the data between i and j into a tag and attrs
          attrdict = {}
          try:
              # convert attributes list to dictionary
@@ -432,79 +437,79 @@ class XMLParser:
              attributes = d
          except TypeError:
              pass
-        while k < j:
-            res = attrfind.match(rawdata, k)
-            if not res: break
+        while i < j:
+            res = attrfind.match(rawdata, i)
+            if res is None:
+                break
              attrname, attrvalue = res.group('name', 'value')
              if attrvalue is None:
-                self.syntax_error('no attribute value specified')
+                self.syntax_error("no value specified for attribute `%s'" % attrname)
                  attrvalue = attrname
              elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                   attrvalue[:1] == '"' == attrvalue[-1:]:
                  attrvalue = attrvalue[1:-1]
              else:
-                self.syntax_error('attribute value not quoted')
+                self.syntax_error("attribute `%s' value not quoted" % attrname)
+            if '<' in attrvalue:
+                self.syntax_error("`<' illegal in attribute value")
              if attributes is not None and not attributes.has_key(attrname):
-                self.syntax_error('unknown attribute %s of element %s' %
+                self.syntax_error("unknown attribute `%s' of element `%s'" %
                                    (attrname, tag))
              if attrdict.has_key(attrname):
-                self.syntax_error('attribute specified twice')
+                self.syntax_error("attribute `%s' specified twice" % attrname)
              attrvalue = string.translate(attrvalue, attrtrans)
              attrdict[attrname] = self.translate_references(attrvalue)
-            k = res.end(0)
+            i = res.end(0)
          if attributes is not None:
              # fill in with default attributes
              for key, val in attributes.items():
                  if val is not None and not attrdict.has_key(key):
                      attrdict[key] = val
-        return attrdict, k
+        return attrdict, i
  
      # Internal -- handle starttag, return length or -1 if not terminated
      def parse_starttag(self, i):
          rawdata = self.rawdata
          # i points to start of tag
-        end = endbracket.search(rawdata, i+1)
-        if not end:
+        end = endbracketfind.match(rawdata, i+1)
+        if end is None:
              return -1
-        j = end.start(0)
-        res = tagfind.match(rawdata, i+1)
-        if not res:
-            raise RuntimeError, 'unexpected call to parse_starttag'
-        k = res.end(0)
-        tag = res.group(0)
-        if not self.__seen_starttag and self.__seen_doctype:
-            if tag != self.__seen_doctype:
-                self.syntax_error('starttag does not match DOCTYPE')
+        tag = starttagmatch.match(rawdata, i)
+        if tag is None or tag.end(0) != end.end(0):
+            self.syntax_error('garbage in starttag')
+            return end.end(0)
+        tagname = tag.group('tagname')
+        if not self.__seen_starttag and self.__seen_doctype and \
+           tagname != self.__seen_doctype:
+            self.syntax_error('starttag does not match DOCTYPE')
          if self.__seen_starttag and not self.stack:
              self.syntax_error('multiple elements on top level')
-        if hasattr(self, tag + '_attributes'):
-            attributes = getattr(self, tag + '_attributes')
+        if hasattr(self, tagname + '_attributes'):
+            attributes = getattr(self, tagname + '_attributes')
          else:
              attributes = None
-        attrdict, k = self.parse_attributes(tag, k, j, attributes)
-        res = starttagend.match(rawdata, k)
-        if not res:
-            self.syntax_error('garbage in start tag')
-        self.finish_starttag(tag, attrdict)
-        if res and res.group('slash') == '/':
-            self.finish_endtag(tag)
-        return end.end(0)
+        k, j = tag.span('attrs')
+        attrdict, k = self.parse_attributes(tagname, k, j, attributes)
+        self.finish_starttag(tagname, attrdict)
+        if tag.group('slash') == '/':
+            self.finish_endtag(tagname)
+        return tag.end(0)
  
      # Internal -- parse endtag
      def parse_endtag(self, i):
          rawdata = self.rawdata
-        end = endbracket.search(rawdata, i+1)
-        if not end:
+        end = endbracketfind.match(rawdata, i+1)
+        if end is None:
              return -1
          res = tagfind.match(rawdata, i+2)
-        if not res:
+        if res is None:
              self.syntax_error('no name specified in end tag')
              tag = ''
              k = i+2
          else:
              tag = res.group(0)
              k = res.end(0)
-        if k != end.start(0):
+        if endbracket.match(rawdata, k) is None:
              self.syntax_error('garbage in end tag')
          self.finish_endtag(tag)
          return end.end(0)
author	Guido van Rossum <guido@python.org>
	Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)
committer	Guido van Rossum <guido@python.org>
	Mon, 19 Oct 1998 13:28:26 +0000 (13:28 +0000)