"""
+from xmlreader import InputSource
from handler import ContentHandler, ErrorHandler
from _exceptions import SAXException, SAXNotRecognizedException, \
SAXParseException, SAXNotSupportedException
-def parse(filename_or_stream, handler, errorHandler=ErrorHandler()):
+def parse(source, handler, errorHandler=ErrorHandler()):
+ # Convenience entry point: creates an ExpatParser, installs handler as its
+ # content handler and errorHandler as its error handler, then parses source.
+ # NOTE(review): the default ErrorHandler() is created once, when the def
+ # statement runs, and is shared by every call that omits errorHandler.
parser = ExpatParser()
parser.setContentHandler(handler)
parser.setErrorHandler(errorHandler)
- parser.parse(filename_or_stream)
+ parser.parse(source)
def parseString(string, handler, errorHandler=ErrorHandler()):
try:
parser = ExpatParser()
parser.setContentHandler(handler)
parser.setErrorHandler(errorHandler)
- parser.parse(StringIO(string))
+
+ inpsrc = InputSource()
+ inpsrc.setByteStream(StringIO(string))
+ parser.parse(inpsrc)
# this is the parser list used by the make_parser function if no
# alternatives are given as parameters to the function
from xml.sax._exceptions import *
from xml.parsers import expat
-from xml.sax import xmlreader
+from xml.sax import xmlreader, saxutils
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl
# XMLReader methods
- def parse(self, stream_or_string):
+ def parse(self, source):
"Parse an XML document from a URL."
+ # source is handed to saxutils.prepare_input_source, which accepts a
+ # system identifier string or an InputSource and returns an InputSource.
- if type(stream_or_string) is type(""):
- stream = open(stream_or_string)
- else:
- stream = stream_or_string
-
+ source = saxutils.prepare_input_source(source)
+
+ self._source = source
self.reset()
self._cont_handler.setDocumentLocator(self)
try:
- xmlreader.IncrementalParser.parse(self, stream)
+ xmlreader.IncrementalParser.parse(self, source)
except expat.error:
+ # Translate an expat failure into a SAXParseException whose message is
+ # expat's text for the parser's current error code.
error_code = self._parser.ErrorCode
raise SAXParseException(expat.ErrorString(error_code), None, self)
self._cont_handler.endDocument()
-    def prepareParser(self, filename=None):
-        self._source = filename
-
-        if self._source != None:
-            self._parser.SetBase(self._source)
+    def prepareParser(self, source):
+        """Pass the system identifier of source to the expat parser.
+
+        source must provide getSystemId().  When that returns something
+        other than None, the value is handed to self._parser.SetBase;
+        nothing is done when there is no system identifier."""
+        # Look the identifier up once and compare by identity: None is a
+        # singleton, and != would go through the value's own comparison.
+        sysid = source.getSystemId()
+        if sysid is not None:
+            self._parser.SetBase(sysid)
def getFeature(self, name):
if name == feature_namespaces:
convenience of application and driver writers.
"""
+import os, urlparse, urllib
import handler
import xmlreader
def setProperty(self, name, value):
+ # Forwards name and value unchanged to self._parent.setProperty.
self._parent.setProperty(name, value)
+
+# --- Utility functions
+
+def prepare_input_source(source, base = ""):
+    """Return a fully resolved InputSource object ready for reading.
+
+    source may be a system identifier given as a plain string, an open
+    file-like object (anything with a read method), or an InputSource.
+    base is an optional base location used to resolve a relative
+    system identifier.
+
+    If the resulting InputSource has no byte stream, its system
+    identifier is resolved against base, stored back with setSystemId,
+    and opened with urllib.urlopen; the opened stream is stored with
+    setByteStream.  An InputSource that already has a byte stream is
+    returned as it is."""
+
+    if isinstance(source, type("")):
+        source = xmlreader.InputSource(source)
+    elif hasattr(source, "read"):
+        # Callers were previously allowed to hand an open stream straight
+        # to parse(); wrap it in an InputSource so they keep working.
+        stream = source
+        source = xmlreader.InputSource()
+        source.setByteStream(stream)
+        if hasattr(stream, "name"):
+            source.setSystemId(stream.name)
+
+    if source.getByteStream() is None:
+        sysid = source.getSystemId()
+        if urlparse.urlparse(sysid)[0] == '':
+            # No URL scheme: treat sysid as a file name relative to the
+            # directory part of base.
+            basehead = os.path.split(os.path.normpath(base))[0]
+            source.setSystemId(os.path.join(basehead, sysid))
+        else:
+            source.setSystemId(urlparse.urljoin(base, sysid))
+
+        source.setByteStream(urllib.urlopen(source.getSystemId()))
+
+    return source
# ===== XMLREADER =====
class XMLReader:
+
def __init__(self):
self._cont_handler = handler.ContentHandler()
#self._dtd_handler = handler.DTDHandler()
"Sets the value of a SAX2 property."
raise SAXNotRecognizedException("Property '%s' not recognized" % name)
-
+import saxutils
+
class IncrementalParser(XMLReader):
"""This interface adds three extra methods to the XMLReader
interface that allow XML parsers to support incremental
self._bufsize = bufsize
XMLReader.__init__(self)
- def _parseOpenFile(self, source):
- buffer = source.read(self._bufsize)
+ def parse(self, source):
+ # Resolves source through saxutils.prepare_input_source, calls
+ # prepareParser with the result, then feeds the byte stream to the
+ # parser in reads of at most self._bufsize until read() returns "".
+ source = saxutils.prepare_input_source(source)
+
+ self.prepareParser(source)
+ file = source.getByteStream()
+ buffer = file.read(self._bufsize)
while buffer != "":
self.feed(buffer)
- buffer = source.read(self._bufsize)
- self.close()
+ buffer = file.read(self._bufsize)
+
+ # NOTE(review): the self.close() call that used to follow the read loop is
+ # removed by this change; confirm end of input is still signalled.
self.reset()
- def parse(self, source):
- if hasattr(source, "read"):
- self._parseOpenFile(source)
- else:
- #FIXME: how to recognize if it is a URL instead of filename?
- self.prepareParser(source)
- file = open(source)
- self._parseOpenFile(file)
- file.close()
-
def feed(self, data):
"""This method gives the raw XML data in the data parameter to
the parser and makes it parse the data, emitting the
"Return the system identifier for the current event."
return None
+# ===== INPUTSOURCE =====
+
+class InputSource:
+    """Holder for everything an XMLReader needs in order to read an
+    entity.
+
+    An instance can carry a public identifier, a system identifier, a
+    byte stream (optionally with a character encoding) and/or a
+    character stream.
+
+    Applications create these objects to pass to XMLReader.parse and
+    to return from EntityResolver.resolveEntity.
+
+    The object belongs to the application: an XMLReader must not
+    modify an InputSource it was given, although it may copy it and
+    modify the copy."""
+
+    def __init__(self, system_id=None):
+        self.__system_id = system_id
+        self.__public_id = None
+        self.__encoding = None
+        self.__bytefile = None
+        self.__charfile = None
+
+    # --- identifiers
+
+    def getPublicId(self):
+        """Return the public identifier of this InputSource."""
+        return self.__public_id
+
+    def setPublicId(self, public_id):
+        """Store public_id as the public identifier."""
+        self.__public_id = public_id
+
+    def getSystemId(self):
+        """Return the system identifier of this InputSource."""
+        return self.__system_id
+
+    def setSystemId(self, system_id):
+        """Store system_id as the system identifier."""
+        self.__system_id = system_id
+
+    # --- encoding
+
+    def getEncoding(self):
+        """Return the character encoding of this InputSource."""
+        return self.__encoding
+
+    def setEncoding(self, encoding):
+        """Store the character encoding of this InputSource.
+
+        encoding must be a string that would be acceptable in an XML
+        encoding declaration (section 4.3.3 of the XML
+        recommendation).
+
+        The encoding is ignored when the InputSource also holds a
+        character stream."""
+        self.__encoding = encoding
+
+    # --- streams
+
+    def getByteStream(self):
+        """Return the byte stream of this input source.
+
+        getEncoding gives the character encoding of this stream, or
+        None when it is not known."""
+        return self.__bytefile
+
+    def setByteStream(self, bytefile):
+        """Store the byte stream: a Python file-like object that does
+        no byte-to-character conversion.
+
+        A SAX parser ignores it when a character stream is also
+        present, but prefers it over opening a URI connection itself.
+
+        An application that knows the encoding of the stream should
+        record it with setEncoding."""
+        self.__bytefile = bytefile
+
+    def getCharacterStream(self):
+        """Return the character stream of this input source."""
+        return self.__charfile
+
+    def setCharacterStream(self, charfile):
+        """Store the character stream, which must be a Python 1.6
+        Unicode-wrapped file-like object that converts to Unicode
+        strings.
+
+        When a character stream is present the SAX parser ignores any
+        byte stream and does not try to open a URI connection to the
+        system identifier."""
+        self.__charfile = charfile
+
+
# ===== ATTRIBUTESIMPL =====
class AttributesImpl: