123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378 |
"""An XML reader is the SAX 2 name for an XML parser.

XML parsers should be based on the classes defined in this module.
"""
- from . import handler
- from ._exceptions import SAXNotSupportedException, SAXNotRecognizedException
# ===== XMLREADER =====
class XMLReader:
    """Callback-driven interface for reading an XML document.

    A SAX2 driver for an XML parser must implement this interface.  It
    lets an application query and change parser features and
    properties, register the handlers that receive document events,
    and start a document parse.

    Every SAX interface is assumed to be synchronous: a parse method
    must not return before parsing is complete, and a reader must wait
    for each event-handler callback to return before it reports the
    next event.
    """

    def __init__(self):
        # Every handler slot starts out holding a default instance from
        # the handler module; the set*() methods below replace them.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document given a system identifier or an InputSource.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        """Return the ContentHandler that is currently registered."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register the object that will receive document content events."""
        self._cont_handler = handler

    def getDTDHandler(self):
        """Return the DTD handler that is currently registered."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register the object that will receive basic DTD-related events."""
        self._dtd_handler = handler

    def getEntityResolver(self):
        """Return the EntityResolver that is currently registered."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register the object that will resolve external entities."""
        self._ent_handler = resolver

    def getErrorHandler(self):
        """Return the ErrorHandler that is currently registered."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register the object that will receive error-message events."""
        self._err_handler = handler

    def setLocale(self, locale):
        """Let an application choose the locale for errors and warnings.

        SAX parsers do not have to localize errors and warnings, but a
        parser that cannot support the requested locale must raise a
        SAX exception.  Applications may ask for a locale change in the
        middle of a parse.

        This base implementation always raises
        SAXNotSupportedException.
        """
        message = "Locale support not implemented"
        raise SAXNotSupportedException(message)

    def getFeature(self, name):
        """Look up and return the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException.
        """
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setFeature(self, name, state):
        """Set the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException.
        """
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def getProperty(self, name):
        """Look up and return the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException.
        """
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setProperty(self, name, value):
        """Set the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException.
        """
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)
class IncrementalParser(XMLReader):
    """XMLReader extended with three methods for incremental parsing.

    Supporting this interface is optional, because not every
    underlying XML parser offers the functionality.

    A freshly instantiated parser can accept data through feed()
    right away.  Once parsing has been finished with a call to
    close(), reset() must be called before the parser can accept new
    data, whether through feed() or through parse().

    These methods must _not_ be called during parsing, that is, after
    parse() has been called and before it returns.

    As a convenience to SAX 2.0 driver writers, the class also
    implements the XMLReader parse() method by default, in terms of
    feed() and close().
    """

    def __init__(self, bufsize=2**16):
        # bufsize is the size argument passed to each read() in parse().
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Read *source* in chunks, feed() each chunk, then close().

        The source is normalised with saxutils.prepare_input_source();
        the character stream is used when present, otherwise the byte
        stream.
        """
        from . import saxutils

        input_source = saxutils.prepare_input_source(source)
        self.prepareParser(input_source)

        stream = input_source.getCharacterStream()
        if stream is None:
            stream = input_source.getByteStream()

        # A falsy result from read() marks the end of the input.
        while True:
            chunk = stream.read(self._bufsize)
            if not chunk:
                break
            self.feed(chunk)
        self.close()

    def feed(self, data):
        """Hand the raw XML in *data* to the parser and have it parsed.

        The corresponding events are emitted while parsing.  XML
        constructs may be split across several calls to feed.

        feed may raise SAXException.
        """
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """Hook called by parse() so the SAX 2.0 driver can prepare
        itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """Tell the parser that there are no more data.

        Called once the entire XML document has been passed in through
        feed().  This lets the parser do its final checks on the
        document and empty its internal data buffer.

        The parser will not be ready to parse another document until
        reset() has been called.

        close may raise SAXException.
        """
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """Make the parser ready to parse new documents.

        Called after close().  The results of calling parse() or
        feed() after close() without calling reset() are undefined.
        """
        raise NotImplementedError("This method must be implemented!")
# ===== LOCATOR =====
class Locator:
    """Interface that ties a SAX event to a location in the document.

    A locator only returns valid results during calls to
    DocumentHandler methods; at any other time the results are
    unpredictable.  The defaults defined here report "unknown":
    -1 for the numeric positions and None for the identifiers.
    """

    def getColumnNumber(self):
        """Return the column number where the current event ends."""
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event."""
        return None
# ===== INPUTSOURCE =====
class InputSource:
    """Bundle of everything an XMLReader needs in order to read an entity.

    An instance may carry the public identifier, the system
    identifier, a byte stream (possibly together with character
    encoding information) and/or a character stream of an entity.

    Applications create these objects to pass to XMLReader.parse and
    to return from EntityResolver.resolveEntity.

    An InputSource belongs to the application: the XMLReader is not
    allowed to modify InputSource objects passed to it by the
    application, although it may make copies and modify those.
    """

    def __init__(self, system_id=None):
        # Only the system identifier can be supplied up front; every
        # other field starts out unset and is filled in via its setter.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    def setPublicId(self, public_id):
        """Set the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the public identifier of this InputSource."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Set the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the system identifier of this InputSource."""
        return self.__system_id

    def setEncoding(self, encoding):
        """Set the character encoding of this InputSource.

        The encoding must be a string that is acceptable in an XML
        encoding declaration (see section 4.3.3 of the XML
        recommendation).

        The encoding is ignored when the InputSource also contains a
        character stream.
        """
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream for this input source.

        The byte stream is a Python file-like object that does not
        perform byte-to-character conversion.

        The SAX parser ignores it when a character stream is also
        specified, but uses it in preference to opening a URI
        connection itself.

        An application that knows the character encoding of the byte
        stream should set it with the setEncoding method.
        """
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        The getEncoding method returns the character encoding for this
        byte stream, or None if it is unknown.
        """
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source.

        The stream must be a file-like object that performs conversion
        to Unicode strings.

        When a character stream is specified, the SAX parser ignores
        any byte stream and does not attempt to open a URI connection
        to the system identifier.
        """
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
# ===== ATTRIBUTESIMPL =====
class AttributesImpl:
    """Non-namespace-aware attribute collection.

    Wraps a ``{name: value}`` mapping and exposes it both through the
    SAX attribute accessors (getValue, getNames, ...) and through a
    read-only subset of the mapping protocol (len, ``in``, indexing,
    iteration, keys/items/values, get, copy).  Without namespaces a
    name and its qualified name are the same string, so the QName
    variants simply mirror the plain ones.
    """

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type, which is always "CDATA" here."""
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute *name*.

        A failed lookup propagates from the underlying mapping
        (KeyError for a dict).
        """
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value of the attribute with qualified name *name*."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return *name* itself, or raise KeyError if it is not present."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return *name* itself, or raise KeyError if it is not present."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a list of the qualified attribute names."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def __iter__(self):
        # Without this, iteration falls back to the legacy sequence
        # protocol (__getitem__(0), __getitem__(1), ...) and fails with
        # KeyError(0).  Iterate the names, as a mapping does.
        return iter(self.keys())

    def keys(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def __contains__(self, name):
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of *name*, or *alternative* if it is absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new object of the same class with its own mapping.

        The underlying mapping is shallow-copied so that later changes
        to the mapping this object wraps do not show up in the copy.
        """
        return self.__class__(dict(self._attrs))

    def items(self):
        """Return a list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a list of the attribute values."""
        return list(self._attrs.values())
# ===== ATTRIBUTESNSIMPL =====
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware attribute collection.

    Attributes are keyed by ``(ns_uri, lname)`` tuples; a second
    mapping with the same keys holds the qualified name of each
    attribute.  Accessors not overridden here are inherited from
    AttributesImpl and operate on the ``(ns_uri, lname)`` keys.
    """

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        super().__init__(attrs)
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is *name*.

        Raises KeyError if no attribute has that qualified name.
        """
        # _qnames maps (ns_uri, lname) -> qname, so a reverse lookup
        # means scanning its items for the first matching qname.
        for nsname, qname in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name is *name*.

        Raises KeyError if no attribute has that qualified name.
        """
        for nsname, qname in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name for the (ns_uri, lname) key *name*."""
        return self._qnames[name]

    def getQNames(self):
        """Return a list of the qualified names."""
        return list(self._qnames.values())

    def copy(self):
        """Return a new object of the same class with its own mappings.

        Both underlying mappings are shallow-copied so that later
        changes to the mappings this object wraps do not show up in
        the copy.
        """
        return self.__class__(dict(self._attrs), dict(self._qnames))
def _test():
    """Smoke test: instantiate each argument-free class once, in order."""
    for cls in (XMLReader, IncrementalParser, Locator):
        cls()


if __name__ == "__main__":
    _test()
|