123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478 |
- """Shim module exporting the same ElementTree API for lxml and
- xml.etree backends.
- When lxml is installed, it is automatically preferred over the built-in
- xml.etree module.
- On Python 2.7, the cElementTree module is preferred over the pure-python
- ElementTree module.
- Besides exporting a unified interface, this also defines extra functions
- or subclasses built-in ElementTree classes to add features that are
- only available in lxml, like OrderedDict for attributes, pretty_print and
- iterwalk.
- """
- from fontTools.misc.textTools import tostr
# Template for the XML declaration line; '%s' receives the encoding name.
XML_DECLARATION = """<?xml version='1.0' encoding='%s'?>"""

__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "XML",
    "XMLParser",
    "register_namespace",
]

# Backend selection: lxml first, then xml.etree.cElementTree, and finally
# xml.etree.ElementTree.
# NOTE(review): this copy of the file has lost its indentation, so it cannot
# be told from here how much of the code that follows originally lived inside
# the 'except ImportError' fallback -- confirm against the upstream layout.
try:
    from lxml.etree import *
except ImportError:
    try:
        from xml.etree.cElementTree import *

        # the cElementTree version of XML function doesn't support
        # the optional 'parser' keyword argument
        from xml.etree.ElementTree import XML
    except ImportError:  # pragma: no cover
        from xml.etree.ElementTree import *
    _have_lxml = False
else:
    _have_lxml = True

import sys

# dict is always ordered in python >= 3.6 and on pypy
try:
    import __pypy__ as _pypy
except ImportError:
    _pypy = None
_dict_is_ordered = bool(sys.version_info >= (3, 6) or _pypy)
del _pypy

# mapping type used to store element attributes in insertion order
if not _dict_is_ordered:
    from collections import OrderedDict as _Attrib
else:
    _Attrib = dict

# in py27, cElementTree.Element cannot be subclassed, so
# we need to import the pure-python class
if not isinstance(Element, type):
    from xml.etree.ElementTree import Element as _Element
else:
    _Element = Element
class Element(_Element):
    """Element subclass whose attributes keep their insertion order.

    The attributes live in a fresh ``_Attrib`` mapping that is filled first
    from *attrib* and then from the *extra* keyword arguments.
    """

    def __init__(self, tag, attrib=_Attrib(), **extra):
        # The base class only receives the tag; the attribute mapping is
        # built here so that it is always an ``_Attrib`` instance.
        super().__init__(tag)
        ordered = _Attrib()
        for source in (attrib, extra):
            if source:
                ordered.update(source)
        self.attrib = ordered
def SubElement(parent, tag, attrib=_Attrib(), **extra):
    """Create a new child of *parent*, append it and return it.

    This override exists because ``_elementtree.SubElement`` fails when
    *parent* is an instance of an Element subclass; the child is therefore
    built with the parent's own class.
    """
    factory = parent.__class__
    child = factory(tag, attrib, **extra)
    parent.append(child)
    return child
def _iterwalk(element, events, tag):
    """Yield ``(event, element)`` pairs for *element* and its descendants.

    An element is reported only when *tag* is None or equals its tag; its
    children are visited either way. "start" is emitted before the children
    and "end" after them, each only when it is listed in *events*.
    """
    include = tag is None or element.tag == tag
    if include and "start" in events:
        yield ("start", element)
    for child in element:
        yield from _iterwalk(child, events, tag)
    # honour the 'events' filter for "end" as well, so that asking for
    # ("start",) alone does not also produce end events
    if include and "end" in events:
        yield ("end", element)


def iterwalk(element_or_tree, events=("end",), tag=None):
    """A tree walker that generates events from an existing tree as
    if it was parsing XML data with iterparse().

    Drop-in replacement for lxml.etree.iterwalk.

    Args:
        element_or_tree: an element, or an object with a ``getroot()``
            method returning the element to start from.
        events: container of event names to report ("start" and/or "end").
        tag: only report elements with this tag; None or "*" reports all.

    Yields:
        ``(event, element)`` tuples in document order.
    """
    if iselement(element_or_tree):
        element = element_or_tree
    else:
        element = element_or_tree.getroot()
    if tag == "*":
        tag = None
    yield from _iterwalk(element, events, tag)
_ElementTree = ElementTree


class ElementTree(_ElementTree):
    """ElementTree subclass whose 'write' method also accepts the 'doctype'
    and 'pretty_print' arguments.

    Both extras are implemented only for the default XML serialization
    'method'; "html" and "text" output is handed over to the base class.
    """

    def write(
        self,
        file_or_filename,
        encoding=None,
        xml_declaration=False,
        method=None,
        doctype=None,
        pretty_print=False,
    ):
        """Serialize the tree to *file_or_filename*.

        Args:
            file_or_filename: a file name or an object with a ``write`` method.
            encoding: output encoding; "unicode" (any letter case) selects
                text output, None falls back to "ASCII".
            xml_declaration: whether to write an XML declaration. None means
                only for encodings other than ASCII/UTF-8.
            method: serialization method; anything other than "xml" (or a
                false value) is delegated to the base class.
            doctype: optional doctype string written before the root element.
            pretty_print: if true, indent the tree (in place) and put the
                declaration and doctype on their own lines.

        Raises:
            ValueError: if an XML declaration is requested together with
                "unicode" encoding.
        """
        uses_xml_method = not method or method == "xml"
        if not uses_xml_method:
            # delegate to super-class
            super().write(
                file_or_filename,
                encoding=encoding,
                xml_declaration=xml_declaration,
                method=method,
            )
            return

        to_unicode = encoding is not None and encoding.lower() == "unicode"
        if to_unicode:
            if xml_declaration:
                raise ValueError(
                    "Serialisation to unicode must not request an XML declaration"
                )
            encoding = "unicode"
            write_declaration = False
        elif xml_declaration is not None:
            write_declaration = xml_declaration
        else:
            # by default, write an XML declaration only for non-standard encodings
            write_declaration = encoding is not None and encoding.upper() not in (
                "ASCII",
                "UTF-8",
                "UTF8",
                "US-ASCII",
            )
        if encoding is None:
            encoding = "ASCII"

        root = self._root
        if pretty_print:
            # NOTE this will modify the tree in-place
            _indent(root)

        with _get_writer(file_or_filename, encoding) as write:

            def write_prolog_item(item):
                # each prolog item gets its own line only when pretty-printing
                write(item)
                if pretty_print:
                    write("\n")

            if write_declaration:
                write_prolog_item(XML_DECLARATION % encoding.upper())
            if doctype:
                write_prolog_item(_tounicode(doctype))

            qnames, namespaces = _namespaces(root)
            _serialize_xml(write, root, qnames, namespaces)
- import io
def tostring(
    element,
    encoding=None,
    xml_declaration=None,
    method=None,
    doctype=None,
    pretty_print=False,
):
    """Custom 'tostring' function that uses our ElementTree subclass, with
    pretty_print support.

    Args:
        element: root element to serialize.
        encoding: output encoding; "unicode" (in any letter case) returns a
            text string, anything else returns bytes.
        xml_declaration, method, doctype, pretty_print: passed through to
            ``ElementTree.write``.

    Returns:
        The serialized document, as str for "unicode" encoding, else bytes.
    """
    # ElementTree.write() recognises "unicode" case-insensitively, so the
    # stream type must be chosen by the same rule; an exact comparison would
    # hand text output to a BytesIO for e.g. encoding="Unicode".
    wants_text = encoding is not None and encoding.lower() == "unicode"
    stream = io.StringIO() if wants_text else io.BytesIO()
    ElementTree(element).write(
        stream,
        encoding=encoding,
        xml_declaration=xml_declaration,
        method=method,
        doctype=doctype,
        pretty_print=pretty_print,
    )
    return stream.getvalue()
- # serialization support
- import re
- # Valid XML strings can include any Unicode character, excluding control
- # characters, the surrogate blocks, FFFE, and FFFF:
- # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
- # Here we reversed the pattern to match only the invalid characters.
- # For the 'narrow' python builds supporting only UCS-2, which represent
- # characters beyond BMP as UTF-16 surrogate pairs, we need to pass through
- # the surrogate block. I haven't found a more elegant solution...
- UCS2 = sys.maxunicode < 0x10FFFF
- if UCS2:
- _invalid_xml_string = re.compile(
- "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
- )
- else:
- _invalid_xml_string = re.compile(
- "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
- )
def _tounicode(s):
    """Check that *s* is acceptable XML text and return it as a unicode string.

    Bytes strings are decoded with the strict ASCII codec. Input of either
    kind is rejected when it contains characters that are not allowed in XML.

    Raises:
        ValueError: for bytes input with non-ASCII content, or for text
            matched by ``_invalid_xml_string``.
        TypeError: via ``_raise_serialization_error`` when the conversion
            fails with an AttributeError.
    """
    try:
        decoded = tostr(s, encoding="ascii", errors="strict")
    except UnicodeDecodeError:
        raise ValueError(
            "Bytes strings can only contain ASCII characters. "
            "Use unicode strings for non-ASCII characters."
        )
    except AttributeError:
        _raise_serialization_error(s)
    # empty values are returned as they are, without running the pattern
    if not decoded:
        return decoded
    if _invalid_xml_string.search(decoded):
        raise ValueError(
            "All strings must be XML compatible: Unicode or ASCII, "
            "no NULL bytes or control characters"
        )
    return decoded
- import contextlib
- @contextlib.contextmanager
- def _get_writer(file_or_filename, encoding):
- # returns text write method and release all resources after using
- try:
- write = file_or_filename.write
- except AttributeError:
- # file_or_filename is a file name
- f = open(
- file_or_filename,
- "w",
- encoding="utf-8" if encoding == "unicode" else encoding,
- errors="xmlcharrefreplace",
- )
- with f:
- yield f.write
- else:
- # file_or_filename is a file-like object
- # encoding determines if it is a text or binary writer
- if encoding == "unicode":
- # use a text writer as is
- yield write
- else:
- # wrap a binary writer with TextIOWrapper
- detach_buffer = False
- if isinstance(file_or_filename, io.BufferedIOBase):
- buf = file_or_filename
- elif isinstance(file_or_filename, io.RawIOBase):
- buf = io.BufferedWriter(file_or_filename)
- detach_buffer = True
- else:
- # This is to handle passed objects that aren't in the
- # IOBase hierarchy, but just have a write method
- buf = io.BufferedIOBase()
- buf.writable = lambda: True
- buf.write = write
- try:
- # TextIOWrapper uses this methods to determine
- # if BOM (for UTF-16, etc) should be added
- buf.seekable = file_or_filename.seekable
- buf.tell = file_or_filename.tell
- except AttributeError:
- pass
- wrapper = io.TextIOWrapper(
- buf,
- encoding=encoding,
- errors="xmlcharrefreplace",
- newline="\n",
- )
- try:
- yield wrapper.write
- finally:
- # Keep the original file open when the TextIOWrapper and
- # the BufferedWriter are destroyed
- wrapper.detach()
- if detach_buffer:
- buf.detach()
- from xml.etree.ElementTree import _namespace_map
def _namespaces(elem):
    """Identify the qualified names and namespaces used in the tree at *elem*.

    Returns:
        A ``(qnames, namespaces)`` tuple: *qnames* maps each tag, attribute
        name or QName text to its serialized ``prefix:local`` form (None maps
        to None), and *namespaces* maps namespace URIs to prefixes.

    Raises:
        TypeError: via ``_raise_serialization_error`` for names that cannot
            be serialized.
    """
    qnames = {None: None}
    namespaces = {}

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            qname = _tounicode(qname)
            if qname[:1] != "{":
                # not in Clark notation: the name is used verbatim
                qnames[qname] = qname
                return
            uri, tag = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                else:
                    prefix = _tounicode(prefix)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, tag)
            else:
                qnames[qname] = tag  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def register(name):
        # add each name to the table only once
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            register(tag.text)
        elif isinstance(tag, str):
            register(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)

        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            register(key)
            if isinstance(value, QName):
                register(value.text)

        text = node.text
        if isinstance(text, QName):
            register(text.text)

    return qnames, namespaces
def _serialize_xml(write, elem, qnames, namespaces, **kwargs):
    """Write *elem*, its subtree and its tail as XML through *write*.

    Args:
        write: callable taking one text string.
        elem: element to serialize.
        qnames: mapping from names to their serialized form, as built by
            ``_namespaces``.
        namespaces: URI-to-prefix mapping to declare on this element;
            children are serialized with None here.
        **kwargs: accepted but not used.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _tounicode(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _tounicode(text))
    else:
        tag = qnames[_tounicode(tag) if tag is not None else None]
        if tag is None:
            # no tag of its own: only the text and the children are written
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None)
        else:
            write("<" + tag)
            if namespaces:
                # sort on prefix
                declarations = sorted(namespaces.items(), key=lambda x: x[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            attrs = elem.attrib
            if attrs:
                # try to keep existing attrib order
                keep_order = len(attrs) <= 1 or type(attrs) is _Attrib
                # if plain dict, use lexical order
                items = attrs.items() if keep_order else sorted(attrs.items())
                for name, value in items:
                    name = _tounicode(name.text if isinstance(name, QName) else name)
                    if isinstance(value, QName):
                        value = qnames[_tounicode(value.text)]
                    else:
                        value = _escape_attrib(value)
                    write(' %s="%s"' % (qnames[name], value))
            if text is None and not len(elem):
                write("/>")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None)
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
- def _raise_serialization_error(text):
- raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
def _escape_cdata(text):
    """Return *text* escaped for use as XML character data.

    The characters ``&``, ``<`` and ``>`` are replaced by the predefined
    entities ``&amp;``, ``&lt;`` and ``&gt;``.

    Raises:
        TypeError: via ``_raise_serialization_error`` when *text* cannot be
            processed as a string.
    """
    # escape character data
    try:
        text = _tounicode(text)
        # it's worth avoiding do-nothing calls for short strings
        # "&" must go first so that the ampersands of the entities inserted
        # afterwards are not escaped a second time
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    ``&``, ``<``, ``>`` and ``"`` are replaced by the predefined entities,
    and newlines by the character reference ``&#10;``.

    Raises:
        TypeError: via ``_raise_serialization_error`` when *text* cannot be
            processed as a string.
    """
    # escape attribute value
    try:
        text = _tounicode(text)
        # "&" must go first so that the ampersands of the entities inserted
        # afterwards are not escaped a second time
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if '"' in text:
            text = text.replace('"', "&quot;")
        # a literal newline in an attribute value is normalized to a space
        # by XML parsers, so it is written as a character reference instead
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
- def _indent(elem, level=0):
- # From http://effbot.org/zone/element-lib.htm#prettyprint
- i = "\n" + level * " "
- if len(elem):
- if not elem.text or not elem.text.strip():
- elem.text = i + " "
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
- for elem in elem:
- _indent(elem, level + 1)
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
- else:
- if level and (not elem.tail or not elem.tail.strip()):
- elem.tail = i
|