pulldom.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. import xml.sax
  2. import xml.sax.handler
  3. START_ELEMENT = "START_ELEMENT"
  4. END_ELEMENT = "END_ELEMENT"
  5. COMMENT = "COMMENT"
  6. START_DOCUMENT = "START_DOCUMENT"
  7. END_DOCUMENT = "END_DOCUMENT"
  8. PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
  9. IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
  10. CHARACTERS = "CHARACTERS"
  11. class PullDOM(xml.sax.ContentHandler):
  12. _locator = None
  13. document = None
  14. def __init__(self, documentFactory=None):
  15. from xml.dom import XML_NAMESPACE
  16. self.documentFactory = documentFactory
  17. self.firstEvent = [None, None]
  18. self.lastEvent = self.firstEvent
  19. self.elementStack = []
  20. self.push = self.elementStack.append
  21. try:
  22. self.pop = self.elementStack.pop
  23. except AttributeError:
  24. # use class' pop instead
  25. pass
  26. self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
  27. self._current_context = self._ns_contexts[-1]
  28. self.pending_events = []
  29. def pop(self):
  30. result = self.elementStack[-1]
  31. del self.elementStack[-1]
  32. return result
  33. def setDocumentLocator(self, locator):
  34. self._locator = locator
  35. def startPrefixMapping(self, prefix, uri):
  36. if not hasattr(self, '_xmlns_attrs'):
  37. self._xmlns_attrs = []
  38. self._xmlns_attrs.append((prefix or 'xmlns', uri))
  39. self._ns_contexts.append(self._current_context.copy())
  40. self._current_context[uri] = prefix or None
  41. def endPrefixMapping(self, prefix):
  42. self._current_context = self._ns_contexts.pop()
  43. def startElementNS(self, name, tagName , attrs):
  44. # Retrieve xml namespace declaration attributes.
  45. xmlns_uri = 'http://www.w3.org/2000/xmlns/'
  46. xmlns_attrs = getattr(self, '_xmlns_attrs', None)
  47. if xmlns_attrs is not None:
  48. for aname, value in xmlns_attrs:
  49. attrs._attrs[(xmlns_uri, aname)] = value
  50. self._xmlns_attrs = []
  51. uri, localname = name
  52. if uri:
  53. # When using namespaces, the reader may or may not
  54. # provide us with the original name. If not, create
  55. # *a* valid tagName from the current context.
  56. if tagName is None:
  57. prefix = self._current_context[uri]
  58. if prefix:
  59. tagName = prefix + ":" + localname
  60. else:
  61. tagName = localname
  62. if self.document:
  63. node = self.document.createElementNS(uri, tagName)
  64. else:
  65. node = self.buildDocument(uri, tagName)
  66. else:
  67. # When the tagname is not prefixed, it just appears as
  68. # localname
  69. if self.document:
  70. node = self.document.createElement(localname)
  71. else:
  72. node = self.buildDocument(None, localname)
  73. for aname,value in attrs.items():
  74. a_uri, a_localname = aname
  75. if a_uri == xmlns_uri:
  76. if a_localname == 'xmlns':
  77. qname = a_localname
  78. else:
  79. qname = 'xmlns:' + a_localname
  80. attr = self.document.createAttributeNS(a_uri, qname)
  81. node.setAttributeNodeNS(attr)
  82. elif a_uri:
  83. prefix = self._current_context[a_uri]
  84. if prefix:
  85. qname = prefix + ":" + a_localname
  86. else:
  87. qname = a_localname
  88. attr = self.document.createAttributeNS(a_uri, qname)
  89. node.setAttributeNodeNS(attr)
  90. else:
  91. attr = self.document.createAttribute(a_localname)
  92. node.setAttributeNode(attr)
  93. attr.value = value
  94. self.lastEvent[1] = [(START_ELEMENT, node), None]
  95. self.lastEvent = self.lastEvent[1]
  96. self.push(node)
  97. def endElementNS(self, name, tagName):
  98. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  99. self.lastEvent = self.lastEvent[1]
  100. def startElement(self, name, attrs):
  101. if self.document:
  102. node = self.document.createElement(name)
  103. else:
  104. node = self.buildDocument(None, name)
  105. for aname,value in attrs.items():
  106. attr = self.document.createAttribute(aname)
  107. attr.value = value
  108. node.setAttributeNode(attr)
  109. self.lastEvent[1] = [(START_ELEMENT, node), None]
  110. self.lastEvent = self.lastEvent[1]
  111. self.push(node)
  112. def endElement(self, name):
  113. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  114. self.lastEvent = self.lastEvent[1]
  115. def comment(self, s):
  116. if self.document:
  117. node = self.document.createComment(s)
  118. self.lastEvent[1] = [(COMMENT, node), None]
  119. self.lastEvent = self.lastEvent[1]
  120. else:
  121. event = [(COMMENT, s), None]
  122. self.pending_events.append(event)
  123. def processingInstruction(self, target, data):
  124. if self.document:
  125. node = self.document.createProcessingInstruction(target, data)
  126. self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
  127. self.lastEvent = self.lastEvent[1]
  128. else:
  129. event = [(PROCESSING_INSTRUCTION, target, data), None]
  130. self.pending_events.append(event)
  131. def ignorableWhitespace(self, chars):
  132. node = self.document.createTextNode(chars)
  133. self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
  134. self.lastEvent = self.lastEvent[1]
  135. def characters(self, chars):
  136. node = self.document.createTextNode(chars)
  137. self.lastEvent[1] = [(CHARACTERS, node), None]
  138. self.lastEvent = self.lastEvent[1]
  139. def startDocument(self):
  140. if self.documentFactory is None:
  141. import xml.dom.minidom
  142. self.documentFactory = xml.dom.minidom.Document.implementation
  143. def buildDocument(self, uri, tagname):
  144. # Can't do that in startDocument, since we need the tagname
  145. # XXX: obtain DocumentType
  146. node = self.documentFactory.createDocument(uri, tagname, None)
  147. self.document = node
  148. self.lastEvent[1] = [(START_DOCUMENT, node), None]
  149. self.lastEvent = self.lastEvent[1]
  150. self.push(node)
  151. # Put everything we have seen so far into the document
  152. for e in self.pending_events:
  153. if e[0][0] == PROCESSING_INSTRUCTION:
  154. _,target,data = e[0]
  155. n = self.document.createProcessingInstruction(target, data)
  156. e[0] = (PROCESSING_INSTRUCTION, n)
  157. elif e[0][0] == COMMENT:
  158. n = self.document.createComment(e[0][1])
  159. e[0] = (COMMENT, n)
  160. else:
  161. raise AssertionError("Unknown pending event ",e[0][0])
  162. self.lastEvent[1] = e
  163. self.lastEvent = e
  164. self.pending_events = None
  165. return node.firstChild
  166. def endDocument(self):
  167. self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
  168. self.pop()
  169. def clear(self):
  170. "clear(): Explicitly release parsing structures"
  171. self.document = None
  172. class ErrorHandler:
  173. def warning(self, exception):
  174. print(exception)
  175. def error(self, exception):
  176. raise exception
  177. def fatalError(self, exception):
  178. raise exception
  179. class DOMEventStream:
  180. def __init__(self, stream, parser, bufsize):
  181. self.stream = stream
  182. self.parser = parser
  183. self.bufsize = bufsize
  184. if not hasattr(self.parser, 'feed'):
  185. self.getEvent = self._slurp
  186. self.reset()
  187. def reset(self):
  188. self.pulldom = PullDOM()
  189. # This content handler relies on namespace support
  190. self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  191. self.parser.setContentHandler(self.pulldom)
  192. def __getitem__(self, pos):
  193. import warnings
  194. warnings.warn(
  195. "DOMEventStream's __getitem__ method ignores 'pos' parameter. "
  196. "Use iterator protocol instead.",
  197. DeprecationWarning,
  198. stacklevel=2
  199. )
  200. rc = self.getEvent()
  201. if rc:
  202. return rc
  203. raise IndexError
  204. def __next__(self):
  205. rc = self.getEvent()
  206. if rc:
  207. return rc
  208. raise StopIteration
  209. def __iter__(self):
  210. return self
  211. def expandNode(self, node):
  212. event = self.getEvent()
  213. parents = [node]
  214. while event:
  215. token, cur_node = event
  216. if cur_node is node:
  217. return
  218. if token != END_ELEMENT:
  219. parents[-1].appendChild(cur_node)
  220. if token == START_ELEMENT:
  221. parents.append(cur_node)
  222. elif token == END_ELEMENT:
  223. del parents[-1]
  224. event = self.getEvent()
  225. def getEvent(self):
  226. # use IncrementalParser interface, so we get the desired
  227. # pull effect
  228. if not self.pulldom.firstEvent[1]:
  229. self.pulldom.lastEvent = self.pulldom.firstEvent
  230. while not self.pulldom.firstEvent[1]:
  231. buf = self.stream.read(self.bufsize)
  232. if not buf:
  233. self.parser.close()
  234. return None
  235. self.parser.feed(buf)
  236. rc = self.pulldom.firstEvent[1][0]
  237. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  238. return rc
  239. def _slurp(self):
  240. """ Fallback replacement for getEvent() using the
  241. standard SAX2 interface, which means we slurp the
  242. SAX events into memory (no performance gain, but
  243. we are compatible to all SAX parsers).
  244. """
  245. self.parser.parse(self.stream)
  246. self.getEvent = self._emit
  247. return self._emit()
  248. def _emit(self):
  249. """ Fallback replacement for getEvent() that emits
  250. the events that _slurp() read previously.
  251. """
  252. rc = self.pulldom.firstEvent[1][0]
  253. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  254. return rc
  255. def clear(self):
  256. """clear(): Explicitly release parsing objects"""
  257. self.pulldom.clear()
  258. del self.pulldom
  259. self.parser = None
  260. self.stream = None
  261. class SAX2DOM(PullDOM):
  262. def startElementNS(self, name, tagName , attrs):
  263. PullDOM.startElementNS(self, name, tagName, attrs)
  264. curNode = self.elementStack[-1]
  265. parentNode = self.elementStack[-2]
  266. parentNode.appendChild(curNode)
  267. def startElement(self, name, attrs):
  268. PullDOM.startElement(self, name, attrs)
  269. curNode = self.elementStack[-1]
  270. parentNode = self.elementStack[-2]
  271. parentNode.appendChild(curNode)
  272. def processingInstruction(self, target, data):
  273. PullDOM.processingInstruction(self, target, data)
  274. node = self.lastEvent[0][1]
  275. parentNode = self.elementStack[-1]
  276. parentNode.appendChild(node)
  277. def ignorableWhitespace(self, chars):
  278. PullDOM.ignorableWhitespace(self, chars)
  279. node = self.lastEvent[0][1]
  280. parentNode = self.elementStack[-1]
  281. parentNode.appendChild(node)
  282. def characters(self, chars):
  283. PullDOM.characters(self, chars)
  284. node = self.lastEvent[0][1]
  285. parentNode = self.elementStack[-1]
  286. parentNode.appendChild(node)
  287. default_bufsize = (2 ** 14) - 20
  288. def parse(stream_or_string, parser=None, bufsize=None):
  289. if bufsize is None:
  290. bufsize = default_bufsize
  291. if isinstance(stream_or_string, str):
  292. stream = open(stream_or_string, 'rb')
  293. else:
  294. stream = stream_or_string
  295. if not parser:
  296. parser = xml.sax.make_parser()
  297. return DOMEventStream(stream, parser, bufsize)
  298. def parseString(string, parser=None):
  299. from io import StringIO
  300. bufsize = len(string)
  301. buf = StringIO(string)
  302. if not parser:
  303. parser = xml.sax.make_parser()
  304. return DOMEventStream(buf, parser, bufsize)