pulldom.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. import xml.sax
  2. import xml.sax.handler
# Event-type tokens. PullDOM queues events as (token, node) pairs and
# DOMEventStream hands those pairs to the caller, so these strings are
# what client code compares the first item of each pair against.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
  11. class PullDOM(xml.sax.ContentHandler):
  12. _locator = None
  13. document = None
  14. def __init__(self, documentFactory=None):
  15. from xml.dom import XML_NAMESPACE
  16. self.documentFactory = documentFactory
  17. self.firstEvent = [None, None]
  18. self.lastEvent = self.firstEvent
  19. self.elementStack = []
  20. self.push = self.elementStack.append
  21. try:
  22. self.pop = self.elementStack.pop
  23. except AttributeError:
  24. # use class' pop instead
  25. pass
  26. self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
  27. self._current_context = self._ns_contexts[-1]
  28. self.pending_events = []
  29. def pop(self):
  30. result = self.elementStack[-1]
  31. del self.elementStack[-1]
  32. return result
  33. def setDocumentLocator(self, locator):
  34. self._locator = locator
  35. def startPrefixMapping(self, prefix, uri):
  36. if not hasattr(self, '_xmlns_attrs'):
  37. self._xmlns_attrs = []
  38. self._xmlns_attrs.append((prefix or 'xmlns', uri))
  39. self._ns_contexts.append(self._current_context.copy())
  40. self._current_context[uri] = prefix or None
  41. def endPrefixMapping(self, prefix):
  42. self._current_context = self._ns_contexts.pop()
  43. def startElementNS(self, name, tagName , attrs):
  44. # Retrieve xml namespace declaration attributes.
  45. xmlns_uri = 'http://www.w3.org/2000/xmlns/'
  46. xmlns_attrs = getattr(self, '_xmlns_attrs', None)
  47. if xmlns_attrs is not None:
  48. for aname, value in xmlns_attrs:
  49. attrs._attrs[(xmlns_uri, aname)] = value
  50. self._xmlns_attrs = []
  51. uri, localname = name
  52. if uri:
  53. # When using namespaces, the reader may or may not
  54. # provide us with the original name. If not, create
  55. # *a* valid tagName from the current context.
  56. if tagName is None:
  57. prefix = self._current_context[uri]
  58. if prefix:
  59. tagName = prefix + ":" + localname
  60. else:
  61. tagName = localname
  62. if self.document:
  63. node = self.document.createElementNS(uri, tagName)
  64. else:
  65. node = self.buildDocument(uri, tagName)
  66. else:
  67. # When the tagname is not prefixed, it just appears as
  68. # localname
  69. if self.document:
  70. node = self.document.createElement(localname)
  71. else:
  72. node = self.buildDocument(None, localname)
  73. for aname,value in attrs.items():
  74. a_uri, a_localname = aname
  75. if a_uri == xmlns_uri:
  76. if a_localname == 'xmlns':
  77. qname = a_localname
  78. else:
  79. qname = 'xmlns:' + a_localname
  80. attr = self.document.createAttributeNS(a_uri, qname)
  81. node.setAttributeNodeNS(attr)
  82. elif a_uri:
  83. prefix = self._current_context[a_uri]
  84. if prefix:
  85. qname = prefix + ":" + a_localname
  86. else:
  87. qname = a_localname
  88. attr = self.document.createAttributeNS(a_uri, qname)
  89. node.setAttributeNodeNS(attr)
  90. else:
  91. attr = self.document.createAttribute(a_localname)
  92. node.setAttributeNode(attr)
  93. attr.value = value
  94. self.lastEvent[1] = [(START_ELEMENT, node), None]
  95. self.lastEvent = self.lastEvent[1]
  96. self.push(node)
  97. def endElementNS(self, name, tagName):
  98. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  99. self.lastEvent = self.lastEvent[1]
  100. def startElement(self, name, attrs):
  101. if self.document:
  102. node = self.document.createElement(name)
  103. else:
  104. node = self.buildDocument(None, name)
  105. for aname,value in attrs.items():
  106. attr = self.document.createAttribute(aname)
  107. attr.value = value
  108. node.setAttributeNode(attr)
  109. self.lastEvent[1] = [(START_ELEMENT, node), None]
  110. self.lastEvent = self.lastEvent[1]
  111. self.push(node)
  112. def endElement(self, name):
  113. self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
  114. self.lastEvent = self.lastEvent[1]
  115. def comment(self, s):
  116. if self.document:
  117. node = self.document.createComment(s)
  118. self.lastEvent[1] = [(COMMENT, node), None]
  119. self.lastEvent = self.lastEvent[1]
  120. else:
  121. event = [(COMMENT, s), None]
  122. self.pending_events.append(event)
  123. def processingInstruction(self, target, data):
  124. if self.document:
  125. node = self.document.createProcessingInstruction(target, data)
  126. self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
  127. self.lastEvent = self.lastEvent[1]
  128. else:
  129. event = [(PROCESSING_INSTRUCTION, target, data), None]
  130. self.pending_events.append(event)
  131. def ignorableWhitespace(self, chars):
  132. node = self.document.createTextNode(chars)
  133. self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
  134. self.lastEvent = self.lastEvent[1]
  135. def characters(self, chars):
  136. node = self.document.createTextNode(chars)
  137. self.lastEvent[1] = [(CHARACTERS, node), None]
  138. self.lastEvent = self.lastEvent[1]
  139. def startDocument(self):
  140. if self.documentFactory is None:
  141. import xml.dom.minidom
  142. self.documentFactory = xml.dom.minidom.Document.implementation
  143. def buildDocument(self, uri, tagname):
  144. # Can't do that in startDocument, since we need the tagname
  145. # XXX: obtain DocumentType
  146. node = self.documentFactory.createDocument(uri, tagname, None)
  147. self.document = node
  148. self.lastEvent[1] = [(START_DOCUMENT, node), None]
  149. self.lastEvent = self.lastEvent[1]
  150. self.push(node)
  151. # Put everything we have seen so far into the document
  152. for e in self.pending_events:
  153. if e[0][0] == PROCESSING_INSTRUCTION:
  154. _,target,data = e[0]
  155. n = self.document.createProcessingInstruction(target, data)
  156. e[0] = (PROCESSING_INSTRUCTION, n)
  157. elif e[0][0] == COMMENT:
  158. n = self.document.createComment(e[0][1])
  159. e[0] = (COMMENT, n)
  160. else:
  161. raise AssertionError("Unknown pending event ",e[0][0])
  162. self.lastEvent[1] = e
  163. self.lastEvent = e
  164. self.pending_events = None
  165. return node.firstChild
  166. def endDocument(self):
  167. self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
  168. self.pop()
  169. def clear(self):
  170. "clear(): Explicitly release parsing structures"
  171. self.document = None
  172. class ErrorHandler:
  173. def warning(self, exception):
  174. print(exception)
  175. def error(self, exception):
  176. raise exception
  177. def fatalError(self, exception):
  178. raise exception
  179. class DOMEventStream:
  180. def __init__(self, stream, parser, bufsize):
  181. self.stream = stream
  182. self.parser = parser
  183. self.bufsize = bufsize
  184. if not hasattr(self.parser, 'feed'):
  185. self.getEvent = self._slurp
  186. self.reset()
  187. def reset(self):
  188. self.pulldom = PullDOM()
  189. # This content handler relies on namespace support
  190. self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  191. self.parser.setContentHandler(self.pulldom)
  192. def __next__(self):
  193. rc = self.getEvent()
  194. if rc:
  195. return rc
  196. raise StopIteration
  197. def __iter__(self):
  198. return self
  199. def expandNode(self, node):
  200. event = self.getEvent()
  201. parents = [node]
  202. while event:
  203. token, cur_node = event
  204. if cur_node is node:
  205. return
  206. if token != END_ELEMENT:
  207. parents[-1].appendChild(cur_node)
  208. if token == START_ELEMENT:
  209. parents.append(cur_node)
  210. elif token == END_ELEMENT:
  211. del parents[-1]
  212. event = self.getEvent()
  213. def getEvent(self):
  214. # use IncrementalParser interface, so we get the desired
  215. # pull effect
  216. if not self.pulldom.firstEvent[1]:
  217. self.pulldom.lastEvent = self.pulldom.firstEvent
  218. while not self.pulldom.firstEvent[1]:
  219. buf = self.stream.read(self.bufsize)
  220. if not buf:
  221. self.parser.close()
  222. return None
  223. self.parser.feed(buf)
  224. rc = self.pulldom.firstEvent[1][0]
  225. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  226. return rc
  227. def _slurp(self):
  228. """ Fallback replacement for getEvent() using the
  229. standard SAX2 interface, which means we slurp the
  230. SAX events into memory (no performance gain, but
  231. we are compatible to all SAX parsers).
  232. """
  233. self.parser.parse(self.stream)
  234. self.getEvent = self._emit
  235. return self._emit()
  236. def _emit(self):
  237. """ Fallback replacement for getEvent() that emits
  238. the events that _slurp() read previously.
  239. """
  240. rc = self.pulldom.firstEvent[1][0]
  241. self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
  242. return rc
  243. def clear(self):
  244. """clear(): Explicitly release parsing objects"""
  245. self.pulldom.clear()
  246. del self.pulldom
  247. self.parser = None
  248. self.stream = None
  249. class SAX2DOM(PullDOM):
  250. def startElementNS(self, name, tagName , attrs):
  251. PullDOM.startElementNS(self, name, tagName, attrs)
  252. curNode = self.elementStack[-1]
  253. parentNode = self.elementStack[-2]
  254. parentNode.appendChild(curNode)
  255. def startElement(self, name, attrs):
  256. PullDOM.startElement(self, name, attrs)
  257. curNode = self.elementStack[-1]
  258. parentNode = self.elementStack[-2]
  259. parentNode.appendChild(curNode)
  260. def processingInstruction(self, target, data):
  261. PullDOM.processingInstruction(self, target, data)
  262. node = self.lastEvent[0][1]
  263. parentNode = self.elementStack[-1]
  264. parentNode.appendChild(node)
  265. def ignorableWhitespace(self, chars):
  266. PullDOM.ignorableWhitespace(self, chars)
  267. node = self.lastEvent[0][1]
  268. parentNode = self.elementStack[-1]
  269. parentNode.appendChild(node)
  270. def characters(self, chars):
  271. PullDOM.characters(self, chars)
  272. node = self.lastEvent[0][1]
  273. parentNode = self.elementStack[-1]
  274. parentNode.appendChild(node)
# Chunk size parse() hands to DOMEventStream when the caller gives no bufsize.
default_bufsize = (2 ** 14) - 20
  276. def parse(stream_or_string, parser=None, bufsize=None):
  277. if bufsize is None:
  278. bufsize = default_bufsize
  279. if isinstance(stream_or_string, str):
  280. stream = open(stream_or_string, 'rb')
  281. else:
  282. stream = stream_or_string
  283. if not parser:
  284. parser = xml.sax.make_parser()
  285. return DOMEventStream(stream, parser, bufsize)
  286. def parseString(string, parser=None):
  287. from io import StringIO
  288. bufsize = len(string)
  289. buf = StringIO(string)
  290. if not parser:
  291. parser = xml.sax.make_parser()
  292. return DOMEventStream(buf, parser, bufsize)