expatbuilder.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962
  1. """Facility to use the Expat parser to load a minidom instance
  2. from a string or file.
  3. This avoids all the overhead of SAX and pulldom to gain performance.
  4. """
  5. # Warning!
  6. #
  7. # This module is tightly bound to the implementation details of the
  8. # minidom DOM and can't be used with other DOM implementations. This
  9. # is due, in part, to a lack of appropriate methods in the DOM (there is
  10. # no way to create Entity and Notation nodes via the DOM Level 2
  11. # interface), and for performance. The latter is the cause of some fairly
  12. # cryptic code.
  13. #
  14. # Performance hacks:
  15. #
  16. # - .character_data_handler() has an extra case in which continuing
  17. # data is appended to an existing Text node; this can be a
  18. # speedup since pyexpat can break up character data into multiple
  19. # callbacks even though we set the buffer_text attribute on the
  20. # parser. This also gives us the advantage that we don't need a
  21. # separate normalization pass.
  22. #
  23. # - Determining that a node exists is done using an identity comparison
  24. # with None rather than a truth test; this avoids searching for and
  25. # calling any methods on the node object if it exists. (A rather
  26. # nice speedup is achieved this way as well!)
  27. from xml.dom import xmlbuilder, minidom, Node
  28. from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
  29. from xml.parsers import expat
  30. from xml.dom.minidom import _append_child, _set_attribute_node
  31. from xml.dom.NodeFilter import NodeFilter
  32. TEXT_NODE = Node.TEXT_NODE
  33. CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
  34. DOCUMENT_NODE = Node.DOCUMENT_NODE
  35. FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
  36. FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
  37. FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
  38. FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
  39. theDOMImplementation = minidom.getDOMImplementation()
  40. # Expat typename -> TypeInfo
  41. _typeinfo_map = {
  42. "CDATA": minidom.TypeInfo(None, "cdata"),
  43. "ENUM": minidom.TypeInfo(None, "enumeration"),
  44. "ENTITY": minidom.TypeInfo(None, "entity"),
  45. "ENTITIES": minidom.TypeInfo(None, "entities"),
  46. "ID": minidom.TypeInfo(None, "id"),
  47. "IDREF": minidom.TypeInfo(None, "idref"),
  48. "IDREFS": minidom.TypeInfo(None, "idrefs"),
  49. "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
  50. "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
  51. }
  52. class ElementInfo(object):
  53. __slots__ = '_attr_info', '_model', 'tagName'
  54. def __init__(self, tagName, model=None):
  55. self.tagName = tagName
  56. self._attr_info = []
  57. self._model = model
  58. def __getstate__(self):
  59. return self._attr_info, self._model, self.tagName
  60. def __setstate__(self, state):
  61. self._attr_info, self._model, self.tagName = state
  62. def getAttributeType(self, aname):
  63. for info in self._attr_info:
  64. if info[1] == aname:
  65. t = info[-2]
  66. if t[0] == "(":
  67. return _typeinfo_map["ENUM"]
  68. else:
  69. return _typeinfo_map[info[-2]]
  70. return minidom._no_type
  71. def getAttributeTypeNS(self, namespaceURI, localName):
  72. return minidom._no_type
  73. def isElementContent(self):
  74. if self._model:
  75. type = self._model[0]
  76. return type not in (expat.model.XML_CTYPE_ANY,
  77. expat.model.XML_CTYPE_MIXED)
  78. else:
  79. return False
  80. def isEmpty(self):
  81. if self._model:
  82. return self._model[0] == expat.model.XML_CTYPE_EMPTY
  83. else:
  84. return False
  85. def isId(self, aname):
  86. for info in self._attr_info:
  87. if info[1] == aname:
  88. return info[-2] == "ID"
  89. return False
  90. def isIdNS(self, euri, ename, auri, aname):
  91. # not sure this is meaningful
  92. return self.isId((auri, aname))
  93. def _intern(builder, s):
  94. return builder._intern_setdefault(s, s)
  95. def _parse_ns_name(builder, name):
  96. assert ' ' in name
  97. parts = name.split(' ')
  98. intern = builder._intern_setdefault
  99. if len(parts) == 3:
  100. uri, localname, prefix = parts
  101. prefix = intern(prefix, prefix)
  102. qname = "%s:%s" % (prefix, localname)
  103. qname = intern(qname, qname)
  104. localname = intern(localname, localname)
  105. elif len(parts) == 2:
  106. uri, localname = parts
  107. prefix = EMPTY_PREFIX
  108. qname = localname = intern(localname, localname)
  109. else:
  110. raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
  111. return intern(uri, uri), localname, prefix, qname
  112. class ExpatBuilder:
  113. """Document builder that uses Expat to build a ParsedXML.DOM document
  114. instance."""
  115. def __init__(self, options=None):
  116. if options is None:
  117. options = xmlbuilder.Options()
  118. self._options = options
  119. if self._options.filter is not None:
  120. self._filter = FilterVisibilityController(self._options.filter)
  121. else:
  122. self._filter = None
  123. # This *really* doesn't do anything in this case, so
  124. # override it with something fast & minimal.
  125. self._finish_start_element = id
  126. self._parser = None
  127. self.reset()
  128. def createParser(self):
  129. """Create a new parser object."""
  130. return expat.ParserCreate()
  131. def getParser(self):
  132. """Return the parser object, creating a new one if needed."""
  133. if not self._parser:
  134. self._parser = self.createParser()
  135. self._intern_setdefault = self._parser.intern.setdefault
  136. self._parser.buffer_text = True
  137. self._parser.ordered_attributes = True
  138. self._parser.specified_attributes = True
  139. self.install(self._parser)
  140. return self._parser
  141. def reset(self):
  142. """Free all data structures used during DOM construction."""
  143. self.document = theDOMImplementation.createDocument(
  144. EMPTY_NAMESPACE, None, None)
  145. self.curNode = self.document
  146. self._elem_info = self.document._elem_info
  147. self._cdata = False
  148. def install(self, parser):
  149. """Install the callbacks needed to build the DOM into the parser."""
  150. # This creates circular references!
  151. parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
  152. parser.StartElementHandler = self.first_element_handler
  153. parser.EndElementHandler = self.end_element_handler
  154. parser.ProcessingInstructionHandler = self.pi_handler
  155. if self._options.entities:
  156. parser.EntityDeclHandler = self.entity_decl_handler
  157. parser.NotationDeclHandler = self.notation_decl_handler
  158. if self._options.comments:
  159. parser.CommentHandler = self.comment_handler
  160. if self._options.cdata_sections:
  161. parser.StartCdataSectionHandler = self.start_cdata_section_handler
  162. parser.EndCdataSectionHandler = self.end_cdata_section_handler
  163. parser.CharacterDataHandler = self.character_data_handler_cdata
  164. else:
  165. parser.CharacterDataHandler = self.character_data_handler
  166. parser.ExternalEntityRefHandler = self.external_entity_ref_handler
  167. parser.XmlDeclHandler = self.xml_decl_handler
  168. parser.ElementDeclHandler = self.element_decl_handler
  169. parser.AttlistDeclHandler = self.attlist_decl_handler
  170. def parseFile(self, file):
  171. """Parse a document from a file object, returning the document
  172. node."""
  173. parser = self.getParser()
  174. first_buffer = True
  175. try:
  176. while buffer := file.read(16*1024):
  177. parser.Parse(buffer, False)
  178. if first_buffer and self.document.documentElement:
  179. self._setup_subset(buffer)
  180. first_buffer = False
  181. parser.Parse(b"", True)
  182. except ParseEscape:
  183. pass
  184. doc = self.document
  185. self.reset()
  186. self._parser = None
  187. return doc
  188. def parseString(self, string):
  189. """Parse a document from a string, returning the document node."""
  190. parser = self.getParser()
  191. try:
  192. parser.Parse(string, True)
  193. self._setup_subset(string)
  194. except ParseEscape:
  195. pass
  196. doc = self.document
  197. self.reset()
  198. self._parser = None
  199. return doc
  200. def _setup_subset(self, buffer):
  201. """Load the internal subset if there might be one."""
  202. if self.document.doctype:
  203. extractor = InternalSubsetExtractor()
  204. extractor.parseString(buffer)
  205. subset = extractor.getSubset()
  206. self.document.doctype.internalSubset = subset
  207. def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
  208. has_internal_subset):
  209. doctype = self.document.implementation.createDocumentType(
  210. doctypeName, publicId, systemId)
  211. doctype.ownerDocument = self.document
  212. _append_child(self.document, doctype)
  213. self.document.doctype = doctype
  214. if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
  215. self.document.doctype = None
  216. del self.document.childNodes[-1]
  217. doctype = None
  218. self._parser.EntityDeclHandler = None
  219. self._parser.NotationDeclHandler = None
  220. if has_internal_subset:
  221. if doctype is not None:
  222. doctype.entities._seq = []
  223. doctype.notations._seq = []
  224. self._parser.CommentHandler = None
  225. self._parser.ProcessingInstructionHandler = None
  226. self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
  227. def end_doctype_decl_handler(self):
  228. if self._options.comments:
  229. self._parser.CommentHandler = self.comment_handler
  230. self._parser.ProcessingInstructionHandler = self.pi_handler
  231. if not (self._elem_info or self._filter):
  232. self._finish_end_element = id
  233. def pi_handler(self, target, data):
  234. node = self.document.createProcessingInstruction(target, data)
  235. _append_child(self.curNode, node)
  236. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  237. self.curNode.removeChild(node)
  238. def character_data_handler_cdata(self, data):
  239. childNodes = self.curNode.childNodes
  240. if self._cdata:
  241. if ( self._cdata_continue
  242. and childNodes[-1].nodeType == CDATA_SECTION_NODE):
  243. childNodes[-1].appendData(data)
  244. return
  245. node = self.document.createCDATASection(data)
  246. self._cdata_continue = True
  247. elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
  248. node = childNodes[-1]
  249. value = node.data + data
  250. node.data = value
  251. return
  252. else:
  253. node = minidom.Text()
  254. node.data = data
  255. node.ownerDocument = self.document
  256. _append_child(self.curNode, node)
  257. def character_data_handler(self, data):
  258. childNodes = self.curNode.childNodes
  259. if childNodes and childNodes[-1].nodeType == TEXT_NODE:
  260. node = childNodes[-1]
  261. node.data = node.data + data
  262. return
  263. node = minidom.Text()
  264. node.data = node.data + data
  265. node.ownerDocument = self.document
  266. _append_child(self.curNode, node)
  267. def entity_decl_handler(self, entityName, is_parameter_entity, value,
  268. base, systemId, publicId, notationName):
  269. if is_parameter_entity:
  270. # we don't care about parameter entities for the DOM
  271. return
  272. if not self._options.entities:
  273. return
  274. node = self.document._create_entity(entityName, publicId,
  275. systemId, notationName)
  276. if value is not None:
  277. # internal entity
  278. # node *should* be readonly, but we'll cheat
  279. child = self.document.createTextNode(value)
  280. node.childNodes.append(child)
  281. self.document.doctype.entities._seq.append(node)
  282. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  283. del self.document.doctype.entities._seq[-1]
  284. def notation_decl_handler(self, notationName, base, systemId, publicId):
  285. node = self.document._create_notation(notationName, publicId, systemId)
  286. self.document.doctype.notations._seq.append(node)
  287. if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
  288. del self.document.doctype.notations._seq[-1]
  289. def comment_handler(self, data):
  290. node = self.document.createComment(data)
  291. _append_child(self.curNode, node)
  292. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  293. self.curNode.removeChild(node)
  294. def start_cdata_section_handler(self):
  295. self._cdata = True
  296. self._cdata_continue = False
  297. def end_cdata_section_handler(self):
  298. self._cdata = False
  299. self._cdata_continue = False
  300. def external_entity_ref_handler(self, context, base, systemId, publicId):
  301. return 1
  302. def first_element_handler(self, name, attributes):
  303. if self._filter is None and not self._elem_info:
  304. self._finish_end_element = id
  305. self.getParser().StartElementHandler = self.start_element_handler
  306. self.start_element_handler(name, attributes)
  307. def start_element_handler(self, name, attributes):
  308. node = self.document.createElement(name)
  309. _append_child(self.curNode, node)
  310. self.curNode = node
  311. if attributes:
  312. for i in range(0, len(attributes), 2):
  313. a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
  314. None, EMPTY_PREFIX)
  315. value = attributes[i+1]
  316. a.value = value
  317. a.ownerDocument = self.document
  318. _set_attribute_node(node, a)
  319. if node is not self.document.documentElement:
  320. self._finish_start_element(node)
  321. def _finish_start_element(self, node):
  322. if self._filter:
  323. # To be general, we'd have to call isSameNode(), but this
  324. # is sufficient for minidom:
  325. if node is self.document.documentElement:
  326. return
  327. filt = self._filter.startContainer(node)
  328. if filt == FILTER_REJECT:
  329. # ignore this node & all descendents
  330. Rejecter(self)
  331. elif filt == FILTER_SKIP:
  332. # ignore this node, but make it's children become
  333. # children of the parent node
  334. Skipper(self)
  335. else:
  336. return
  337. self.curNode = node.parentNode
  338. node.parentNode.removeChild(node)
  339. node.unlink()
  340. # If this ever changes, Namespaces.end_element_handler() needs to
  341. # be changed to match.
  342. #
  343. def end_element_handler(self, name):
  344. curNode = self.curNode
  345. self.curNode = curNode.parentNode
  346. self._finish_end_element(curNode)
  347. def _finish_end_element(self, curNode):
  348. info = self._elem_info.get(curNode.tagName)
  349. if info:
  350. self._handle_white_text_nodes(curNode, info)
  351. if self._filter:
  352. if curNode is self.document.documentElement:
  353. return
  354. if self._filter.acceptNode(curNode) == FILTER_REJECT:
  355. self.curNode.removeChild(curNode)
  356. curNode.unlink()
  357. def _handle_white_text_nodes(self, node, info):
  358. if (self._options.whitespace_in_element_content
  359. or not info.isElementContent()):
  360. return
  361. # We have element type information and should remove ignorable
  362. # whitespace; identify for text nodes which contain only
  363. # whitespace.
  364. L = []
  365. for child in node.childNodes:
  366. if child.nodeType == TEXT_NODE and not child.data.strip():
  367. L.append(child)
  368. # Remove ignorable whitespace from the tree.
  369. for child in L:
  370. node.removeChild(child)
  371. def element_decl_handler(self, name, model):
  372. info = self._elem_info.get(name)
  373. if info is None:
  374. self._elem_info[name] = ElementInfo(name, model)
  375. else:
  376. assert info._model is None
  377. info._model = model
  378. def attlist_decl_handler(self, elem, name, type, default, required):
  379. info = self._elem_info.get(elem)
  380. if info is None:
  381. info = ElementInfo(elem)
  382. self._elem_info[elem] = info
  383. info._attr_info.append(
  384. [None, name, None, None, default, 0, type, required])
  385. def xml_decl_handler(self, version, encoding, standalone):
  386. self.document.version = version
  387. self.document.encoding = encoding
  388. # This is still a little ugly, thanks to the pyexpat API. ;-(
  389. if standalone >= 0:
  390. if standalone:
  391. self.document.standalone = True
  392. else:
  393. self.document.standalone = False
  394. # Don't include FILTER_INTERRUPT, since that's checked separately
  395. # where allowed.
  396. _ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
  397. class FilterVisibilityController(object):
  398. """Wrapper around a DOMBuilderFilter which implements the checks
  399. to make the whatToShow filter attribute work."""
  400. __slots__ = 'filter',
  401. def __init__(self, filter):
  402. self.filter = filter
  403. def startContainer(self, node):
  404. mask = self._nodetype_mask[node.nodeType]
  405. if self.filter.whatToShow & mask:
  406. val = self.filter.startContainer(node)
  407. if val == FILTER_INTERRUPT:
  408. raise ParseEscape
  409. if val not in _ALLOWED_FILTER_RETURNS:
  410. raise ValueError(
  411. "startContainer() returned illegal value: " + repr(val))
  412. return val
  413. else:
  414. return FILTER_ACCEPT
  415. def acceptNode(self, node):
  416. mask = self._nodetype_mask[node.nodeType]
  417. if self.filter.whatToShow & mask:
  418. val = self.filter.acceptNode(node)
  419. if val == FILTER_INTERRUPT:
  420. raise ParseEscape
  421. if val == FILTER_SKIP:
  422. # move all child nodes to the parent, and remove this node
  423. parent = node.parentNode
  424. for child in node.childNodes[:]:
  425. parent.appendChild(child)
  426. # node is handled by the caller
  427. return FILTER_REJECT
  428. if val not in _ALLOWED_FILTER_RETURNS:
  429. raise ValueError(
  430. "acceptNode() returned illegal value: " + repr(val))
  431. return val
  432. else:
  433. return FILTER_ACCEPT
  434. _nodetype_mask = {
  435. Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
  436. Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
  437. Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
  438. Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
  439. Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
  440. Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
  441. Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
  442. Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
  443. Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
  444. Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
  445. Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
  446. Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
  447. }
  448. class FilterCrutch(object):
  449. __slots__ = '_builder', '_level', '_old_start', '_old_end'
  450. def __init__(self, builder):
  451. self._level = 0
  452. self._builder = builder
  453. parser = builder._parser
  454. self._old_start = parser.StartElementHandler
  455. self._old_end = parser.EndElementHandler
  456. parser.StartElementHandler = self.start_element_handler
  457. parser.EndElementHandler = self.end_element_handler
  458. class Rejecter(FilterCrutch):
  459. __slots__ = ()
  460. def __init__(self, builder):
  461. FilterCrutch.__init__(self, builder)
  462. parser = builder._parser
  463. for name in ("ProcessingInstructionHandler",
  464. "CommentHandler",
  465. "CharacterDataHandler",
  466. "StartCdataSectionHandler",
  467. "EndCdataSectionHandler",
  468. "ExternalEntityRefHandler",
  469. ):
  470. setattr(parser, name, None)
  471. def start_element_handler(self, *args):
  472. self._level = self._level + 1
  473. def end_element_handler(self, *args):
  474. if self._level == 0:
  475. # restore the old handlers
  476. parser = self._builder._parser
  477. self._builder.install(parser)
  478. parser.StartElementHandler = self._old_start
  479. parser.EndElementHandler = self._old_end
  480. else:
  481. self._level = self._level - 1
  482. class Skipper(FilterCrutch):
  483. __slots__ = ()
  484. def start_element_handler(self, *args):
  485. node = self._builder.curNode
  486. self._old_start(*args)
  487. if self._builder.curNode is not node:
  488. self._level = self._level + 1
  489. def end_element_handler(self, *args):
  490. if self._level == 0:
  491. # We're popping back out of the node we're skipping, so we
  492. # shouldn't need to do anything but reset the handlers.
  493. self._builder._parser.StartElementHandler = self._old_start
  494. self._builder._parser.EndElementHandler = self._old_end
  495. self._builder = None
  496. else:
  497. self._level = self._level - 1
  498. self._old_end(*args)
  499. # framework document used by the fragment builder.
  500. # Takes a string for the doctype, subset string, and namespace attrs string.
  501. _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
  502. "http://xml.python.org/entities/fragment-builder/internal"
  503. _FRAGMENT_BUILDER_TEMPLATE = (
  504. '''\
  505. <!DOCTYPE wrapper
  506. %%s [
  507. <!ENTITY fragment-builder-internal
  508. SYSTEM "%s">
  509. %%s
  510. ]>
  511. <wrapper %%s
  512. >&fragment-builder-internal;</wrapper>'''
  513. % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
  514. class FragmentBuilder(ExpatBuilder):
  515. """Builder which constructs document fragments given XML source
  516. text and a context node.
  517. The context node is expected to provide information about the
  518. namespace declarations which are in scope at the start of the
  519. fragment.
  520. """
  521. def __init__(self, context, options=None):
  522. if context.nodeType == DOCUMENT_NODE:
  523. self.originalDocument = context
  524. self.context = context
  525. else:
  526. self.originalDocument = context.ownerDocument
  527. self.context = context
  528. ExpatBuilder.__init__(self, options)
  529. def reset(self):
  530. ExpatBuilder.reset(self)
  531. self.fragment = None
  532. def parseFile(self, file):
  533. """Parse a document fragment from a file object, returning the
  534. fragment node."""
  535. return self.parseString(file.read())
  536. def parseString(self, string):
  537. """Parse a document fragment from a string, returning the
  538. fragment node."""
  539. self._source = string
  540. parser = self.getParser()
  541. doctype = self.originalDocument.doctype
  542. ident = ""
  543. if doctype:
  544. subset = doctype.internalSubset or self._getDeclarations()
  545. if doctype.publicId:
  546. ident = ('PUBLIC "%s" "%s"'
  547. % (doctype.publicId, doctype.systemId))
  548. elif doctype.systemId:
  549. ident = 'SYSTEM "%s"' % doctype.systemId
  550. else:
  551. subset = ""
  552. nsattrs = self._getNSattrs() # get ns decls from node's ancestors
  553. document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
  554. try:
  555. parser.Parse(document, True)
  556. except:
  557. self.reset()
  558. raise
  559. fragment = self.fragment
  560. self.reset()
  561. ## self._parser = None
  562. return fragment
  563. def _getDeclarations(self):
  564. """Re-create the internal subset from the DocumentType node.
  565. This is only needed if we don't already have the
  566. internalSubset as a string.
  567. """
  568. doctype = self.context.ownerDocument.doctype
  569. s = ""
  570. if doctype:
  571. for i in range(doctype.notations.length):
  572. notation = doctype.notations.item(i)
  573. if s:
  574. s = s + "\n "
  575. s = "%s<!NOTATION %s" % (s, notation.nodeName)
  576. if notation.publicId:
  577. s = '%s PUBLIC "%s"\n "%s">' \
  578. % (s, notation.publicId, notation.systemId)
  579. else:
  580. s = '%s SYSTEM "%s">' % (s, notation.systemId)
  581. for i in range(doctype.entities.length):
  582. entity = doctype.entities.item(i)
  583. if s:
  584. s = s + "\n "
  585. s = "%s<!ENTITY %s" % (s, entity.nodeName)
  586. if entity.publicId:
  587. s = '%s PUBLIC "%s"\n "%s"' \
  588. % (s, entity.publicId, entity.systemId)
  589. elif entity.systemId:
  590. s = '%s SYSTEM "%s"' % (s, entity.systemId)
  591. else:
  592. s = '%s "%s"' % (s, entity.firstChild.data)
  593. if entity.notationName:
  594. s = "%s NOTATION %s" % (s, entity.notationName)
  595. s = s + ">"
  596. return s
  597. def _getNSattrs(self):
  598. return ""
  599. def external_entity_ref_handler(self, context, base, systemId, publicId):
  600. if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
  601. # this entref is the one that we made to put the subtree
  602. # in; all of our given input is parsed in here.
  603. old_document = self.document
  604. old_cur_node = self.curNode
  605. parser = self._parser.ExternalEntityParserCreate(context)
  606. # put the real document back, parse into the fragment to return
  607. self.document = self.originalDocument
  608. self.fragment = self.document.createDocumentFragment()
  609. self.curNode = self.fragment
  610. try:
  611. parser.Parse(self._source, True)
  612. finally:
  613. self.curNode = old_cur_node
  614. self.document = old_document
  615. self._source = None
  616. return -1
  617. else:
  618. return ExpatBuilder.external_entity_ref_handler(
  619. self, context, base, systemId, publicId)
  620. class Namespaces:
  621. """Mix-in class for builders; adds support for namespaces."""
  622. def _initNamespaces(self):
  623. # list of (prefix, uri) ns declarations. Namespace attrs are
  624. # constructed from this and added to the element's attrs.
  625. self._ns_ordered_prefixes = []
  626. def createParser(self):
  627. """Create a new namespace-handling parser."""
  628. parser = expat.ParserCreate(namespace_separator=" ")
  629. parser.namespace_prefixes = True
  630. return parser
  631. def install(self, parser):
  632. """Insert the namespace-handlers onto the parser."""
  633. ExpatBuilder.install(self, parser)
  634. if self._options.namespace_declarations:
  635. parser.StartNamespaceDeclHandler = (
  636. self.start_namespace_decl_handler)
  637. def start_namespace_decl_handler(self, prefix, uri):
  638. """Push this namespace declaration on our storage."""
  639. self._ns_ordered_prefixes.append((prefix, uri))
  640. def start_element_handler(self, name, attributes):
  641. if ' ' in name:
  642. uri, localname, prefix, qname = _parse_ns_name(self, name)
  643. else:
  644. uri = EMPTY_NAMESPACE
  645. qname = name
  646. localname = None
  647. prefix = EMPTY_PREFIX
  648. node = minidom.Element(qname, uri, prefix, localname)
  649. node.ownerDocument = self.document
  650. _append_child(self.curNode, node)
  651. self.curNode = node
  652. if self._ns_ordered_prefixes:
  653. for prefix, uri in self._ns_ordered_prefixes:
  654. if prefix:
  655. a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
  656. XMLNS_NAMESPACE, prefix, "xmlns")
  657. else:
  658. a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
  659. "xmlns", EMPTY_PREFIX)
  660. a.value = uri
  661. a.ownerDocument = self.document
  662. _set_attribute_node(node, a)
  663. del self._ns_ordered_prefixes[:]
  664. if attributes:
  665. node._ensure_attributes()
  666. _attrs = node._attrs
  667. _attrsNS = node._attrsNS
  668. for i in range(0, len(attributes), 2):
  669. aname = attributes[i]
  670. value = attributes[i+1]
  671. if ' ' in aname:
  672. uri, localname, prefix, qname = _parse_ns_name(self, aname)
  673. a = minidom.Attr(qname, uri, localname, prefix)
  674. _attrs[qname] = a
  675. _attrsNS[(uri, localname)] = a
  676. else:
  677. a = minidom.Attr(aname, EMPTY_NAMESPACE,
  678. aname, EMPTY_PREFIX)
  679. _attrs[aname] = a
  680. _attrsNS[(EMPTY_NAMESPACE, aname)] = a
  681. a.ownerDocument = self.document
  682. a.value = value
  683. a.ownerElement = node
  684. if __debug__:
  685. # This only adds some asserts to the original
  686. # end_element_handler(), so we only define this when -O is not
  687. # used. If changing one, be sure to check the other to see if
  688. # it needs to be changed as well.
  689. #
  690. def end_element_handler(self, name):
  691. curNode = self.curNode
  692. if ' ' in name:
  693. uri, localname, prefix, qname = _parse_ns_name(self, name)
  694. assert (curNode.namespaceURI == uri
  695. and curNode.localName == localname
  696. and curNode.prefix == prefix), \
  697. "element stack messed up! (namespace)"
  698. else:
  699. assert curNode.nodeName == name, \
  700. "element stack messed up - bad nodeName"
  701. assert curNode.namespaceURI == EMPTY_NAMESPACE, \
  702. "element stack messed up - bad namespaceURI"
  703. self.curNode = curNode.parentNode
  704. self._finish_end_element(curNode)
  705. class ExpatBuilderNS(Namespaces, ExpatBuilder):
  706. """Document builder that supports namespaces."""
  707. def reset(self):
  708. ExpatBuilder.reset(self)
  709. self._initNamespaces()
  class FragmentBuilderNS(Namespaces, FragmentBuilder):
      """Fragment builder that supports namespaces."""

      def reset(self):
          """Reset the fragment builder, then set up namespace handling."""
          FragmentBuilder.reset(self)
          self._initNamespaces()

      def _getNSattrs(self):
          """Return string of namespace attributes from this element and
          ancestors.

          Walks from self.context up through the parentNode links and
          collects the entries of every '_ns_prefix_uri' mapping found on
          the way.  A prefix is emitted only the first time it is seen, so
          the declaration nearest to the context wins.  Each declaration
          is rendered as " name='uri'" and the declarations are separated
          by newlines; the result is "" when nothing was found.

          The URI is escaped so that '&', '<', '>' and "'" cannot break
          the single-quoted attribute value.
          """
          # XXX This needs to be re-written to walk the ancestors of the
          # context to build up the namespace information from
          # declarations, elements, and attributes found in context.
          # Otherwise we have to store a bunch more data on the DOM
          # (though that *might* be more reliable -- not clear).

          # Imported locally so the module's import block is untouched.
          from xml.sax.saxutils import escape

          seen = set()    # prefixes already declared by a nearer node
          decls = []      # rendered declarations, nearest node first
          context = self.context
          while context:
              if hasattr(context, '_ns_prefix_uri'):
                  for prefix, uri in context._ns_prefix_uri.items():
                      if prefix in seen:
                          continue
                      seen.add(prefix)
                      # An empty prefix denotes the default namespace.
                      declname = ("xmlns:" + prefix) if prefix else "xmlns"
                      # str() mirrors what the previous '%s' formatting
                      # did for non-string values.
                      # NOTE(review): assumes the mapping holds parsed
                      # (unescaped) URIs -- confirm where it is filled in.
                      value = escape(str(uri), {"'": "&apos;"})
                      decls.append(" %s='%s'" % (declname, value))
              context = context.parentNode
          return "\n".join(decls)
  class ParseEscape(Exception):
      """Control-flow exception InternalSubsetExtractor raises to stop
      parsing early."""
  class InternalSubsetExtractor(ExpatBuilder):
      """Builder variant that only captures the internal DTD subset.

      Parsing is abandoned by raising ParseEscape as soon as the doctype
      declaration ends, the doctype turns out to have no internal subset,
      or the first element starts.
      """

      # None until a doctype with an internal subset is seen; a list of
      # text chunks while collecting; the joined string once finished.
      subset = None

      def getSubset(self):
          """Return the captured internal subset (None if none was captured)."""
          return self.subset

      def parseFile(self, file):
          """Parse *file*, absorbing the ParseEscape that ends parsing early."""
          try:
              ExpatBuilder.parseFile(self, file)
          except ParseEscape:
              # Expected: the handlers raise this once they are done.
              pass

      def parseString(self, string):
          """Parse *string*, absorbing the ParseEscape that ends parsing early."""
          try:
              ExpatBuilder.parseString(self, string)
          except ParseEscape:
              # Expected: the handlers raise this once they are done.
              pass

      def install(self, parser):
          """Register only the doctype-start and element-start handlers."""
          parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
          parser.StartElementHandler = self.start_element_handler

      def start_doctype_decl_handler(self, name, publicId, systemId,
                                     has_internal_subset):
          """Start collecting the internal subset, or stop if there is none."""
          if not has_internal_subset:
              raise ParseEscape()
          parser = self.getParser()
          # Collect everything passed to the default handler until the
          # doctype declaration ends.
          self.subset = []
          parser.DefaultHandler = self.subset.append
          parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

      def end_doctype_decl_handler(self):
          """Join the collected chunks, normalize newlines and stop parsing."""
          text = ''.join(self.subset)
          # Convert '\r\n' first so it does not turn into two newlines.
          text = text.replace('\r\n', '\n')
          self.subset = text.replace('\r', '\n')
          raise ParseEscape()

      def start_element_handler(self, name, attrs):
          """Stop parsing as soon as the first element starts."""
          raise ParseEscape()
  def parse(file, namespaces=True):
      """Parse a document and return the resulting Document node.

      'file' is either a file name (a str, opened here in binary mode and
      closed afterwards) or an already open file object.  A true
      'namespaces' selects the namespace-aware builder.
      """
      builder = ExpatBuilderNS() if namespaces else ExpatBuilder()
      if not isinstance(file, str):
          # Already a file object: hand it to the builder as is.
          return builder.parseFile(file)
      with open(file, 'rb') as stream:
          return builder.parseFile(stream)
  def parseString(string, namespaces=True):
      """Parse a document held in a string and return the resulting
      Document node.

      A true 'namespaces' selects the namespace-aware builder.
      """
      builder = ExpatBuilderNS() if namespaces else ExpatBuilder()
      return builder.parseString(string)
  def parseFragment(file, context, namespaces=True):
      """Parse a document fragment in the context it was extracted from.

      'context' should be the parent of the node(s) in the fragment.
      'file' is either a file name (a str, opened here in binary mode and
      closed afterwards) or an already open file object.  A true
      'namespaces' selects the namespace-aware fragment builder.
      """
      builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
      builder = builder_class(context)
      if not isinstance(file, str):
          # Already a file object: hand it to the builder as is.
          return builder.parseFile(file)
      with open(file, 'rb') as stream:
          return builder.parseFile(stream)
  def parseFragmentString(string, context, namespaces=True):
      """Parse a document fragment held in a string, in the context it
      was extracted from.

      'context' should be the parent of the node(s) in the fragment.  A
      true 'namespaces' selects the namespace-aware fragment builder.
      """
      builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
      return builder_class(context).parseString(string)
  def makeBuilder(options):
      """Create a builder based on an Options object.

      options.namespaces decides between the namespace-aware builder and
      the plain one; 'options' itself is passed to the chosen builder.
      """
      builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
      return builder_class(options)