# expatbuilder.py
"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
  27. from xml.dom import xmlbuilder, minidom, Node
  28. from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
  29. from xml.parsers import expat
  30. from xml.dom.minidom import _append_child, _set_attribute_node
  31. from xml.dom.NodeFilter import NodeFilter
  32. TEXT_NODE = Node.TEXT_NODE
  33. CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
  34. DOCUMENT_NODE = Node.DOCUMENT_NODE
  35. FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
  36. FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
  37. FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
  38. FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
  39. theDOMImplementation = minidom.getDOMImplementation()
  40. # Expat typename -> TypeInfo
  41. _typeinfo_map = {
  42. "CDATA": minidom.TypeInfo(None, "cdata"),
  43. "ENUM": minidom.TypeInfo(None, "enumeration"),
  44. "ENTITY": minidom.TypeInfo(None, "entity"),
  45. "ENTITIES": minidom.TypeInfo(None, "entities"),
  46. "ID": minidom.TypeInfo(None, "id"),
  47. "IDREF": minidom.TypeInfo(None, "idref"),
  48. "IDREFS": minidom.TypeInfo(None, "idrefs"),
  49. "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
  50. "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
  51. }
  52. class ElementInfo(object):
  53. __slots__ = '_attr_info', '_model', 'tagName'
  54. def __init__(self, tagName, model=None):
  55. self.tagName = tagName
  56. self._attr_info = []
  57. self._model = model
  58. def __getstate__(self):
  59. return self._attr_info, self._model, self.tagName
  60. def __setstate__(self, state):
  61. self._attr_info, self._model, self.tagName = state
  62. def getAttributeType(self, aname):
  63. for info in self._attr_info:
  64. if info[1] == aname:
  65. t = info[-2]
  66. if t[0] == "(":
  67. return _typeinfo_map["ENUM"]
  68. else:
  69. return _typeinfo_map[info[-2]]
  70. return minidom._no_type
  71. def getAttributeTypeNS(self, namespaceURI, localName):
  72. return minidom._no_type
  73. def isElementContent(self):
  74. if self._model:
  75. type = self._model[0]
  76. return type not in (expat.model.XML_CTYPE_ANY,
  77. expat.model.XML_CTYPE_MIXED)
  78. else:
  79. return False
  80. def isEmpty(self):
  81. if self._model:
  82. return self._model[0] == expat.model.XML_CTYPE_EMPTY
  83. else:
  84. return False
  85. def isId(self, aname):
  86. for info in self._attr_info:
  87. if info[1] == aname:
  88. return info[-2] == "ID"
  89. return False
  90. def isIdNS(self, euri, ename, auri, aname):
  91. # not sure this is meaningful
  92. return self.isId((auri, aname))
  93. def _intern(builder, s):
  94. return builder._intern_setdefault(s, s)
  95. def _parse_ns_name(builder, name):
  96. assert ' ' in name
  97. parts = name.split(' ')
  98. intern = builder._intern_setdefault
  99. if len(parts) == 3:
  100. uri, localname, prefix = parts
  101. prefix = intern(prefix, prefix)
  102. qname = "%s:%s" % (prefix, localname)
  103. qname = intern(qname, qname)
  104. localname = intern(localname, localname)
  105. elif len(parts) == 2:
  106. uri, localname = parts
  107. prefix = EMPTY_PREFIX
  108. qname = localname = intern(localname, localname)
  109. else:
  110. raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
  111. return intern(uri, uri), localname, prefix, qname
  112. class ExpatBuilder:
  113. """Document builder that uses Expat to build a ParsedXML.DOM document
  114. instance."""
  115. def __init__(self, options=None):
  116. if options is None:
  117. options = xmlbuilder.Options()
  118. self._options = options
  119. if self._options.filter is not None:
  120. self._filter = FilterVisibilityController(self._options.filter)
  121. else:
  122. self._filter = None
  123. # This *really* doesn't do anything in this case, so
  124. # override it with something fast & minimal.
  125. self._finish_start_element = id
  126. self._parser = None
  127. self.reset()
  128. def createParser(self):
  129. """Create a new parser object."""
  130. return expat.ParserCreate()
  131. def getParser(self):
  132. """Return the parser object, creating a new one if needed."""
  133. if not self._parser:
  134. self._parser = self.createParser()
  135. self._intern_setdefault = self._parser.intern.setdefault
  136. self._parser.buffer_text = True
  137. self._parser.ordered_attributes = True
  138. self._parser.specified_attributes = True
  139. self.install(self._parser)
  140. return self._parser
  141. def reset(self):
  142. """Free all data structures used during DOM construction."""
  143. self.document = theDOMImplementation.createDocument(
  144. EMPTY_NAMESPACE, None, None)
  145. self.curNode = self.document
  146. self._elem_info = self.document._elem_info
  147. self._cdata = False
  148. def install(self, parser):
  149. """Install the callbacks needed to build the DOM into the parser."""
  150. # This creates circular references!
  151. parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
  152. parser.StartElementHandler = self.first_element_handler
  153. parser.EndElementHandler = self.end_element_handler
  154. parser.ProcessingInstructionHandler = self.pi_handler
  155. if self._options.entities:
  156. parser.EntityDeclHandler = self.entity_decl_handler
  157. parser.NotationDeclHandler = self.notation_decl_handler
  158. if self._options.comments:
  159. parser.CommentHandler = self.comment_handler
  160. if self._options.cdata_sections:
  161. parser.StartCdataSectionHandler = self.start_cdata_section_handler
  162. parser.EndCdataSectionHandler = self.end_cdata_section_handler
  163. parser.CharacterDataHandler = self.character_data_handler_cdata
  164. else:
  165. parser.CharacterDataHandler = self.character_data_handler
  166. parser.ExternalEntityRefHandler = self.external_entity_ref_handler
  167. parser.XmlDeclHandler = self.xml_decl_handler
  168. parser.ElementDeclHandler = self.element_decl_handler
  169. parser.AttlistDeclHandler = self.attlist_decl_handler
  170. def parseFile(self, file):
  171. """Parse a document from a file object, returning the document
  172. node."""
  173. parser = self.getParser()
  174. first_buffer = True
  175. try:
  176. while 1:
  177. buffer = file.read(16*1024)
  178. if not buffer:
  179. break
  180. parser.Parse(buffer, False)
  181. if first_buffer and self.document.documentElement:
  182. self._setup_subset(buffer)
  183. first_buffer = False
  184. parser.Parse(b"", True)
  185. except ParseEscape:
  186. pass
  187. doc = self.document
  188. self.reset()
  189. self._parser = None
  190. return doc
  191. def parseString(self, string):
  192. """Parse a document from a string, returning the document node."""
  193. parser = self.getParser()
  194. try:
  195. parser.Parse(string, True)
  196. self._setup_subset(string)
  197. except ParseEscape:
  198. pass
  199. doc = self.document
  200. self.reset()
  201. self._parser = None
  202. return doc
  203. def _setup_subset(self, buffer):
  204. """Load the internal subset if there might be one."""
  205. if self.document.doctype:
  206. extractor = InternalSubsetExtractor()
  207. extractor.parseString(buffer)
  208. subset = extractor.getSubset()
  209. self.document.doctype.internalSubset = subset
  210. def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
  211. has_internal_subset):
  212. doctype = self.document.implementation.createDocumentType(
  213. doctypeName, publicId, systemId)
  214. doctype.ownerDocument = self.document
  215. _append_child(self.document, doctype)
  216. self.document.doctype = doctype
  217. if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
  218. self.document.doctype = None
  219. del self.document.childNodes[-1]
  220. doctype = None
  221. self._parser.EntityDeclHandler = None
  222. self._parser.NotationDeclHandler = None
  223. if has_internal_subset:
  224. if doctype is not None:
  225. doctype.entities._seq = []
  226. doctype.notations._seq = []
  227. self._parser.CommentHandler = None
  228. self._parser.ProcessingInstructionHandler = None
  229. self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
  230. def end_doctype_decl_handler(self):
  231. if self._options.comments:
  232. self._parser.CommentHandler = self.comment_handler
  233. self._parser.ProcessingInstructionHandler = self.pi_handler
  234. if not (self._elem_info or self._filter):
  235. self._finish_end_element = id
  236. def pi_handler(self, target, data):
  237. node = self.document.createProcessingInstruction(target, data)
  238. _append_child(self.curNode, node)
  239. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  240. self.curNode.removeChild(node)
  241. def character_data_handler_cdata(self, data):
  242. childNodes = self.curNode.childNodes
  243. if self._cdata:
  244. if ( self._cdata_continue
  245. and childNodes[-1].nodeType == CDATA_SECTION_NODE):
  246. childNodes[-1].appendData(data)
  247. return
  248. node = self.document.createCDATASection(data)
  249. self._cdata_continue = True
  250. elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
  251. node = childNodes[-1]
  252. value = node.data + data
  253. node.data = value
  254. return
  255. else:
  256. node = minidom.Text()
  257. node.data = data
  258. node.ownerDocument = self.document
  259. _append_child(self.curNode, node)
  260. def character_data_handler(self, data):
  261. childNodes = self.curNode.childNodes
  262. if childNodes and childNodes[-1].nodeType == TEXT_NODE:
  263. node = childNodes[-1]
  264. node.data = node.data + data
  265. return
  266. node = minidom.Text()
  267. node.data = node.data + data
  268. node.ownerDocument = self.document
  269. _append_child(self.curNode, node)
  270. def entity_decl_handler(self, entityName, is_parameter_entity, value,
  271. base, systemId, publicId, notationName):
  272. if is_parameter_entity:
  273. # we don't care about parameter entities for the DOM
  274. return
  275. if not self._options.entities:
  276. return
  277. node = self.document._create_entity(entityName, publicId,
  278. systemId, notationName)
  279. if value is not None:
  280. # internal entity
  281. # node *should* be readonly, but we'll cheat
  282. child = self.document.createTextNode(value)
  283. node.childNodes.append(child)
  284. self.document.doctype.entities._seq.append(node)
  285. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  286. del self.document.doctype.entities._seq[-1]
  287. def notation_decl_handler(self, notationName, base, systemId, publicId):
  288. node = self.document._create_notation(notationName, publicId, systemId)
  289. self.document.doctype.notations._seq.append(node)
  290. if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
  291. del self.document.doctype.notations._seq[-1]
  292. def comment_handler(self, data):
  293. node = self.document.createComment(data)
  294. _append_child(self.curNode, node)
  295. if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
  296. self.curNode.removeChild(node)
  297. def start_cdata_section_handler(self):
  298. self._cdata = True
  299. self._cdata_continue = False
  300. def end_cdata_section_handler(self):
  301. self._cdata = False
  302. self._cdata_continue = False
  303. def external_entity_ref_handler(self, context, base, systemId, publicId):
  304. return 1
  305. def first_element_handler(self, name, attributes):
  306. if self._filter is None and not self._elem_info:
  307. self._finish_end_element = id
  308. self.getParser().StartElementHandler = self.start_element_handler
  309. self.start_element_handler(name, attributes)
  310. def start_element_handler(self, name, attributes):
  311. node = self.document.createElement(name)
  312. _append_child(self.curNode, node)
  313. self.curNode = node
  314. if attributes:
  315. for i in range(0, len(attributes), 2):
  316. a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
  317. None, EMPTY_PREFIX)
  318. value = attributes[i+1]
  319. a.value = value
  320. a.ownerDocument = self.document
  321. _set_attribute_node(node, a)
  322. if node is not self.document.documentElement:
  323. self._finish_start_element(node)
  324. def _finish_start_element(self, node):
  325. if self._filter:
  326. # To be general, we'd have to call isSameNode(), but this
  327. # is sufficient for minidom:
  328. if node is self.document.documentElement:
  329. return
  330. filt = self._filter.startContainer(node)
  331. if filt == FILTER_REJECT:
  332. # ignore this node & all descendents
  333. Rejecter(self)
  334. elif filt == FILTER_SKIP:
  335. # ignore this node, but make it's children become
  336. # children of the parent node
  337. Skipper(self)
  338. else:
  339. return
  340. self.curNode = node.parentNode
  341. node.parentNode.removeChild(node)
  342. node.unlink()
  343. # If this ever changes, Namespaces.end_element_handler() needs to
  344. # be changed to match.
  345. #
  346. def end_element_handler(self, name):
  347. curNode = self.curNode
  348. self.curNode = curNode.parentNode
  349. self._finish_end_element(curNode)
  350. def _finish_end_element(self, curNode):
  351. info = self._elem_info.get(curNode.tagName)
  352. if info:
  353. self._handle_white_text_nodes(curNode, info)
  354. if self._filter:
  355. if curNode is self.document.documentElement:
  356. return
  357. if self._filter.acceptNode(curNode) == FILTER_REJECT:
  358. self.curNode.removeChild(curNode)
  359. curNode.unlink()
  360. def _handle_white_text_nodes(self, node, info):
  361. if (self._options.whitespace_in_element_content
  362. or not info.isElementContent()):
  363. return
  364. # We have element type information and should remove ignorable
  365. # whitespace; identify for text nodes which contain only
  366. # whitespace.
  367. L = []
  368. for child in node.childNodes:
  369. if child.nodeType == TEXT_NODE and not child.data.strip():
  370. L.append(child)
  371. # Remove ignorable whitespace from the tree.
  372. for child in L:
  373. node.removeChild(child)
  374. def element_decl_handler(self, name, model):
  375. info = self._elem_info.get(name)
  376. if info is None:
  377. self._elem_info[name] = ElementInfo(name, model)
  378. else:
  379. assert info._model is None
  380. info._model = model
  381. def attlist_decl_handler(self, elem, name, type, default, required):
  382. info = self._elem_info.get(elem)
  383. if info is None:
  384. info = ElementInfo(elem)
  385. self._elem_info[elem] = info
  386. info._attr_info.append(
  387. [None, name, None, None, default, 0, type, required])
  388. def xml_decl_handler(self, version, encoding, standalone):
  389. self.document.version = version
  390. self.document.encoding = encoding
  391. # This is still a little ugly, thanks to the pyexpat API. ;-(
  392. if standalone >= 0:
  393. if standalone:
  394. self.document.standalone = True
  395. else:
  396. self.document.standalone = False
  397. # Don't include FILTER_INTERRUPT, since that's checked separately
  398. # where allowed.
  399. _ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
  400. class FilterVisibilityController(object):
  401. """Wrapper around a DOMBuilderFilter which implements the checks
  402. to make the whatToShow filter attribute work."""
  403. __slots__ = 'filter',
  404. def __init__(self, filter):
  405. self.filter = filter
  406. def startContainer(self, node):
  407. mask = self._nodetype_mask[node.nodeType]
  408. if self.filter.whatToShow & mask:
  409. val = self.filter.startContainer(node)
  410. if val == FILTER_INTERRUPT:
  411. raise ParseEscape
  412. if val not in _ALLOWED_FILTER_RETURNS:
  413. raise ValueError(
  414. "startContainer() returned illegal value: " + repr(val))
  415. return val
  416. else:
  417. return FILTER_ACCEPT
  418. def acceptNode(self, node):
  419. mask = self._nodetype_mask[node.nodeType]
  420. if self.filter.whatToShow & mask:
  421. val = self.filter.acceptNode(node)
  422. if val == FILTER_INTERRUPT:
  423. raise ParseEscape
  424. if val == FILTER_SKIP:
  425. # move all child nodes to the parent, and remove this node
  426. parent = node.parentNode
  427. for child in node.childNodes[:]:
  428. parent.appendChild(child)
  429. # node is handled by the caller
  430. return FILTER_REJECT
  431. if val not in _ALLOWED_FILTER_RETURNS:
  432. raise ValueError(
  433. "acceptNode() returned illegal value: " + repr(val))
  434. return val
  435. else:
  436. return FILTER_ACCEPT
  437. _nodetype_mask = {
  438. Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
  439. Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
  440. Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
  441. Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
  442. Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
  443. Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
  444. Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
  445. Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
  446. Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
  447. Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
  448. Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
  449. Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
  450. }
  451. class FilterCrutch(object):
  452. __slots__ = '_builder', '_level', '_old_start', '_old_end'
  453. def __init__(self, builder):
  454. self._level = 0
  455. self._builder = builder
  456. parser = builder._parser
  457. self._old_start = parser.StartElementHandler
  458. self._old_end = parser.EndElementHandler
  459. parser.StartElementHandler = self.start_element_handler
  460. parser.EndElementHandler = self.end_element_handler
  461. class Rejecter(FilterCrutch):
  462. __slots__ = ()
  463. def __init__(self, builder):
  464. FilterCrutch.__init__(self, builder)
  465. parser = builder._parser
  466. for name in ("ProcessingInstructionHandler",
  467. "CommentHandler",
  468. "CharacterDataHandler",
  469. "StartCdataSectionHandler",
  470. "EndCdataSectionHandler",
  471. "ExternalEntityRefHandler",
  472. ):
  473. setattr(parser, name, None)
  474. def start_element_handler(self, *args):
  475. self._level = self._level + 1
  476. def end_element_handler(self, *args):
  477. if self._level == 0:
  478. # restore the old handlers
  479. parser = self._builder._parser
  480. self._builder.install(parser)
  481. parser.StartElementHandler = self._old_start
  482. parser.EndElementHandler = self._old_end
  483. else:
  484. self._level = self._level - 1
  485. class Skipper(FilterCrutch):
  486. __slots__ = ()
  487. def start_element_handler(self, *args):
  488. node = self._builder.curNode
  489. self._old_start(*args)
  490. if self._builder.curNode is not node:
  491. self._level = self._level + 1
  492. def end_element_handler(self, *args):
  493. if self._level == 0:
  494. # We're popping back out of the node we're skipping, so we
  495. # shouldn't need to do anything but reset the handlers.
  496. self._builder._parser.StartElementHandler = self._old_start
  497. self._builder._parser.EndElementHandler = self._old_end
  498. self._builder = None
  499. else:
  500. self._level = self._level - 1
  501. self._old_end(*args)
  502. # framework document used by the fragment builder.
  503. # Takes a string for the doctype, subset string, and namespace attrs string.
  504. _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
  505. "http://xml.python.org/entities/fragment-builder/internal"
  506. _FRAGMENT_BUILDER_TEMPLATE = (
  507. '''\
  508. <!DOCTYPE wrapper
  509. %%s [
  510. <!ENTITY fragment-builder-internal
  511. SYSTEM "%s">
  512. %%s
  513. ]>
  514. <wrapper %%s
  515. >&fragment-builder-internal;</wrapper>'''
  516. % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
  517. class FragmentBuilder(ExpatBuilder):
  518. """Builder which constructs document fragments given XML source
  519. text and a context node.
  520. The context node is expected to provide information about the
  521. namespace declarations which are in scope at the start of the
  522. fragment.
  523. """
  524. def __init__(self, context, options=None):
  525. if context.nodeType == DOCUMENT_NODE:
  526. self.originalDocument = context
  527. self.context = context
  528. else:
  529. self.originalDocument = context.ownerDocument
  530. self.context = context
  531. ExpatBuilder.__init__(self, options)
  532. def reset(self):
  533. ExpatBuilder.reset(self)
  534. self.fragment = None
  535. def parseFile(self, file):
  536. """Parse a document fragment from a file object, returning the
  537. fragment node."""
  538. return self.parseString(file.read())
  539. def parseString(self, string):
  540. """Parse a document fragment from a string, returning the
  541. fragment node."""
  542. self._source = string
  543. parser = self.getParser()
  544. doctype = self.originalDocument.doctype
  545. ident = ""
  546. if doctype:
  547. subset = doctype.internalSubset or self._getDeclarations()
  548. if doctype.publicId:
  549. ident = ('PUBLIC "%s" "%s"'
  550. % (doctype.publicId, doctype.systemId))
  551. elif doctype.systemId:
  552. ident = 'SYSTEM "%s"' % doctype.systemId
  553. else:
  554. subset = ""
  555. nsattrs = self._getNSattrs() # get ns decls from node's ancestors
  556. document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
  557. try:
  558. parser.Parse(document, True)
  559. except:
  560. self.reset()
  561. raise
  562. fragment = self.fragment
  563. self.reset()
  564. ## self._parser = None
  565. return fragment
  566. def _getDeclarations(self):
  567. """Re-create the internal subset from the DocumentType node.
  568. This is only needed if we don't already have the
  569. internalSubset as a string.
  570. """
  571. doctype = self.context.ownerDocument.doctype
  572. s = ""
  573. if doctype:
  574. for i in range(doctype.notations.length):
  575. notation = doctype.notations.item(i)
  576. if s:
  577. s = s + "\n "
  578. s = "%s<!NOTATION %s" % (s, notation.nodeName)
  579. if notation.publicId:
  580. s = '%s PUBLIC "%s"\n "%s">' \
  581. % (s, notation.publicId, notation.systemId)
  582. else:
  583. s = '%s SYSTEM "%s">' % (s, notation.systemId)
  584. for i in range(doctype.entities.length):
  585. entity = doctype.entities.item(i)
  586. if s:
  587. s = s + "\n "
  588. s = "%s<!ENTITY %s" % (s, entity.nodeName)
  589. if entity.publicId:
  590. s = '%s PUBLIC "%s"\n "%s"' \
  591. % (s, entity.publicId, entity.systemId)
  592. elif entity.systemId:
  593. s = '%s SYSTEM "%s"' % (s, entity.systemId)
  594. else:
  595. s = '%s "%s"' % (s, entity.firstChild.data)
  596. if entity.notationName:
  597. s = "%s NOTATION %s" % (s, entity.notationName)
  598. s = s + ">"
  599. return s
  600. def _getNSattrs(self):
  601. return ""
  602. def external_entity_ref_handler(self, context, base, systemId, publicId):
  603. if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
  604. # this entref is the one that we made to put the subtree
  605. # in; all of our given input is parsed in here.
  606. old_document = self.document
  607. old_cur_node = self.curNode
  608. parser = self._parser.ExternalEntityParserCreate(context)
  609. # put the real document back, parse into the fragment to return
  610. self.document = self.originalDocument
  611. self.fragment = self.document.createDocumentFragment()
  612. self.curNode = self.fragment
  613. try:
  614. parser.Parse(self._source, True)
  615. finally:
  616. self.curNode = old_cur_node
  617. self.document = old_document
  618. self._source = None
  619. return -1
  620. else:
  621. return ExpatBuilder.external_entity_ref_handler(
  622. self, context, base, systemId, publicId)
  623. class Namespaces:
  624. """Mix-in class for builders; adds support for namespaces."""
  625. def _initNamespaces(self):
  626. # list of (prefix, uri) ns declarations. Namespace attrs are
  627. # constructed from this and added to the element's attrs.
  628. self._ns_ordered_prefixes = []
  629. def createParser(self):
  630. """Create a new namespace-handling parser."""
  631. parser = expat.ParserCreate(namespace_separator=" ")
  632. parser.namespace_prefixes = True
  633. return parser
  634. def install(self, parser):
  635. """Insert the namespace-handlers onto the parser."""
  636. ExpatBuilder.install(self, parser)
  637. if self._options.namespace_declarations:
  638. parser.StartNamespaceDeclHandler = (
  639. self.start_namespace_decl_handler)
  640. def start_namespace_decl_handler(self, prefix, uri):
  641. """Push this namespace declaration on our storage."""
  642. self._ns_ordered_prefixes.append((prefix, uri))
  643. def start_element_handler(self, name, attributes):
  644. if ' ' in name:
  645. uri, localname, prefix, qname = _parse_ns_name(self, name)
  646. else:
  647. uri = EMPTY_NAMESPACE
  648. qname = name
  649. localname = None
  650. prefix = EMPTY_PREFIX
  651. node = minidom.Element(qname, uri, prefix, localname)
  652. node.ownerDocument = self.document
  653. _append_child(self.curNode, node)
  654. self.curNode = node
  655. if self._ns_ordered_prefixes:
  656. for prefix, uri in self._ns_ordered_prefixes:
  657. if prefix:
  658. a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
  659. XMLNS_NAMESPACE, prefix, "xmlns")
  660. else:
  661. a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
  662. "xmlns", EMPTY_PREFIX)
  663. a.value = uri
  664. a.ownerDocument = self.document
  665. _set_attribute_node(node, a)
  666. del self._ns_ordered_prefixes[:]
  667. if attributes:
  668. node._ensure_attributes()
  669. _attrs = node._attrs
  670. _attrsNS = node._attrsNS
  671. for i in range(0, len(attributes), 2):
  672. aname = attributes[i]
  673. value = attributes[i+1]
  674. if ' ' in aname:
  675. uri, localname, prefix, qname = _parse_ns_name(self, aname)
  676. a = minidom.Attr(qname, uri, localname, prefix)
  677. _attrs[qname] = a
  678. _attrsNS[(uri, localname)] = a
  679. else:
  680. a = minidom.Attr(aname, EMPTY_NAMESPACE,
  681. aname, EMPTY_PREFIX)
  682. _attrs[aname] = a
  683. _attrsNS[(EMPTY_NAMESPACE, aname)] = a
  684. a.ownerDocument = self.document
  685. a.value = value
  686. a.ownerElement = node
  687. if __debug__:
  688. # This only adds some asserts to the original
  689. # end_element_handler(), so we only define this when -O is not
  690. # used. If changing one, be sure to check the other to see if
  691. # it needs to be changed as well.
  692. #
  693. def end_element_handler(self, name):
  694. curNode = self.curNode
  695. if ' ' in name:
  696. uri, localname, prefix, qname = _parse_ns_name(self, name)
  697. assert (curNode.namespaceURI == uri
  698. and curNode.localName == localname
  699. and curNode.prefix == prefix), \
  700. "element stack messed up! (namespace)"
  701. else:
  702. assert curNode.nodeName == name, \
  703. "element stack messed up - bad nodeName"
  704. assert curNode.namespaceURI == EMPTY_NAMESPACE, \
  705. "element stack messed up - bad namespaceURI"
  706. self.curNode = curNode.parentNode
  707. self._finish_end_element(curNode)
  708. class ExpatBuilderNS(Namespaces, ExpatBuilder):
  709. """Document builder that supports namespaces."""
  710. def reset(self):
  711. ExpatBuilder.reset(self)
  712. self._initNamespaces()
  713. class FragmentBuilderNS(Namespaces, FragmentBuilder):
  714. """Fragment builder that supports namespaces."""
  715. def reset(self):
  716. FragmentBuilder.reset(self)
  717. self._initNamespaces()
  718. def _getNSattrs(self):
  719. """Return string of namespace attributes from this element and
  720. ancestors."""
  721. # XXX This needs to be re-written to walk the ancestors of the
  722. # context to build up the namespace information from
  723. # declarations, elements, and attributes found in context.
  724. # Otherwise we have to store a bunch more data on the DOM
  725. # (though that *might* be more reliable -- not clear).
  726. attrs = ""
  727. context = self.context
  728. L = []
  729. while context:
  730. if hasattr(context, '_ns_prefix_uri'):
  731. for prefix, uri in context._ns_prefix_uri.items():
  732. # add every new NS decl from context to L and attrs string
  733. if prefix in L:
  734. continue
  735. L.append(prefix)
  736. if prefix:
  737. declname = "xmlns:" + prefix
  738. else:
  739. declname = "xmlns"
  740. if attrs:
  741. attrs = "%s\n %s='%s'" % (attrs, declname, uri)
  742. else:
  743. attrs = " %s='%s'" % (declname, uri)
  744. context = context.parentNode
  745. return attrs
  746. class ParseEscape(Exception):
  747. """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
  748. pass
  749. class InternalSubsetExtractor(ExpatBuilder):
  750. """XML processor which can rip out the internal document type subset."""
  751. subset = None
  752. def getSubset(self):
  753. """Return the internal subset as a string."""
  754. return self.subset
  755. def parseFile(self, file):
  756. try:
  757. ExpatBuilder.parseFile(self, file)
  758. except ParseEscape:
  759. pass
  760. def parseString(self, string):
  761. try:
  762. ExpatBuilder.parseString(self, string)
  763. except ParseEscape:
  764. pass
  765. def install(self, parser):
  766. parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
  767. parser.StartElementHandler = self.start_element_handler
  768. def start_doctype_decl_handler(self, name, publicId, systemId,
  769. has_internal_subset):
  770. if has_internal_subset:
  771. parser = self.getParser()
  772. self.subset = []
  773. parser.DefaultHandler = self.subset.append
  774. parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
  775. else:
  776. raise ParseEscape()
  777. def end_doctype_decl_handler(self):
  778. s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
  779. self.subset = s
  780. raise ParseEscape()
  781. def start_element_handler(self, name, attrs):
  782. raise ParseEscape()
  783. def parse(file, namespaces=True):
  784. """Parse a document, returning the resulting Document node.
  785. 'file' may be either a file name or an open file object.
  786. """
  787. if namespaces:
  788. builder = ExpatBuilderNS()
  789. else:
  790. builder = ExpatBuilder()
  791. if isinstance(file, str):
  792. with open(file, 'rb') as fp:
  793. result = builder.parseFile(fp)
  794. else:
  795. result = builder.parseFile(file)
  796. return result
  797. def parseString(string, namespaces=True):
  798. """Parse a document from a string, returning the resulting
  799. Document node.
  800. """
  801. if namespaces:
  802. builder = ExpatBuilderNS()
  803. else:
  804. builder = ExpatBuilder()
  805. return builder.parseString(string)
  806. def parseFragment(file, context, namespaces=True):
  807. """Parse a fragment of a document, given the context from which it
  808. was originally extracted. context should be the parent of the
  809. node(s) which are in the fragment.
  810. 'file' may be either a file name or an open file object.
  811. """
  812. if namespaces:
  813. builder = FragmentBuilderNS(context)
  814. else:
  815. builder = FragmentBuilder(context)
  816. if isinstance(file, str):
  817. with open(file, 'rb') as fp:
  818. result = builder.parseFile(fp)
  819. else:
  820. result = builder.parseFile(file)
  821. return result
  822. def parseFragmentString(string, context, namespaces=True):
  823. """Parse a fragment of a document from a string, given the context
  824. from which it was originally extracted. context should be the
  825. parent of the node(s) which are in the fragment.
  826. """
  827. if namespaces:
  828. builder = FragmentBuilderNS(context)
  829. else:
  830. builder = FragmentBuilder(context)
  831. return builder.parseString(string)
  832. def makeBuilder(options):
  833. """Create a builder based on an Options object."""
  834. if options.namespaces:
  835. return ExpatBuilderNS(options)
  836. else:
  837. return ExpatBuilder(options)