xml.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939
  1. """
  2. :mod:`pandas.io.xml` is a module for reading XML.
  3. """
  4. from __future__ import annotations
  5. import io
  6. from pandas._typing import (
  7. Buffer,
  8. CompressionOptions,
  9. FilePathOrBuffer,
  10. StorageOptions,
  11. )
  12. from pandas.compat._optional import import_optional_dependency
  13. from pandas.errors import (
  14. AbstractMethodError,
  15. ParserError,
  16. )
  17. from pandas.util._decorators import doc
  18. from pandas.core.dtypes.common import is_list_like
  19. from pandas.core.frame import DataFrame
  20. from pandas.core.shared_docs import _shared_docs
  21. from pandas.io.common import (
  22. file_exists,
  23. get_handle,
  24. is_fsspec_url,
  25. is_url,
  26. stringify_path,
  27. )
  28. from pandas.io.parsers import TextParser
  class _XMLFrameParser:
      """
      Internal base class to parse XML into DataFrames.

      Parameters
      ----------
      path_or_buffer : a valid XML str, path object or file-like object
          Any valid string path is acceptable. The string could be a URL. Valid
          URL schemes include http, ftp, s3, and file.

      xpath : str or regex
          The XPath expression to parse required set of nodes for
          migration to `Data Frame`. `etree` supports limited XPath.

      namespaces : dict
          The namespaces defined in XML document (`xmlns:namespace='URI'`)
          as dicts with key being namespace and value the URI.

      elems_only : bool
          Parse only the child elements at the specified `xpath`.

      attrs_only : bool
          Parse only the attributes at the specified `xpath`.

      names : list
          Column names for Data Frame of parsed XML data.

      encoding : str
          Encoding of xml object or document.

      stylesheet : str or file-like
          URL, file, file-like object, or a raw string containing XSLT,
          `etree` does not support XSLT but retained for consistency.

      compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
          Compression type for on-the-fly decompression of on-disk data.
          If 'infer', then use extension for gzip, bz2, zip or xz.

      storage_options : dict, optional
          Extra options that make sense for a particular storage connection,
          e.g. host, port, username, password, etc.

      See also
      --------
      pandas.io.xml._EtreeFrameParser
      pandas.io.xml._LxmlFrameParser

      Notes
      -----
      To subclass this class effectively you must override the following methods:
          * :func:`parse_data`
          * :func:`_parse_nodes`
          * :func:`_parse_doc`
          * :func:`_validate_names`
          * :func:`_validate_path`

      See each method's respective documentation for details on their
      functionality.
      """

      def __init__(
          self,
          path_or_buffer,
          xpath,
          namespaces,
          elems_only,
          attrs_only,
          names,
          encoding,
          stylesheet,
          compression,
          storage_options,
      ) -> None:
          # Store every option verbatim; no validation happens here.
          self.path_or_buffer = path_or_buffer
          self.xpath = xpath
          self.namespaces = namespaces
          self.elems_only = elems_only
          self.attrs_only = attrs_only
          self.names = names
          self.encoding = encoding
          self.stylesheet = stylesheet
          # NOTE(review): initialised to None and not read anywhere in the
          # code visible in this file -- confirm whether it is still needed.
          self.is_style = None
          self.compression = compression
          self.storage_options = storage_options

      def parse_data(self) -> list[dict[str, str | None]]:
          """
          Parse xml data.

          This method will call the other internal methods to
          validate xpath, names, parse and return specific nodes.
          """

          raise AbstractMethodError(self)

      def _parse_nodes(self) -> list[dict[str, str | None]]:
          """
          Parse xml nodes.

          This method will parse the children and attributes of elements
          in xpath, conditionally for only elements, only attributes
          or both while optionally renaming node names.

          Raises
          ------
          ValueError
              * If only elements and only attributes are specified.

          Notes
          -----
          Namespace URIs will be removed from return node values. Also,
          elements with missing children or attributes compared to siblings
          will have optional keys filled with None values.
          """

          raise AbstractMethodError(self)

      def _validate_path(self) -> None:
          """
          Validate xpath.

          This method checks for syntax, evaluation, or empty nodes return.

          Raises
          ------
          SyntaxError
              * If xpath is not supported or issues with namespaces.

          ValueError
              * If xpath does not return any nodes.
          """

          raise AbstractMethodError(self)

      def _validate_names(self) -> None:
          """
          Validate names.

          This method will check if names is a list-like and aligns
          with length of parse nodes.

          Raises
          ------
          ValueError
              * If value is not a list and less than length of nodes.
          """
          raise AbstractMethodError(self)

      def _parse_doc(self, raw_doc) -> bytes:
          """
          Build tree from path_or_buffer.

          This method will parse XML object into tree
          either from string/bytes or file location.
          """
          raise AbstractMethodError(self)
  class _EtreeFrameParser(_XMLFrameParser):
      """
      Internal class to parse XML into DataFrames with the Python
      standard library XML module: `xml.etree.ElementTree`.
      """

      def __init__(self, *args, **kwargs) -> None:
          super().__init__(*args, **kwargs)

      def parse_data(self) -> list[dict[str, str | None]]:
          """
          Parse xml data.

          Builds the tree from ``path_or_buffer``, validates ``xpath`` and
          ``names``, then returns one dict per node matched by ``xpath``.

          Raises
          ------
          ValueError
              * If ``stylesheet`` is given; this parser does not run XSLT.
          """
          from xml.etree.ElementTree import XML

          if self.stylesheet is not None:
              raise ValueError(
                  "To use stylesheet, you need lxml installed and selected as parser."
              )

          self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

          self._validate_path()
          self._validate_names()

          return self._parse_nodes()

      def _node_to_dict(self, el) -> dict[str, str | None]:
          """
          Build the raw (pre-alignment) row dict for one matched element.

          Keys may still carry a ``{uri}`` namespace prefix; the caller
          strips it.
          """
          if self.attrs_only:
              return {k: v.strip() if v else None for k, v in el.attrib.items()}

          children = el.findall("*")

          if self.elems_only and not self.names:
              # Child elements keyed by tag; the node's own text is not kept
              # in this mode.
              return {ch.tag: ch.text.strip() if ch.text else None for ch in children}

          row: dict[str, str | None] = {}
          if not self.elems_only:
              row.update(el.attrib)
          # Keep the node's own text only when it is more than whitespace.
          if el.text and not el.text.isspace():
              row[el.tag] = el.text.strip()
          if self.names:
              row.update(
                  {
                      nm: ch.text.strip() if ch.text else None
                      for nm, ch in zip(self.names, children)
                  }
              )
          else:
              row.update(
                  {ch.tag: ch.text.strip() if ch.text else None for ch in children}
              )
          return row

      def _parse_nodes(self) -> list[dict[str, str | None]]:
          """
          Parse xml nodes.

          Returns one dict per element matched by ``xpath``. Namespace URIs
          are removed from the keys, every dict is aligned to the union of
          keys (missing entries become None) and, when ``names`` is given,
          the keys are replaced positionally by ``names``.

          Raises
          ------
          ValueError
              * If both ``elems_only`` and ``attrs_only`` are set.
          """
          elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)

          if self.elems_only and self.attrs_only:
              raise ValueError("Either element or attributes can be parsed not both.")

          dicts: list[dict[str, str | None]] = [
              self._node_to_dict(el) for el in elems
          ]

          # Drop the "{uri}" prefix etree puts in front of namespaced names.
          dicts = [
              {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
              for d in dicts
          ]

          # Union of keys in first-seen order, so every row has every column.
          keys = list(dict.fromkeys(k for d in dicts for k in d))
          dicts = [{k: d.get(k) for k in keys} for d in dicts]

          if self.names:
              dicts = [dict(zip(self.names, d.values())) for d in dicts]

          return dicts

      def _validate_path(self) -> None:
          """
          Validate xpath.

          Raises
          ------
          SyntaxError
              * If `etree` rejects the expression or a namespace prefix
                is undeclared.

          ValueError
              * If xpath matches no nodes, or none of the matched nodes has
                a child element or an attribute.

          Notes
          -----
          `etree` supports limited XPath. If user attempts a more complex
          expression syntax error will raise.
          """

          msg = (
              "xpath does not return any nodes. "
              "If document uses namespaces denoted with "
              "xmlns, be sure to define namespaces and "
              "use them in xpath."
          )
          try:
              elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
          except (KeyError, SyntaxError) as err:
              raise SyntaxError(
                  "You have used an incorrect or unsupported XPath "
                  "expression for etree library or you used an "
                  "undeclared namespace prefix."
              ) from err

          if not elems:
              raise ValueError(msg)

          # ``attrib`` is always a dict (never None), so test for emptiness,
          # and look at every matched node rather than only the first one.
          has_children = any(el.find("*") is not None for el in elems)
          has_attrs = any(el.attrib for el in elems)
          if not has_children and not has_attrs:
              raise ValueError(msg)

      def _validate_names(self) -> None:
          """
          Validate names.

          Raises
          ------
          ValueError
              * If there are fewer names than child elements of the first
                node matched by xpath.

          TypeError
              * If names is not list-like.
          """
          if self.names:
              parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
              # Compare with None explicitly: truth-testing an Element
              # reflects its child count and is deprecated.
              children = parent.findall("*") if parent is not None else []

              if is_list_like(self.names):
                  if len(self.names) < len(children):
                      raise ValueError(
                          "names does not match length of child elements in xpath."
                      )
              else:
                  raise TypeError(
                      f"{type(self.names).__name__} is not a valid type for names"
                  )

      def _parse_doc(self, raw_doc) -> bytes:
          """
          Build tree from ``raw_doc`` and return it serialized as bytes.
          """
          from xml.etree.ElementTree import (
              XMLParser,
              parse,
              tostring,
          )

          handle_data = get_data_from_filepath(
              filepath_or_buffer=raw_doc,
              encoding=self.encoding,
              compression=self.compression,
              storage_options=self.storage_options,
          )

          with preprocess_data(handle_data) as xml_data:
              curr_parser = XMLParser(encoding=self.encoding)
              r = parse(xml_data, parser=curr_parser)

          return tostring(r.getroot())
  class _LxmlFrameParser(_XMLFrameParser):
      """
      Internal class to parse XML into DataFrames with third-party
      full-featured XML library, `lxml`, that supports
      XPath 1.0 and XSLT 1.0.
      """

      def __init__(self, *args, **kwargs) -> None:
          super().__init__(*args, **kwargs)

      def parse_data(self) -> list[dict[str, str | None]]:
          """
          Parse xml data.

          This method will call the other internal methods to
          validate xpath, names, optionally parse and run XSLT,
          and parse original or transformed XML and return specific nodes.
          """
          from lxml.etree import XML

          self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

          if self.stylesheet is not None:
              self.xsl_doc = XML(self._parse_doc(self.stylesheet))
              # From here on xpath is evaluated against the transformed tree.
              self.xml_doc = XML(self._transform_doc())

          self._validate_path()
          self._validate_names()

          return self._parse_nodes()

      def _node_to_dict(self, el) -> dict[str, str | None]:
          """
          Build the raw (pre-alignment) row dict for one matched element.

          Keys may still carry a ``{uri}`` namespace prefix; the caller
          strips it.
          """
          if self.attrs_only:
              return dict(el.attrib)

          children = el.xpath("*")

          if self.elems_only and not self.names:
              # Child elements keyed by tag; the node's own text is not kept
              # in this mode.
              return {ch.tag: ch.text.strip() if ch.text else None for ch in children}

          row: dict[str, str | None] = {}
          if not self.elems_only:
              row.update(dict(el.attrib))
          # Keep the node's own text only when it is more than whitespace.
          if el.text and not el.text.isspace():
              row[el.tag] = el.text.strip()
          if self.names:
              row.update(
                  {
                      nm: ch.text.strip() if ch.text else None
                      for nm, ch in zip(self.names, children)
                  }
              )
          else:
              row.update(
                  {ch.tag: ch.text.strip() if ch.text else None for ch in children}
              )
          return row

      def _parse_nodes(self) -> list[dict[str, str | None]]:
          """
          Parse xml nodes.

          Returns one dict per element matched by ``xpath``. Namespace URIs
          are removed from the keys, every dict is aligned to the union of
          keys (missing entries become None) and, when ``names`` is given,
          the keys are replaced positionally by ``names``.

          Raises
          ------
          ValueError
              * If both ``elems_only`` and ``attrs_only`` are set.
          """
          elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

          if self.elems_only and self.attrs_only:
              raise ValueError("Either element or attributes can be parsed not both.")

          dicts: list[dict[str, str | None]] = [
              self._node_to_dict(el) for el in elems
          ]

          # Strip the "{uri}" prefix from every key unconditionally. Probing
          # only the first key of the first row fails with IndexError when
          # that row is empty and misses namespaced keys that appear later.
          dicts = [
              {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
              for d in dicts
          ]

          # Union of keys in first-seen order, so every row has every column.
          keys = list(dict.fromkeys(k for d in dicts for k in d))
          dicts = [{k: d.get(k) for k in keys} for d in dicts]

          if self.names:
              dicts = [dict(zip(self.names, d.values())) for d in dicts]

          return dicts

      def _validate_path(self) -> None:
          """
          Validate xpath.

          Raises
          ------
          ValueError
              * If xpath matches no nodes, or the matched nodes have neither
                child elements nor attributes.
          """

          msg = (
              "xpath does not return any nodes. "
              "Be sure row level nodes are in xpath. "
              "If document uses namespaces denoted with "
              "xmlns, be sure to define namespaces and "
              "use them in xpath."
          )

          elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
          children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
          attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

          if elems == []:
              raise ValueError(msg)

          if elems != [] and attrs == [] and children == []:
              raise ValueError(msg)

      def _validate_names(self) -> None:
          """
          Validate names.

          This method will check if names is list-like and aligns with
          length of parse nodes.

          Raises
          ------
          ValueError
              * If there are fewer names than child elements of the first
                node matched by xpath.

          TypeError
              * If names is not list-like.
          """
          if self.names:
              children = self.xml_doc.xpath(
                  self.xpath + "[1]/*", namespaces=self.namespaces
              )

              if is_list_like(self.names):
                  if len(self.names) < len(children):
                      raise ValueError(
                          "names does not match length of child elements in xpath."
                      )
              else:
                  raise TypeError(
                      f"{type(self.names).__name__} is not a valid type for names"
                  )

      def _parse_doc(self, raw_doc) -> bytes:
          """
          Build tree from ``raw_doc`` and return it serialized as bytes.

          Raises
          ------
          TypeError
              * If the input is text and ``encoding`` is None, because the
                text has to be encoded before it is handed to lxml.
          """
          from lxml.etree import (
              XMLParser,
              fromstring,
              parse,
              tostring,
          )

          handle_data = get_data_from_filepath(
              filepath_or_buffer=raw_doc,
              encoding=self.encoding,
              compression=self.compression,
              storage_options=self.storage_options,
          )

          with preprocess_data(handle_data) as xml_data:
              curr_parser = XMLParser(encoding=self.encoding)

              if isinstance(xml_data, io.StringIO):
                  if self.encoding is None:
                      raise TypeError(
                          "Can not pass encoding None when input is StringIO."
                      )
                  # Text input is encoded to bytes before parsing.
                  doc = fromstring(
                      xml_data.getvalue().encode(self.encoding), parser=curr_parser
                  )
              else:
                  doc = parse(xml_data, parser=curr_parser)

          return tostring(doc)

      def _transform_doc(self) -> bytes:
          """
          Transform original tree using stylesheet.

          This method will transform original xml using XSLT script into
          an ideally flatter xml document for easier parsing and migration
          to Data Frame.
          """
          from lxml.etree import XSLT

          transformer = XSLT(self.xsl_doc)
          new_doc = transformer(self.xml_doc)

          return bytes(new_doc)
  def get_data_from_filepath(
      filepath_or_buffer,
      encoding,
      compression,
      storage_options,
  ) -> str | bytes | Buffer:
      """
      Extract raw XML data.

      The method accepts three input types:
          1. filepath (string-like)
          2. file-like object (e.g. open file object, StringIO)
          3. XML string or bytes

      For (1) -- a string that does not start with ``<`` and is a URL, an
      fsspec URL or an existing file -- the location is opened and the
      content read from it is returned. Input types (2) and (3), and any
      string that is not a known location, are returned unchanged.
      """
      filepath_or_buffer = stringify_path(filepath_or_buffer)

      # Only a string can name a location; buffers and bytes pass through.
      if not isinstance(filepath_or_buffer, str):
          return filepath_or_buffer

      # A leading "<" marks literal XML content rather than a path.
      if filepath_or_buffer.startswith(("<?xml", "<")):
          return filepath_or_buffer

      is_location = (
          is_url(filepath_or_buffer)
          or is_fsspec_url(filepath_or_buffer)
          or file_exists(filepath_or_buffer)
      )
      if not is_location:
          return filepath_or_buffer

      with get_handle(
          filepath_or_buffer,
          "r",
          encoding=encoding,
          compression=compression,
          storage_options=storage_options,
      ) as handle_obj:
          handle = handle_obj.handle
          return handle.read() if hasattr(handle, "read") else handle
  def preprocess_data(data) -> io.StringIO | io.BytesIO:
      """
      Convert extracted raw data.

      A ``str`` is wrapped in ``io.StringIO`` and ``bytes`` in
      ``io.BytesIO``; anything else (e.g. an object that already has a
      ``read`` attribute) is handed back untouched.
      """
      in_memory_wrappers = (
          (str, io.StringIO),
          (bytes, io.BytesIO),
      )
      for raw_type, make_buffer in in_memory_wrappers:
          if isinstance(data, raw_type):
              return make_buffer(data)

      return data
  521. def _data_to_frame(data, **kwargs) -> DataFrame:
  522. """
  523. Convert parsed data to Data Frame.
  524. This method will bind xml dictionary data of keys and values
  525. into named columns of Data Frame using the built-in TextParser
  526. class that build Data Frame and infers specific dtypes.
  527. """
  528. tags = next(iter(data))
  529. nodes = [list(d.values()) for d in data]
  530. try:
  531. with TextParser(nodes, names=tags, **kwargs) as tp:
  532. return tp.read()
  533. except ParserError:
  534. raise ParserError(
  535. "XML document may be too complex for import. "
  536. "Try to flatten document and use distinct "
  537. "element and attribute names."
  538. )
  539. def _parse(
  540. path_or_buffer,
  541. xpath,
  542. namespaces,
  543. elems_only,
  544. attrs_only,
  545. names,
  546. encoding,
  547. parser,
  548. stylesheet,
  549. compression,
  550. storage_options,
  551. **kwargs,
  552. ) -> DataFrame:
  553. """
  554. Call internal parsers.
  555. This method will conditionally call internal parsers:
  556. LxmlFrameParser and/or EtreeParser.
  557. Raises
  558. ------
  559. ImportError
  560. * If lxml is not installed if selected as parser.
  561. ValueError
  562. * If parser is not lxml or etree.
  563. """
  564. lxml = import_optional_dependency("lxml.etree", errors="ignore")
  565. p: _EtreeFrameParser | _LxmlFrameParser
  566. if parser == "lxml":
  567. if lxml is not None:
  568. p = _LxmlFrameParser(
  569. path_or_buffer,
  570. xpath,
  571. namespaces,
  572. elems_only,
  573. attrs_only,
  574. names,
  575. encoding,
  576. stylesheet,
  577. compression,
  578. storage_options,
  579. )
  580. else:
  581. raise ImportError("lxml not found, please install or use the etree parser.")
  582. elif parser == "etree":
  583. p = _EtreeFrameParser(
  584. path_or_buffer,
  585. xpath,
  586. namespaces,
  587. elems_only,
  588. attrs_only,
  589. names,
  590. encoding,
  591. stylesheet,
  592. compression,
  593. storage_options,
  594. )
  595. else:
  596. raise ValueError("Values for parser can only be lxml or etree.")
  597. data_dicts = p.parse_data()
  598. return _data_to_frame(data=data_dicts, **kwargs)
  @doc(storage_options=_shared_docs["storage_options"])
  def read_xml(
      path_or_buffer: FilePathOrBuffer,
      xpath: str | None = "./*",
      namespaces: dict | list[dict] | None = None,
      elems_only: bool | None = False,
      attrs_only: bool | None = False,
      names: list[str] | None = None,
      encoding: str | None = "utf-8",
      parser: str | None = "lxml",
      stylesheet: FilePathOrBuffer | None = None,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions = None,
  ) -> DataFrame:
      r"""
      Read XML document into a ``DataFrame`` object.

      .. versionadded:: 1.3.0

      Parameters
      ----------
      path_or_buffer : str, path object, or file-like object
          Any valid XML string or path is acceptable. The string could be a URL.
          Valid URL schemes include http, ftp, s3, and file.

      xpath : str, optional, default './\*'
          The XPath to parse required set of nodes for migration to DataFrame.
          XPath should return a collection of elements and not a single
          element. Note: The ``etree`` parser supports limited XPath
          expressions. For more complex XPath, use ``lxml`` which requires
          installation.

      namespaces : dict, optional
          The namespaces defined in XML document as dicts with key being
          namespace prefix and value the URI. There is no need to include all
          namespaces in XML, only the ones used in ``xpath`` expression.
          Note: if XML document uses default namespace denoted as
          `xmlns='<URI>'` without a prefix, you must assign any temporary
          namespace prefix such as 'doc' to the URI in order to parse
          underlying nodes and/or attributes. For example, ::

              namespaces = {{"doc": "https://example.com"}}

      elems_only : bool, optional, default False
          Parse only the child elements at the specified ``xpath``. By default,
          all child elements and non-empty text nodes are returned.

      attrs_only : bool, optional, default False
          Parse only the attributes at the specified ``xpath``.
          By default, all attributes are returned.

      names : list-like, optional
          Column names for DataFrame of parsed XML data. Use this parameter to
          rename original element names and distinguish same named elements.

      encoding : str, optional, default 'utf-8'
          Encoding of XML document.

      parser : {{'lxml','etree'}}, default 'lxml'
          Parser module to use for retrieval of data. Only 'lxml' and
          'etree' are supported. With 'lxml' more complex XPath searches
          and ability to use XSLT stylesheet are supported.

      stylesheet : str, path object or file-like object
          A URL, file-like object, or a raw string containing an XSLT script.
          This stylesheet should flatten complex, deeply nested XML documents
          for easier parsing. To use this feature you must have ``lxml`` module
          installed and specify 'lxml' as ``parser``. The ``xpath`` must
          reference nodes of transformed XML document generated after XSLT
          transformation and not the original XML document. Only XSLT 1.0
          scripts and not later versions are currently supported.

      compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
          For on-the-fly decompression of on-disk data. If 'infer', then use
          gzip, bz2, zip or xz if path_or_buffer is a string ending in
          '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
          otherwise. If using 'zip', the ZIP file must contain only one data
          file to be read in. Set to None for no decompression.

      {storage_options}

      Returns
      -------
      df
          A DataFrame.

      See Also
      --------
      read_json : Convert a JSON string to pandas object.
      read_html : Read HTML tables into a list of DataFrame objects.

      Notes
      -----
      This method is best designed to import shallow XML documents in
      following format which is the ideal fit for the two-dimensions of a
      ``DataFrame`` (row by column). ::

              <root>
                  <row>
                    <column1>data</column1>
                    <column2>data</column2>
                    <column3>data</column3>
                    ...
                 </row>
                 <row>
                    ...
                 </row>
                 ...
              </root>

      As a file format, XML documents can be designed any way including
      layout of elements and attributes as long as it conforms to W3C
      specifications. Therefore, this method is a convenience handler for
      a specific flatter design and not all possible XML structures.

      However, for more complex XML documents, ``stylesheet`` allows you to
      temporarily redesign original document with XSLT (a special purpose
      language) for a flatter version for migration to a DataFrame.

      This function will *always* return a single :class:`DataFrame` or raise
      exceptions due to issues with XML document, ``xpath``, or other
      parameters.

      Examples
      --------
      >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
      ... <data xmlns="http://example.com">
      ...  <row>
      ...    <shape>square</shape>
      ...    <degrees>360</degrees>
      ...    <sides>4.0</sides>
      ...  </row>
      ...  <row>
      ...    <shape>circle</shape>
      ...    <degrees>360</degrees>
      ...    <sides/>
      ...  </row>
      ...  <row>
      ...    <shape>triangle</shape>
      ...    <degrees>180</degrees>
      ...    <sides>3.0</sides>
      ...  </row>
      ... </data>'''

      >>> df = pd.read_xml(xml)
      >>> df
            shape  degrees  sides
      0    square      360    4.0
      1    circle      360    NaN
      2  triangle      180    3.0

      >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
      ... <data>
      ...   <row shape="square" degrees="360" sides="4.0"/>
      ...   <row shape="circle" degrees="360"/>
      ...   <row shape="triangle" degrees="180" sides="3.0"/>
      ... </data>'''

      >>> df = pd.read_xml(xml, xpath=".//row")
      >>> df
            shape  degrees  sides
      0    square      360    4.0
      1    circle      360    NaN
      2  triangle      180    3.0

      >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
      ... <doc:data xmlns:doc="https://example.com">
      ...   <doc:row>
      ...     <doc:shape>square</doc:shape>
      ...     <doc:degrees>360</doc:degrees>
      ...     <doc:sides>4.0</doc:sides>
      ...   </doc:row>
      ...   <doc:row>
      ...     <doc:shape>circle</doc:shape>
      ...     <doc:degrees>360</doc:degrees>
      ...     <doc:sides/>
      ...   </doc:row>
      ...   <doc:row>
      ...     <doc:shape>triangle</doc:shape>
      ...     <doc:degrees>180</doc:degrees>
      ...     <doc:sides>3.0</doc:sides>
      ...   </doc:row>
      ... </doc:data>'''

      >>> df = pd.read_xml(xml,
      ...                  xpath="//doc:row",
      ...                  namespaces={{"doc": "https://example.com"}})
      >>> df
            shape  degrees  sides
      0    square      360    4.0
      1    circle      360    NaN
      2  triangle      180    3.0
      """

      # Thin public wrapper: every argument is forwarded unchanged.
      return _parse(
          path_or_buffer=path_or_buffer,
          xpath=xpath,
          namespaces=namespaces,
          elems_only=elems_only,
          attrs_only=attrs_only,
          names=names,
          encoding=encoding,
          parser=parser,
          stylesheet=stylesheet,
          compression=compression,
          storage_options=storage_options,
      )