common.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953
  1. """Common IO api utilities"""
  2. from __future__ import annotations
  3. import bz2
  4. import codecs
  5. from collections import abc
  6. import dataclasses
  7. import gzip
  8. from io import (
  9. BufferedIOBase,
  10. BytesIO,
  11. RawIOBase,
  12. StringIO,
  13. TextIOWrapper,
  14. )
  15. import mmap
  16. import os
  17. from typing import (
  18. IO,
  19. Any,
  20. AnyStr,
  21. Mapping,
  22. cast,
  23. )
  24. from urllib.parse import (
  25. urljoin,
  26. urlparse as parse_url,
  27. uses_netloc,
  28. uses_params,
  29. uses_relative,
  30. )
  31. import warnings
  32. import zipfile
  33. from pandas._typing import (
  34. Buffer,
  35. CompressionDict,
  36. CompressionOptions,
  37. FileOrBuffer,
  38. FilePathOrBuffer,
  39. StorageOptions,
  40. )
  41. from pandas.compat import (
  42. get_lzma_file,
  43. import_lzma,
  44. )
  45. from pandas.compat._optional import import_optional_dependency
  46. from pandas.core.dtypes.common import is_file_like
# compat wrapper around the lzma import — presumably tolerates Python builds
# shipped without the lzma module (see pandas.compat); TODO confirm
lzma = import_lzma()

# URL schemes urllib recognizes (http, ftp, ...); the empty string is dropped
# so that bare relative paths are never classified as URLs by is_url()
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
@dataclasses.dataclass
class IOArgs:
    """
    Return value of io/common.py:_get_filepath_or_buffer.

    Note (copy & paste from io/parsers):
    filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
    though mypy handling of conditional imports is difficult.
    See https://github.com/python/mypy/issues/1297
    """

    # resolved handle: either a local path/bytes or an already-open buffer
    filepath_or_buffer: FileOrBuffer
    # normalized encoding name (lower-cased, "_" replaced by "-")
    encoding: str
    # effective open mode (may have gained a "b" for fsspec/URL handles)
    mode: str
    # normalized compression options; always contains a "method" key
    compression: CompressionDict
    # True when the buffer was opened here and must be closed by get_handle
    should_close: bool = False
  64. @dataclasses.dataclass
  65. class IOHandles:
  66. """
  67. Return value of io/common.py:get_handle
  68. Can be used as a context manager.
  69. This is used to easily close created buffers and to handle corner cases when
  70. TextIOWrapper is inserted.
  71. handle: The file handle to be used.
  72. created_handles: All file handles that are created by get_handle
  73. is_wrapped: Whether a TextIOWrapper needs to be detached.
  74. """
  75. handle: Buffer
  76. compression: CompressionDict
  77. created_handles: list[Buffer] = dataclasses.field(default_factory=list)
  78. is_wrapped: bool = False
  79. is_mmap: bool = False
  80. def close(self) -> None:
  81. """
  82. Close all created buffers.
  83. Note: If a TextIOWrapper was inserted, it is flushed and detached to
  84. avoid closing the potentially user-created buffer.
  85. """
  86. if self.is_wrapped:
  87. assert isinstance(self.handle, TextIOWrapper)
  88. self.handle.flush()
  89. self.handle.detach()
  90. self.created_handles.remove(self.handle)
  91. try:
  92. for handle in self.created_handles:
  93. handle.close()
  94. except (OSError, ValueError):
  95. pass
  96. self.created_handles = []
  97. self.is_wrapped = False
  98. def __enter__(self) -> IOHandles:
  99. return self
  100. def __exit__(self, *args: Any) -> None:
  101. self.close()
  102. def is_url(url) -> bool:
  103. """
  104. Check to see if a URL has a valid protocol.
  105. Parameters
  106. ----------
  107. url : str or unicode
  108. Returns
  109. -------
  110. isurl : bool
  111. If `url` has a valid protocol return True otherwise False.
  112. """
  113. if not isinstance(url, str):
  114. return False
  115. return parse_url(url).scheme in _VALID_URLS
  116. def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]:
  117. """
  118. Return the argument with an initial component of ~ or ~user
  119. replaced by that user's home directory.
  120. Parameters
  121. ----------
  122. filepath_or_buffer : object to be converted if possible
  123. Returns
  124. -------
  125. expanded_filepath_or_buffer : an expanded filepath or the
  126. input if not expandable
  127. """
  128. if isinstance(filepath_or_buffer, str):
  129. return os.path.expanduser(filepath_or_buffer)
  130. return filepath_or_buffer
  131. def validate_header_arg(header) -> None:
  132. if isinstance(header, bool):
  133. raise TypeError(
  134. "Passing a bool to header is invalid. Use header=None for no header or "
  135. "header=int or list-like of ints to specify "
  136. "the row(s) making up the column names"
  137. )
  138. def stringify_path(
  139. filepath_or_buffer: FilePathOrBuffer[AnyStr],
  140. convert_file_like: bool = False,
  141. ) -> FileOrBuffer[AnyStr]:
  142. """
  143. Attempt to convert a path-like object to a string.
  144. Parameters
  145. ----------
  146. filepath_or_buffer : object to be converted
  147. Returns
  148. -------
  149. str_filepath_or_buffer : maybe a string version of the object
  150. Notes
  151. -----
  152. Objects supporting the fspath protocol (python 3.6+) are coerced
  153. according to its __fspath__ method.
  154. Any other object is passed through unchanged, which includes bytes,
  155. strings, buffers, or anything else that's not even path-like.
  156. """
  157. if not convert_file_like and is_file_like(filepath_or_buffer):
  158. # GH 38125: some fsspec objects implement os.PathLike but have already opened a
  159. # file. This prevents opening the file a second time. infer_compression calls
  160. # this function with convert_file_like=True to infer the compression.
  161. return cast(FileOrBuffer[AnyStr], filepath_or_buffer)
  162. if isinstance(filepath_or_buffer, os.PathLike):
  163. filepath_or_buffer = filepath_or_buffer.__fspath__()
  164. return _expand_user(filepath_or_buffer)
  165. def urlopen(*args, **kwargs):
  166. """
  167. Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
  168. the stdlib.
  169. """
  170. import urllib.request
  171. return urllib.request.urlopen(*args, **kwargs)
  172. def is_fsspec_url(url: FilePathOrBuffer) -> bool:
  173. """
  174. Returns true if the given URL looks like
  175. something fsspec can handle
  176. """
  177. return (
  178. isinstance(url, str)
  179. and "://" in url
  180. and not url.startswith(("http://", "https://"))
  181. )
def _get_filepath_or_buffer(
    filepath_or_buffer: FilePathOrBuffer,
    encoding: str = "utf-8",
    compression: CompressionOptions = None,
    mode: str = "r",
    storage_options: StorageOptions = None,
) -> IOArgs:
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
        or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values

        .. versionadded:: 1.2.0

    .. versionchanged:: 1.2.0

      Returns the dataclass IOArgs.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    # handle compression dict
    compression_method, compression = get_compression_method(compression)
    compression_method = infer_compression(filepath_or_buffer, compression_method)

    # GH21227 internal compression is not used for non-binary handles.
    if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode:
        warnings.warn(
            "compression has no effect when passing a non-binary object as input.",
            RuntimeWarning,
            stacklevel=2,
        )
        compression_method = None

    compression = dict(compression, method=compression_method)

    # uniform encoding names
    if encoding is not None:
        encoding = encoding.replace("_", "-").lower()

    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
    # print a warning when writing such files
    if (
        "w" in mode
        and compression_method in ["bz2", "xz"]
        and encoding in ["utf-16", "utf-32"]
    ):
        warnings.warn(
            f"{compression} will not write the byte order mark for {encoding}",
            UnicodeWarning,
        )

    # Use binary mode when converting path-like objects to file-like objects (fsspec)
    # except when text mode is explicitly requested. The original mode is returned if
    # fsspec is not used.
    fsspec_mode = mode
    if "t" not in fsspec_mode and "b" not in fsspec_mode:
        fsspec_mode += "b"

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged. using fsspec appears to break the ability to infer if the
        # server responded with gzipped data
        storage_options = storage_options or {}

        # waiting until now for importing to match intended lazy logic of
        # urlopen function defined elsewhere in this module
        import urllib.request

        # assuming storage_options is to be interpreted as headers
        req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
        with urlopen(req_info) as req:
            content_encoding = req.headers.get("Content-Encoding", None)
            if content_encoding == "gzip":
                # Override compression based on Content-Encoding header
                compression = {"method": "gzip"}
            # the response is read eagerly into memory so the connection can
            # be closed before returning
            reader = BytesIO(req.read())
        return IOArgs(
            filepath_or_buffer=reader,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in Hadoop,
        # but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
        if filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
        fsspec = import_optional_dependency("fsspec")

        # If botocore is installed we fallback to reading with anon=True
        # to allow reads from public buckets
        err_types_to_retry_with_anon: list[Any] = []
        try:
            import_optional_dependency("botocore")
            from botocore.exceptions import (
                ClientError,
                NoCredentialsError,
            )

            err_types_to_retry_with_anon = [
                ClientError,
                NoCredentialsError,
                PermissionError,
            ]
        except ImportError:
            pass

        try:
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()
        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
        except tuple(err_types_to_retry_with_anon):
            if storage_options is None:
                storage_options = {"anon": True}
            else:
                # don't mutate user input.
                storage_options = dict(storage_options)
                storage_options["anon"] = True
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()

        return IOArgs(
            filepath_or_buffer=file_obj,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )
    elif storage_options:
        # storage_options only make sense for fsspec/URL targets (handled above)
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return IOArgs(
            filepath_or_buffer=_expand_user(filepath_or_buffer),
            encoding=encoding,
            compression=compression,
            should_close=False,
            mode=mode,
        )

    # anything left must at least be file-like (checked via is_file_like)
    if not is_file_like(filepath_or_buffer):
        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
        raise ValueError(msg)

    return IOArgs(
        filepath_or_buffer=filepath_or_buffer,
        encoding=encoding,
        compression=compression,
        should_close=False,
        mode=mode,
    )
  338. def file_path_to_url(path: str) -> str:
  339. """
  340. converts an absolute native path to a FILE URL.
  341. Parameters
  342. ----------
  343. path : a path in native format
  344. Returns
  345. -------
  346. a valid FILE URL
  347. """
  348. # lazify expensive import (~30ms)
  349. from urllib.request import pathname2url
  350. return urljoin("file:", pathname2url(path))
# compression method name -> canonical filename extension; used by
# infer_compression both to infer from a path and to validate explicit values
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
  352. def get_compression_method(
  353. compression: CompressionOptions,
  354. ) -> tuple[str | None, CompressionDict]:
  355. """
  356. Simplifies a compression argument to a compression method string and
  357. a mapping containing additional arguments.
  358. Parameters
  359. ----------
  360. compression : str or mapping
  361. If string, specifies the compression method. If mapping, value at key
  362. 'method' specifies compression method.
  363. Returns
  364. -------
  365. tuple of ({compression method}, Optional[str]
  366. {compression arguments}, Dict[str, Any])
  367. Raises
  368. ------
  369. ValueError on mapping missing 'method' key
  370. """
  371. compression_method: str | None
  372. if isinstance(compression, Mapping):
  373. compression_args = dict(compression)
  374. try:
  375. compression_method = compression_args.pop("method")
  376. except KeyError as err:
  377. raise ValueError("If mapping, compression must have key 'method'") from err
  378. else:
  379. compression_args = {}
  380. compression_method = compression
  381. return compression_method, compression_args
  382. def infer_compression(
  383. filepath_or_buffer: FilePathOrBuffer, compression: str | None
  384. ) -> str | None:
  385. """
  386. Get the compression method for filepath_or_buffer. If compression='infer',
  387. the inferred compression method is returned. Otherwise, the input
  388. compression method is returned unchanged, unless it's invalid, in which
  389. case an error is raised.
  390. Parameters
  391. ----------
  392. filepath_or_buffer : str or file handle
  393. File path or object.
  394. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
  395. If 'infer' and `filepath_or_buffer` is path-like, then detect
  396. compression from the following extensions: '.gz', '.bz2', '.zip',
  397. or '.xz' (otherwise no compression).
  398. Returns
  399. -------
  400. string or None
  401. Raises
  402. ------
  403. ValueError on invalid compression specified.
  404. """
  405. if compression is None:
  406. return None
  407. # Infer compression
  408. if compression == "infer":
  409. # Convert all path types (e.g. pathlib.Path) to strings
  410. filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True)
  411. if not isinstance(filepath_or_buffer, str):
  412. # Cannot infer compression of a buffer, assume no compression
  413. return None
  414. # Infer compression from the filename/URL extension
  415. for compression, extension in _compression_to_extension.items():
  416. if filepath_or_buffer.lower().endswith(extension):
  417. return compression
  418. return None
  419. # Compression has been specified. Check that it's valid
  420. if compression in _compression_to_extension:
  421. return compression
  422. # https://github.com/python/mypy/issues/5492
  423. # Unsupported operand types for + ("List[Optional[str]]" and "List[str]")
  424. valid = ["infer", None] + sorted(
  425. _compression_to_extension
  426. ) # type: ignore[operator]
  427. msg = (
  428. f"Unrecognized compression type: {compression}\n"
  429. f"Valid compression types are {valid}"
  430. )
  431. raise ValueError(msg)
def get_handle(
    path_or_buf: FilePathOrBuffer,
    mode: str,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0
           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0
           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : bool, default False
        See parsers._parser_params for more information.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    .. versionchanged:: 1.2.0

    Returns the dataclass IOHandles
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate errors
    if isinstance(errors, str):
        errors = errors.lower()
    if errors not in (
        None,
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
        "namereplace",
        "surrogateescape",
        "surrogatepass",
    ):
        raise ValueError(
            f"Invalid value for `encoding_errors` ({errors}). Please see "
            + "https://docs.python.org/3/library/codecs.html#error-handlers "
            + "for valid values."
        )

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: list[Buffer]

    # memory mapping needs to be the first step
    # the last argument asks the mmap wrapper to decode only when no known
    # compression will decompress the bytes later — TODO confirm intent
    handle, memory_map, handles = _maybe_memory_map(
        handle,
        memory_map,
        ioargs.encoding,
        ioargs.mode,
        errors,
        ioargs.compression["method"] not in _compression_to_extension,
    )

    is_path = isinstance(handle, str)
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    if compression:
        # compression libraries do not like an explicit text-mode
        ioargs.mode = ioargs.mode.replace("t", "")

        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                handle = gzip.GzipFile(
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # error: Argument "fileobj" to "GzipFile" has incompatible type
                    # "Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
                    # TextIOWrapper, mmap]]"; expected "Optional[IO[bytes]]"
                    fileobj=handle,  # type: ignore[arg-type]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                # Argument 1 to "BZ2File" has incompatible type "Union[str,
                # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper,
                # mmap]]"; expected "Union[Union[str, bytes, _PathLike[str],
                # _PathLike[bytes]], IO[bytes]]"
                handle,  # type: ignore[arg-type]
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
            if handle.mode == "r":
                # reading: exactly one archive member is supported
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError(
                        "Multiple files found in ZIP file. "
                        f"Only one file per ZIP: {zip_names}"
                    )

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file(lzma)(handle, ioargs.mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (
            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
        )

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        # buffers opened by _get_filepath_or_buffer are closed last
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    assert not isinstance(handle, str)
    return IOHandles(
        handle=handle,
        created_handles=handles,
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )
  630. # error: Definition of "__exit__" in base class "ZipFile" is incompatible with
  631. # definition in base class "BytesIO" [misc]
  632. # error: Definition of "__enter__" in base class "ZipFile" is incompatible with
  633. # definition in base class "BytesIO" [misc]
  634. # error: Definition of "__enter__" in base class "ZipFile" is incompatible with
  635. # definition in base class "BinaryIO" [misc]
  636. # error: Definition of "__enter__" in base class "ZipFile" is incompatible with
  637. # definition in base class "IO" [misc]
  638. # error: Definition of "read" in base class "ZipFile" is incompatible with
  639. # definition in base class "BytesIO" [misc]
  640. # error: Definition of "read" in base class "ZipFile" is incompatible with
  641. # definition in base class "IO" [misc]
  642. class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc]
  643. """
  644. Wrapper for standard library class ZipFile and allow the returned file-like
  645. handle to accept byte strings via `write` method.
  646. BytesIO provides attributes of file-like object and ZipFile.writestr writes
  647. bytes strings into a member of the archive.
  648. """
  649. # GH 17778
  650. def __init__(
  651. self,
  652. file: FilePathOrBuffer,
  653. mode: str,
  654. archive_name: str | None = None,
  655. **kwargs,
  656. ):
  657. mode = mode.replace("b", "")
  658. self.archive_name = archive_name
  659. self.multiple_write_buffer: StringIO | BytesIO | None = None
  660. kwargs_zip: dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
  661. kwargs_zip.update(kwargs)
  662. # error: Argument 1 to "__init__" of "ZipFile" has incompatible type
  663. # "Union[_PathLike[str], Union[str, Union[IO[Any], RawIOBase, BufferedIOBase,
  664. # TextIOBase, TextIOWrapper, mmap]]]"; expected "Union[Union[str,
  665. # _PathLike[str]], IO[bytes]]"
  666. super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]
  667. def write(self, data):
  668. # buffer multiple write calls, write on flush
  669. if self.multiple_write_buffer is None:
  670. self.multiple_write_buffer = (
  671. BytesIO() if isinstance(data, bytes) else StringIO()
  672. )
  673. self.multiple_write_buffer.write(data)
  674. def flush(self) -> None:
  675. # write to actual handle and close write buffer
  676. if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
  677. return
  678. # ZipFile needs a non-empty string
  679. archive_name = self.archive_name or self.filename or "zip"
  680. with self.multiple_write_buffer:
  681. super().writestr(archive_name, self.multiple_write_buffer.getvalue())
  682. def close(self):
  683. self.flush()
  684. super().close()
  685. @property
  686. def closed(self):
  687. return self.fp is None
  688. class _MMapWrapper(abc.Iterator):
  689. """
  690. Wrapper for the Python's mmap class so that it can be properly read in
  691. by Python's csv.reader class.
  692. Parameters
  693. ----------
  694. f : file object
  695. File object to be mapped onto memory. Must support the 'fileno'
  696. method or have an equivalent attribute
  697. """
  698. def __init__(
  699. self,
  700. f: IO,
  701. encoding: str = "utf-8",
  702. errors: str = "strict",
  703. decode: bool = True,
  704. ):
  705. self.encoding = encoding
  706. self.errors = errors
  707. self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
  708. self.decode = decode
  709. self.attributes = {}
  710. for attribute in ("seekable", "readable", "writeable"):
  711. if not hasattr(f, attribute):
  712. continue
  713. self.attributes[attribute] = getattr(f, attribute)()
  714. self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
  715. def __getattr__(self, name: str):
  716. if name in self.attributes:
  717. return lambda: self.attributes[name]
  718. return getattr(self.mmap, name)
  719. def __iter__(self) -> _MMapWrapper:
  720. return self
  721. def read(self, size: int = -1) -> str | bytes:
  722. # CSV c-engine uses read instead of iterating
  723. content: bytes = self.mmap.read(size)
  724. if self.decode:
  725. # memory mapping is applied before compression. Encoding should
  726. # be applied to the de-compressed data.
  727. return content.decode(self.encoding, errors=self.errors)
  728. return content
  729. def __next__(self) -> str:
  730. newbytes = self.mmap.readline()
  731. # readline returns bytes, not str, but Python's CSV reader
  732. # expects str, so convert the output to str before continuing
  733. newline = self.decoder.decode(newbytes)
  734. # mmap doesn't raise if reading past the allocated
  735. # data but instead returns an empty string, so raise
  736. # if that is returned
  737. if newline == "":
  738. raise StopIteration
  739. # IncrementalDecoder seems to push newline to the next line
  740. return newline.lstrip("\n")
  741. def _maybe_memory_map(
  742. handle: FileOrBuffer,
  743. memory_map: bool,
  744. encoding: str,
  745. mode: str,
  746. errors: str | None,
  747. decode: bool,
  748. ) -> tuple[FileOrBuffer, bool, list[Buffer]]:
  749. """Try to memory map file/buffer."""
  750. handles: list[Buffer] = []
  751. memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
  752. if not memory_map:
  753. return handle, memory_map, handles
  754. # need to open the file first
  755. if isinstance(handle, str):
  756. if encoding and "b" not in mode:
  757. # Encoding
  758. handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
  759. else:
  760. # Binary mode
  761. handle = open(handle, mode)
  762. handles.append(handle)
  763. try:
  764. # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
  765. # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
  766. wrapped = cast(
  767. mmap.mmap,
  768. _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
  769. )
  770. handle.close()
  771. handles.remove(handle)
  772. handles.append(wrapped)
  773. handle = wrapped
  774. except Exception:
  775. # we catch any errors that may have occurred
  776. # because that is consistent with the lower-level
  777. # functionality of the C engine (pd.read_csv), so
  778. # leave the file handler as is then
  779. memory_map = False
  780. return handle, memory_map, handles
  781. def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool:
  782. """Test whether file exists."""
  783. exists = False
  784. filepath_or_buffer = stringify_path(filepath_or_buffer)
  785. if not isinstance(filepath_or_buffer, str):
  786. return exists
  787. try:
  788. exists = os.path.exists(filepath_or_buffer)
  789. # gh-5874: if the filepath is too long will raise here
  790. except (TypeError, ValueError):
  791. pass
  792. return exists
  793. def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool:
  794. """Whether the handle is opened in binary mode"""
  795. # specified by user
  796. if "t" in mode or "b" in mode:
  797. return "b" in mode
  798. # classes that expect string but have 'b' in mode
  799. text_classes = (codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter)
  800. if issubclass(type(handle), text_classes):
  801. return False
  802. # classes that expect bytes
  803. binary_classes = (BufferedIOBase, RawIOBase)
  804. return isinstance(handle, binary_classes) or "b" in getattr(handle, "mode", mode)