  1. """
  2. Module contains tools for processing files into DataFrames or other objects
  3. """
  4. from __future__ import annotations
  5. from collections import abc
  6. import csv
  7. import sys
  8. from textwrap import fill
  9. from typing import Any
  10. import warnings
  11. import numpy as np
  12. import pandas._libs.lib as lib
  13. from pandas._libs.parsers import STR_NA_VALUES
  14. from pandas._typing import (
  15. ArrayLike,
  16. DtypeArg,
  17. FilePathOrBuffer,
  18. StorageOptions,
  19. )
  20. from pandas.errors import (
  21. AbstractMethodError,
  22. ParserWarning,
  23. )
  24. from pandas.util._decorators import (
  25. Appender,
  26. deprecate_nonkeyword_arguments,
  27. )
  28. from pandas.util._validators import validate_bool_kwarg
  29. from pandas.core.dtypes.common import (
  30. is_file_like,
  31. is_float,
  32. is_integer,
  33. is_list_like,
  34. )
  35. from pandas.core import generic
  36. from pandas.core.frame import DataFrame
  37. from pandas.core.indexes.api import RangeIndex
  38. from pandas.io.common import validate_header_arg
  39. from pandas.io.parsers.base_parser import (
  40. ParserBase,
  41. is_index_col,
  42. parser_defaults,
  43. )
  44. from pandas.io.parsers.c_parser_wrapper import CParserWrapper
  45. from pandas.io.parsers.python_parser import (
  46. FixedWidthFieldParser,
  47. PythonParser,
  48. )

_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.

    .. versionchanged:: 1.3.0

       ``encoding_errors`` is a new argument. ``encoding`` no longer has an
       influence on how encoding errors are handled.
encoding_errors : str, optional, default "strict"
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, default ``None``
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify
       behavior upon encountering a bad line.
warn_bad_lines : bool, default ``None``
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify
       behavior upon encountering a bad line.
on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many
    fields). Allowed values are:

    - 'error', raise an Exception when a bad line is encountered.
    - 'warn', raise a warning when a bad line is encountered and skip that line.
    - 'skip', skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)

_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": None,
    "warn_bad_lines": None,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}

_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None}
_deprecated_args: set[str] = {"error_bad_lines", "warn_bad_lines"}

def validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = f"'{name:s}' must be an integer >={min_val:d}"

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
        elif not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val
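
# Illustrative doctest-style sketch (not part of the original module): how
# validate_integer treats integral floats vs. out-of-range values, per the
# implementation above.
#
# >>> validate_integer("chunksize", 3.0, min_val=1)  # integral float is cast
# 3
# >>> validate_integer("chunksize", 0, min_val=1)    # below min_val -> error
# Traceback (most recent call last):
#     ...
# ValueError: 'chunksize' must be an integer >=1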

def _validate_names(names):
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")
        if not (
            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
        ):
            raise ValueError("Names should be an ordered collection.")
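
# Illustrative sketch (not part of the original module): _validate_names
# rejects duplicates and unordered collections such as sets.
#
# >>> _validate_names(["a", "b", "a"])
# Traceback (most recent call last):
#     ...
# ValueError: Duplicate names are not allowed.
# >>> _validate_names({"a", "b"})
# Traceback (most recent call last):
#     ...
# ValueError: Names should be an ordered collection.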

def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    """Generic reader of line files."""
    if kwds.get("date_parser", None) is not None:
        if isinstance(kwds["parse_dates"], bool):
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    with parser:
        return parser.read(nrows)
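
# Illustrative sketch (assumed public usage mirroring the dispatch in _read):
# when ``chunksize`` or ``iterator`` is passed, the TextFileReader itself is
# returned and is best consumed as a context manager.
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> data = "a,b\n1,2\n3,4\n5,6\n"
# >>> with pd.read_csv(StringIO(data), chunksize=2) as reader:
# ...     [len(chunk) for chunk in reader]
# [2, 1]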

@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["filepath_or_buffer"], stacklevel=3
)
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors: str | None = "strict",
    dialect=None,
    # Error Handling
    error_bad_lines=None,
    warn_bad_lines=None,
    # TODO (2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
    storage_options: StorageOptions = None,
):
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": ","},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
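
# Illustrative sketch (assumes the defaults documented in the shared docstring
# above): default NA strings such as "NA" become missing values, and
# per-column dtypes can be requested up front.
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> csv_data = "a,b\n1,x\n2,NA\n"
# >>> df = pd.read_csv(StringIO(csv_data), dtype={"a": "Int64"})
# >>> df["b"].isna().tolist()
# [False, True]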

@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["filepath_or_buffer"], stacklevel=3
)
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_table(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=None,
    warn_bad_lines=None,
    # TODO (2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    encoding_errors: str | None = "strict",
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
):
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": "\t"},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
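
# Illustrative sketch: read_table is read_csv with a tab as the default
# delimiter (see ``defaults={"delimiter": "\t"}`` above).
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> pd.read_table(StringIO("a\tb\n1\t2\n")).shape
# (1, 2)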

def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
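
# Illustrative sketch of the widths -> colspecs conversion performed above:
# widths are accumulated into contiguous half-open intervals.
#
# >>> widths = [3, 5, 2]
# >>> colspecs, col = [], 0
# >>> for w in widths:
# ...     colspecs.append((col, col + w))
# ...     col += w
# >>> colspecs
# [(0, 3), (3, 8), (8, 10)]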

class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._engine = self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"):
            # The C engine doesn't need the file-like to have the "__next__"
            # attribute. However, the Python engine explicitly calls
            # "__next__(...)" when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(self, options, engine):
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    "removed in a future version.\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=7)
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine="c"):
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        # error: Too many arguments for "ParserBase"
        return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = validate_integer("nrows", nrows)
        index, columns, col_dict = self._engine.read(nrows)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(next(iter(col_dict.values())))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
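
# Illustrative sketch (behavior per read/get_chunk above): an ``nrows`` cap on
# the reader bounds the total number of rows handed out across chunks.
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> data = "a\n1\n2\n3\n4\n"
# >>> with pd.read_csv(StringIO(data), iterator=True, nrows=3) as reader:
# ...     (len(reader.get_chunk(2)), len(reader.get_chunk(2)))
# (2, 1)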

def TextParser(*args, **kwds):
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        Returns Series if only one column.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format based on the first datetime string. If the format
        can be inferred, there often will be a large parsing speed-up.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)
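
# Illustrative sketch (assumed usage): TextParser routes already-split rows
# through the python engine, so a list of lists becomes a typed DataFrame.
#
# >>> from pandas.io.parsers import TextParser
# >>> rows = [["a", "b"], ["1", "2"], ["3", "4"]]
# >>> with TextParser(rows, header=0) as parser:
# ...     parser.read()
#    a  b
# 0  1  2
# 1  3  4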

def _clean_na_values(na_values, keep_default_na=True):
    na_fvalues: set | dict
    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
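
# Illustrative sketch (per _clean_na_values above): with keep_default_na=True,
# per-column na_values are unioned with the default NA strings.
#
# >>> na_vals, na_fvals = _clean_na_values({"col": ["foo"]})
# >>> "foo" in na_vals["col"] and "NA" in na_vals["col"]
# True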

def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result

def _stringify_na_values(na_values):
    """return a stringified and numeric version of these values"""
    result: list[int | str | float] = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # we are like 999 here
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            result.append(int(x))
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)
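
# Illustrative sketch: _stringify_na_values fans each value out into its
# string, int and float spellings so that e.g. 999, "999" and "999.0" all
# match during parsing.
#
# >>> {"999", 999, "999.0"} <= _stringify_na_values([999])
# True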

def _refine_defaults_read(
    dialect: str | csv.Dialect,
    delimiter: str | object,
    delim_whitespace: bool,
    engine: str,
    sep: str | object,
    error_bad_lines: bool | None,
    warn_bad_lines: bool | None,
    on_bad_lines: str | None,
    names: ArrayLike | None | object,
    prefix: str | None | object,
    defaults: dict[str, Any],
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    error_bad_lines : bool or None
        Whether to error on a bad line or not.
    warn_bad_lines : bool or None
        Whether to warn on a bad line or not.
    on_bad_lines : str or None
        An option for handling bad lines or a sentinel value (None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    prefix : str, optional
        Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
        If on_bad_lines is specified (not ``None``) and ``error_bad_lines``/
        ``warn_bad_lines`` is True.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    if names is not lib.no_default and prefix is not lib.no_default:
        raise ValueError("Specified names and prefix; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names
    kwds["prefix"] = None if prefix is lib.no_default else prefix

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines
    # aren't specified at the same time. If so, raise. Otherwise,
    # alias on_bad_lines to "error" if error/warn_bad_lines not set
    # and on_bad_lines is not set. on_bad_lines is defaulted to None
    # so we can tell if it is set (this is why this hack exists).
    if on_bad_lines is not None:
        if error_bad_lines is not None or warn_bad_lines is not None:
            raise ValueError(
                "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
                "Please only set on_bad_lines."
            )
        if on_bad_lines == "error":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
        elif on_bad_lines == "warn":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
        elif on_bad_lines == "skip":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
        else:
            raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
    else:
        if error_bad_lines is not None:
            # Must check is_bool, because other stuff (e.g. non-empty lists)
            # evaluates to True
            validate_bool_kwarg(error_bad_lines, "error_bad_lines")
            if error_bad_lines:
                kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
            else:
                if warn_bad_lines is not None:
                    # This is the case where error_bad_lines is False
                    # We can only warn/skip if error_bad_lines is False
                    # None doesn't work because backwards-compatibility reasons
                    validate_bool_kwarg(warn_bad_lines, "warn_bad_lines")
                    if warn_bad_lines:
                        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
                    else:
                        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
                else:
                    # Backwards compat, when only error_bad_lines = false, we warn
                    kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
        else:
            # Everything None -> Error
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR

    return kwds
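
# Illustrative sketch (mirroring the validation above, exercised via read_csv):
# sep and delimiter are aliases and may not both be given.
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> pd.read_csv(StringIO("a,b\n1,2\n"), sep=",", delimiter=";")
# Traceback (most recent call last):
#     ...
# ValueError: Specified a sep and a delimiter; you can only specify one.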

def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect

MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)

def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")

def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided != parser_default and provided != dialect_val:
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer "_read_" function for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
        kwds[param] = dialect_val
    return kwds
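
# Illustrative sketch (assumes the stdlib "excel" dialect): dialect attributes
# are copied into the kwargs, overriding non-default conflicting values with
# a ParserWarning.
#
# >>> import csv
# >>> merged = _merge_with_dialect_properties(csv.get_dialect("excel"), {})
# >>> merged["delimiter"]
# ','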

def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")
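
# Illustrative sketch (checked in TextFileReader.__init__ above): skipfooter
# cannot be combined with row-limiting or iterating options.
#
# >>> import pandas as pd
# >>> from io import StringIO
# >>> pd.read_csv(StringIO("a\n1\n2\n"), skipfooter=1, nrows=1)
# Traceback (most recent call last):
#     ...
# ValueError: 'skipfooter' not supported with 'nrows'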