- """
- Module contains tools for processing files into DataFrames or other objects
- """
- from __future__ import annotations
- from collections import abc
- import csv
- import sys
- from textwrap import fill
- from typing import Any
- import warnings
- import numpy as np
- import pandas._libs.lib as lib
- from pandas._libs.parsers import STR_NA_VALUES
- from pandas._typing import (
- ArrayLike,
- DtypeArg,
- FilePathOrBuffer,
- StorageOptions,
- )
- from pandas.errors import (
- AbstractMethodError,
- ParserWarning,
- )
- from pandas.util._decorators import (
- Appender,
- deprecate_nonkeyword_arguments,
- )
- from pandas.util._validators import validate_bool_kwarg
- from pandas.core.dtypes.common import (
- is_file_like,
- is_float,
- is_integer,
- is_list_like,
- )
- from pandas.core import generic
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.api import RangeIndex
- from pandas.io.common import validate_header_arg
- from pandas.io.parsers.base_parser import (
- ParserBase,
- is_index_col,
- parser_defaults,
- )
- from pandas.io.parsers.c_parser_wrapper import CParserWrapper
- from pandas.io.parsers.python_parser import (
- FixedWidthFieldParser,
- PythonParser,
- )


_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can: in that case the Python
    engine is used and the separator is detected by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file; if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ... 'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.

    .. versionchanged:: 1.3.0

       ``encoding_errors`` is a new argument. ``encoding`` no longer has an
       influence on how encoding errors are handled.
encoding_errors : str, optional, default "strict"
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, default ``None``
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.

    .. deprecated:: 1.3.0
       Use the ``on_bad_lines`` parameter instead to specify behavior upon
       encountering a bad line.
warn_bad_lines : bool, default ``None``
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.

    .. deprecated:: 1.3.0
       Use the ``on_bad_lines`` parameter instead to specify behavior upon
       encountering a bad line.
on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many
    fields). Allowed values are:

    - 'error', raise an Exception when a bad line is encountered.
    - 'warn', raise a warning when a bad line is encountered and skip that line.
    - 'skip', skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless;
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": None,
    "warn_bad_lines": None,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}

_deprecated_defaults: dict[str, Any] = {
    "error_bad_lines": None,
    "warn_bad_lines": None,
}
_deprecated_args: set[str] = {"error_bad_lines", "warn_bad_lines"}


def validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = f"'{name:s}' must be an integer >={min_val:d}"

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
        elif not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val
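

# A quick sketch of how ``validate_integer`` behaves (illustrative values,
# not part of the original module): floats are accepted only when they can
# be cast to int without loss, and anything below ``min_val`` is rejected.
#
#     validate_integer("chunksize", 5.0, min_val=1)   # returns 5
#     validate_integer("chunksize", 5.5, min_val=1)   # raises ValueError
#     validate_integer("chunksize", 0, min_val=1)     # raises ValueError
#     validate_integer("chunksize", None, min_val=1)  # returns None (unset)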


def _validate_names(names):
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")
        if not (
            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
        ):
            raise ValueError("Names should be an ordered collection.")


def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    """Generic reader of line files."""
    if kwds.get("date_parser", None) is not None:
        if isinstance(kwds["parse_dates"], bool):
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    with parser:
        return parser.read(nrows)
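

# Note (illustrative, file name hypothetical): when ``chunksize`` or
# ``iterator`` is set, ``_read`` hands back the reader itself instead of
# eagerly materializing a DataFrame:
#
#     _read("data.csv", {"chunksize": 1000})  # -> TextFileReader
#     _read("data.csv", {})                   # -> DataFrame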


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["filepath_or_buffer"], stacklevel=3
)
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors: str | None = "strict",
    dialect=None,
    # Error Handling
    error_bad_lines=None,
    warn_bad_lines=None,
    # TODO (2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
    storage_options: StorageOptions = None,
):
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": ","},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
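

# Hedged usage sketch (file name hypothetical): since pandas 1.2 the reader
# returned for chunked reads is a context manager, so the underlying handle
# is closed when the block exits.
#
#     with read_csv("data.csv", chunksize=1000) as reader:
#         for chunk in reader:
#             ...  # each ``chunk`` is a DataFrame of up to 1000 rows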


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["filepath_or_buffer"], stacklevel=3
)
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=generic._shared_docs["storage_options"],
    )
)
def read_table(
    filepath_or_buffer: FilePathOrBuffer,
    sep=lib.no_default,
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal: str = ".",
    lineterminator=None,
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=None,
    warn_bad_lines=None,
    # TODO (2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    encoding_errors: str | None = "strict",
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
):
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": "\t"},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to) ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
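

# Illustrative equivalence (file name hypothetical): contiguous ``widths``
# are expanded into half-open ``colspecs`` exactly as the loop above does,
# so these two calls read the same columns:
#
#     read_fwf("fixed.txt", widths=[3, 5])
#     read_fwf("fixed.txt", colspecs=[(0, 3), (3, 8)])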


class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(self, f, engine=None, **kwds):
        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._engine = self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"):
            # The C engine doesn't need the file-like to have the "__next__"
            # attribute. However, the Python engine explicitly calls
            # "__next__(...)" when iterating through such an object, meaning it
            # needs to have that attribute.
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(self, options, engine):
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    "removed in a future version.\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=7)
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col cannot be True")

        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine="c"):
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        # error: Too many arguments for "ParserBase"
        return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = validate_integer("nrows", nrows)
        index, columns, col_dict = self._engine.read(nrows)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(next(iter(col_dict.values())))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
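

# Minimal sketch of driving the reader by hand (illustrative; ``read_csv``
# normally constructs it for you). ``TextFileReader`` implements the iterator
# protocol via ``__next__``/``get_chunk``, so both styles below work:
#
#     with TextFileReader("data.csv", engine="python", chunksize=2) as reader:
#         first = reader.get_chunk()               # DataFrame of up to 2 rows
#         remaining = [chunk for chunk in reader]  # the rest, 2 rows at a time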


def TextParser(*args, **kwds):
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded.
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        Returns Series if only one column.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format based on the first datetime string. If the format
        can be inferred, there often will be a large parsing speed-up.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)
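

# Illustrative use on in-memory records (not part of the original module).
# The first row is treated as the header because ``header`` defaults to 0:
#
#     rows = [["a", "b"], ["1", "2"], ["3", "4"]]
#     with TextParser(rows) as parser:
#         df = parser.read()  # DataFrame with columns "a" and "b"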


def _clean_na_values(na_values, keep_default_na=True):
    na_fvalues: set | dict

    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
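

# Example (illustrative) with the defaults suppressed, showing how the
# string and float variants are split apart:
#
#     na_values, na_fvalues = _clean_na_values(
#         ["missing", "-1"], keep_default_na=False
#     )
#     # na_values  -> {"missing", "-1", "-1.0", -1}  (via _stringify_na_values)
#     # na_fvalues -> {-1.0}                         (via _floatify_na_values)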


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """Return stringified and numeric variants of these values."""
    result: list[int | str | float] = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # For an integral value like 999, also register "999.0" and 999.
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            result.append(int(x))
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)
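

# Worked example (illustrative):
#
#     _stringify_na_values(["999"])
#     # -> {"999", "999.0", 999}
#     # The int 999 and float 999.0 hash equal, so the set keeps a single
#     # numeric entry alongside both string spellings.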


def _refine_defaults_read(
    dialect: str | csv.Dialect,
    delimiter: str | object,
    delim_whitespace: bool,
    engine: str,
    sep: str | object,
    error_bad_lines: bool | None,
    warn_bad_lines: bool | None,
    on_bad_lines: str | None,
    names: ArrayLike | None | object,
    prefix: str | None | object,
    defaults: dict[str, Any],
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    error_bad_lines : bool or None
        Whether to error on a bad line or not.
    warn_bad_lines : bool or None
        Whether to warn on a bad line or not.
    on_bad_lines : str or None
        An option for handling bad lines or a sentinel value (None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    prefix : str, optional
        Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
        If ``on_bad_lines`` is specified (not ``None``) and ``error_bad_lines``/
        ``warn_bad_lines`` is True.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    if names is not lib.no_default and prefix is not lib.no_default:
        raise ValueError("Specified names and prefix; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names
    kwds["prefix"] = None if prefix is lib.no_default else prefix

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines
    # aren't specified at the same time. If so, raise. Otherwise,
    # alias on_bad_lines to "error" if error/warn_bad_lines not set
    # and on_bad_lines is not set. on_bad_lines is defaulted to None
    # so we can tell if it is set (this is why this hack exists).
    if on_bad_lines is not None:
        if error_bad_lines is not None or warn_bad_lines is not None:
            raise ValueError(
                "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
                "Please only set on_bad_lines."
            )
        if on_bad_lines == "error":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
        elif on_bad_lines == "warn":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
        elif on_bad_lines == "skip":
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
        else:
            raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
    else:
        if error_bad_lines is not None:
            # Must check is_bool, because other stuff (e.g. non-empty lists)
            # evaluates to True
            validate_bool_kwarg(error_bad_lines, "error_bad_lines")
            if error_bad_lines:
                kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
            else:
                if warn_bad_lines is not None:
                    # This is the case where error_bad_lines is False.
                    # We can only warn/skip if error_bad_lines is False;
                    # None doesn't work because of backwards-compatibility reasons.
                    validate_bool_kwarg(warn_bad_lines, "warn_bad_lines")
                    if warn_bad_lines:
                        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
                    else:
                        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
                else:
                    # Backwards compat: when only error_bad_lines = False, we warn
                    kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
        else:
            # Everything None -> Error
            kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR

    return kwds
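

# Hedged sketch of the precedence implemented above (arguments are positional,
# matching the signature; values illustrative):
#
#     kwds = _refine_defaults_read(
#         None, None, False, None, lib.no_default,  # dialect .. sep
#         None, None, "skip",                       # *_bad_lines, on_bad_lines
#         lib.no_default, lib.no_default,           # names, prefix
#         {"delimiter": ","},
#     )
#     # kwds["on_bad_lines"] is ParserBase.BadLineHandleMethod.SKIP
#     # kwds["delimiter"] is "," (the default separator was assigned)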


def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided != parser_default and provided != dialect_val:
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to ``_refine_defaults_read`` for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
        kwds[param] = dialect_val
    return kwds
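

# Illustrative conflict (not part of the original module): the standard
# "excel" dialect defines delimiter ",", so an explicit ";" is overridden
# and a ParserWarning is emitted.
#
#     dialect = csv.get_dialect("excel")
#     merged = _merge_with_dialect_properties(dialect, {"delimiter": ";"})
#     # merged["delimiter"] == ","  (dialect wins; a ParserWarning is issued)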


def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")