- import os
- import re
- import functools
- import itertools
- import warnings
- import weakref
- import contextlib
- import operator
- from operator import itemgetter, index as opindex, methodcaller
- from collections.abc import Mapping
- import numpy as np
- from . import format
- from ._datasource import DataSource
- from numpy.core import overrides
- from numpy.core.multiarray import packbits, unpackbits
- from numpy.core._multiarray_umath import _load_from_filelike
- from numpy.core.overrides import set_array_function_like_doc, set_module
- from ._iotools import (
- LineSplitter, NameValidator, StringConverter, ConverterError,
- ConverterLockError, ConversionWarning, _is_string_like,
- has_nested_fields, flatten_dtype, easy_dtype, _decode_line
- )
- from numpy.compat import (
- asbytes, asstr, asunicode, os_fspath, os_PathLike,
- pickle
- )
- __all__ = [
- 'savetxt', 'loadtxt', 'genfromtxt',
- 'recfromtxt', 'recfromcsv', 'load', 'save', 'savez',
- 'savez_compressed', 'packbits', 'unpackbits', 'fromregex', 'DataSource'
- ]
- array_function_dispatch = functools.partial(
- overrides.array_function_dispatch, module='numpy')
- class BagObj:
- """
- BagObj(obj)
- Convert attribute look-ups to getitems on the object passed in.
- Parameters
- ----------
- obj : class instance
- Object on which attribute look-up is performed.
- Examples
- --------
- >>> from numpy.lib.npyio import BagObj as BO
- >>> class BagDemo:
- ... def __getitem__(self, key): # An instance of BagObj(BagDemo)
- ... # will call this method when any
- ... # attribute look-up is required
- ... result = "Doesn't matter what you want, "
- ... return result + "you're gonna get this"
- ...
- >>> demo_obj = BagDemo()
- >>> bagobj = BO(demo_obj)
- >>> bagobj.hello_there
- "Doesn't matter what you want, you're gonna get this"
- >>> bagobj.I_can_be_anything
- "Doesn't matter what you want, you're gonna get this"
- """
- def __init__(self, obj):
- # Use weakref to make NpzFile objects collectable by refcount
- self._obj = weakref.proxy(obj)
- def __getattribute__(self, key):
- try:
- return object.__getattribute__(self, '_obj')[key]
- except KeyError:
- raise AttributeError(key) from None
- def __dir__(self):
- """
- Enables dir(bagobj) to list the files in an NpzFile.
- This also enables tab-completion in an interpreter or IPython.
- """
- return list(object.__getattribute__(self, '_obj').keys())
- def zipfile_factory(file, *args, **kwargs):
- """
- Create a ZipFile.
- Allows for Zip64, and the `file` argument can accept file, str, or
- pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile
- constructor.
- """
- if not hasattr(file, 'read'):
- file = os_fspath(file)
- import zipfile
- kwargs['allowZip64'] = True
- return zipfile.ZipFile(file, *args, **kwargs)
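- # Illustrative sketch (hypothetical helper, not part of the module's API):
- # a minimal round-trip through `zipfile_factory`, using an in-memory buffer
- # in place of a real archive. Note the helper forces ``allowZip64=True``.
- def _example_zipfile_factory():
-     import io
-     buf = io.BytesIO()
-     with zipfile_factory(buf, mode="w") as zf:
-         zf.writestr("x.npy", b"placeholder payload")
-     buf.seek(0)
-     with zipfile_factory(buf) as zf:
-         return zf.namelist()  # ['x.npy']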
- class NpzFile(Mapping):
- """
- NpzFile(fid)
- A dictionary-like object with lazy-loading of files in the zipped
- archive provided on construction.
- `NpzFile` is used to load files in the NumPy ``.npz`` data archive
- format. It assumes that files in the archive have a ``.npy`` extension;
- other files are ignored.
- The arrays and file strings are lazily loaded on either
- getitem access using ``obj['key']`` or attribute lookup using
- ``obj.f.key``. A list of all files (without ``.npy`` extensions) can
- be obtained with ``obj.files`` and the ZipFile object itself using
- ``obj.zip``.
- Attributes
- ----------
- files : list of str
- List of all files in the archive with a ``.npy`` extension.
- zip : ZipFile instance
- The ZipFile object initialized with the zipped archive.
- f : BagObj instance
- An object on which attribute look-up can be performed as an alternative
- to getitem access on the `NpzFile` instance itself.
- allow_pickle : bool, optional
- Allow loading pickled data. Default: False
- .. versionchanged:: 1.16.3
- Made default False in response to CVE-2019-6446.
- pickle_kwargs : dict, optional
- Additional keyword arguments to pass on to pickle.load.
- These are only useful when loading object arrays saved on
- Python 2 when using Python 3.
- max_header_size : int, optional
- Maximum allowed size of the header. Large headers may not be safe
- to load securely and thus require explicitly passing a larger value.
- See :py:func:`ast.literal_eval()` for details.
- This option is ignored when `allow_pickle` is passed. In that case
- the file is by definition trusted and the limit is unnecessary.
- Parameters
- ----------
- fid : file or str
- The zipped archive to open. This is either a file-like object
- or a string containing the path to the archive.
- own_fid : bool, optional
- Whether NpzFile should close the file handle.
- Requires that `fid` is a file-like object.
- Examples
- --------
- >>> from tempfile import TemporaryFile
- >>> outfile = TemporaryFile()
- >>> x = np.arange(10)
- >>> y = np.sin(x)
- >>> np.savez(outfile, x=x, y=y)
- >>> _ = outfile.seek(0)
- >>> npz = np.load(outfile)
- >>> isinstance(npz, np.lib.npyio.NpzFile)
- True
- >>> npz
- NpzFile 'object' with keys: x, y
- >>> sorted(npz.files)
- ['x', 'y']
- >>> npz['x'] # getitem access
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- >>> npz.f.x # attribute lookup
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- """
- # Make __exit__ safe if zipfile_factory raises an exception
- zip = None
- fid = None
- _MAX_REPR_ARRAY_COUNT = 5
- def __init__(self, fid, own_fid=False, allow_pickle=False,
- pickle_kwargs=None, *,
- max_header_size=format._MAX_HEADER_SIZE):
- # The zipfile import is postponed (it happens inside zipfile_factory)
- # since zipfile depends on gzip, an optional component of the so-called
- # standard library.
- _zip = zipfile_factory(fid)
- self._files = _zip.namelist()
- self.files = []
- self.allow_pickle = allow_pickle
- self.max_header_size = max_header_size
- self.pickle_kwargs = pickle_kwargs
- for x in self._files:
- if x.endswith('.npy'):
- self.files.append(x[:-4])
- else:
- self.files.append(x)
- self.zip = _zip
- self.f = BagObj(self)
- if own_fid:
- self.fid = fid
- def __enter__(self):
- return self
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- def close(self):
- """
- Close the file.
- """
- if self.zip is not None:
- self.zip.close()
- self.zip = None
- if self.fid is not None:
- self.fid.close()
- self.fid = None
- self.f = None # break reference cycle
- def __del__(self):
- self.close()
- # Implement the Mapping ABC
- def __iter__(self):
- return iter(self.files)
- def __len__(self):
- return len(self.files)
- def __getitem__(self, key):
- # FIXME: This seems like it will copy strings around
- # more than is strictly necessary. The zipfile
- # will read the string and then
- # the format.read_array will copy the string
- # to another place in memory.
- # It would be better if the zipfile could read
- # (or at least uncompress) the data
- # directly into the array memory.
- member = False
- if key in self._files:
- member = True
- elif key in self.files:
- member = True
- key += '.npy'
- if member:
- bytes = self.zip.open(key)
- magic = bytes.read(len(format.MAGIC_PREFIX))
- bytes.close()
- if magic == format.MAGIC_PREFIX:
- bytes = self.zip.open(key)
- return format.read_array(bytes,
- allow_pickle=self.allow_pickle,
- pickle_kwargs=self.pickle_kwargs,
- max_header_size=self.max_header_size)
- else:
- return self.zip.read(key)
- else:
- raise KeyError(f"{key} is not a file in the archive")
- def __contains__(self, key):
- return (key in self._files or key in self.files)
- def __repr__(self):
- # Get filename or default to `object`
- if isinstance(self.fid, str):
- filename = self.fid
- else:
- filename = getattr(self.fid, "name", "object")
- # Get the name of arrays
- array_names = ', '.join(self.files[:self._MAX_REPR_ARRAY_COUNT])
- if len(self.files) > self._MAX_REPR_ARRAY_COUNT:
- array_names += "..."
- return f"NpzFile {filename!r} with keys: {array_names}"
- @set_module('numpy')
- def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
- encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE):
- """
- Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
- .. warning:: Loading files that contain object arrays uses the ``pickle``
- module, which is not secure against erroneous or maliciously
- constructed data. Consider passing ``allow_pickle=False`` to
- load data that is known not to contain object arrays for the
- safer handling of untrusted sources.
- Parameters
- ----------
- file : file-like object, string, or pathlib.Path
- The file to read. File-like objects must support the
- ``seek()`` and ``read()`` methods and must always
- be opened in binary mode. Pickled files require that the
- file-like object support the ``readline()`` method as well.
- mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
- If not None, then memory-map the file, using the given mode (see
- `numpy.memmap` for a detailed description of the modes). A
- memory-mapped array is kept on disk. However, it can be accessed
- and sliced like any ndarray. Memory mapping is especially useful
- for accessing small fragments of large files without reading the
- entire file into memory.
- allow_pickle : bool, optional
- Allow loading pickled object arrays stored in npy files. Reasons for
- disallowing pickles include security, as loading pickled data can
- execute arbitrary code. If pickles are disallowed, loading object
- arrays will fail. Default: False
- .. versionchanged:: 1.16.3
- Made default False in response to CVE-2019-6446.
- fix_imports : bool, optional
- Only useful when loading Python 2 generated pickled files on Python 3,
- which includes npy/npz files containing object arrays. If `fix_imports`
- is True, pickle will try to map the old Python 2 names to the new names
- used in Python 3.
- encoding : str, optional
- What encoding to use when reading Python 2 strings. Only useful when
- loading Python 2 generated pickled files in Python 3, which includes
- npy/npz files containing object arrays. Values other than 'latin1',
- 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
- data. Default: 'ASCII'
- max_header_size : int, optional
- Maximum allowed size of the header. Large headers may not be safe
- to load securely and thus require explicitly passing a larger value.
- See :py:func:`ast.literal_eval()` for details.
- This option is ignored when `allow_pickle` is passed. In that case
- the file is by definition trusted and the limit is unnecessary.
- Returns
- -------
- result : array, tuple, dict, etc.
- Data stored in the file. For ``.npz`` files, the returned instance
- of NpzFile class must be closed to avoid leaking file descriptors.
- Raises
- ------
- OSError
- If the input file does not exist or cannot be read.
- UnpicklingError
- If ``allow_pickle=True``, but the file cannot be loaded as a pickle.
- ValueError
- The file contains an object array, but ``allow_pickle=False`` given.
- EOFError
- When calling ``np.load`` multiple times on the same file handle,
- if all data has already been read.
- See Also
- --------
- save, savez, savez_compressed, loadtxt
- memmap : Create a memory-map to an array stored in a file on disk.
- lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.
- Notes
- -----
- - If the file contains pickle data, then whatever object is stored
- in the pickle is returned.
- - If the file is a ``.npy`` file, then a single array is returned.
- - If the file is a ``.npz`` file, then a dictionary-like object is
- returned, containing ``{filename: array}`` key-value pairs, one for
- each file in the archive.
- - If the file is a ``.npz`` file, the returned value supports the
- context manager protocol in a similar fashion to the open function::
- with load('foo.npz') as data:
- a = data['a']
- The underlying file descriptor is closed when exiting the 'with'
- block.
- Examples
- --------
- Store data to disk, and load it again:
- >>> np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]]))
- >>> np.load('/tmp/123.npy')
- array([[1, 2, 3],
- [4, 5, 6]])
- Store compressed data to disk, and load it again:
- >>> a=np.array([[1, 2, 3], [4, 5, 6]])
- >>> b=np.array([1, 2])
- >>> np.savez('/tmp/123.npz', a=a, b=b)
- >>> data = np.load('/tmp/123.npz')
- >>> data['a']
- array([[1, 2, 3],
- [4, 5, 6]])
- >>> data['b']
- array([1, 2])
- >>> data.close()
- Mem-map the stored array, and then access the second row
- directly from disk:
- >>> X = np.load('/tmp/123.npy', mmap_mode='r')
- >>> X[1, :]
- memmap([4, 5, 6])
- """
- if encoding not in ('ASCII', 'latin1', 'bytes'):
- # The 'encoding' value for pickle also affects what encoding
- # the serialized binary data of NumPy arrays is loaded
- # in. Pickle does not pass on the encoding information to
- # NumPy. The unpickling code in numpy.core.multiarray is
- # written to assume that unicode data appearing where binary
- # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'.
- #
- # Other encoding values can corrupt binary data, and we
- # purposefully disallow them. For the same reason, the errors=
- # argument is not exposed, as values other than 'strict' can
- # similarly silently corrupt numerical data.
- raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'")
- pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports)
- with contextlib.ExitStack() as stack:
- if hasattr(file, 'read'):
- fid = file
- own_fid = False
- else:
- fid = stack.enter_context(open(os_fspath(file), "rb"))
- own_fid = True
- # Code to distinguish NumPy binary files from pickles.
- _ZIP_PREFIX = b'PK\x03\x04'
- _ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this
- N = len(format.MAGIC_PREFIX)
- magic = fid.read(N)
- if not magic:
- raise EOFError("No data left in file")
- # If the file size is less than N, we need to make sure not
- # to seek past the beginning of the file
- fid.seek(-min(N, len(magic)), 1) # back-up
- if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX):
- # zip-file (assume .npz)
- # Potentially transfer file ownership to NpzFile
- stack.pop_all()
- ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs,
- max_header_size=max_header_size)
- return ret
- elif magic == format.MAGIC_PREFIX:
- # .npy file
- if mmap_mode:
- if allow_pickle:
- max_header_size = 2**64
- return format.open_memmap(file, mode=mmap_mode,
- max_header_size=max_header_size)
- else:
- return format.read_array(fid, allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs,
- max_header_size=max_header_size)
- else:
- # Try a pickle
- if not allow_pickle:
- raise ValueError("Cannot load file containing pickled data "
- "when allow_pickle=False")
- try:
- return pickle.load(fid, **pickle_kwargs)
- except Exception as e:
- raise pickle.UnpicklingError(
- f"Failed to interpret file {file!r} as a pickle") from e
- def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None):
- return (arr,)
- @array_function_dispatch(_save_dispatcher)
- def save(file, arr, allow_pickle=True, fix_imports=True):
- """
- Save an array to a binary file in NumPy ``.npy`` format.
- Parameters
- ----------
- file : file, str, or pathlib.Path
- File or filename to which the data is saved. If file is a file-object,
- then the filename is unchanged. If file is a string or Path, a ``.npy``
- extension will be appended to the filename if it does not already
- have one.
- arr : array_like
- Array data to be saved.
- allow_pickle : bool, optional
- Allow saving object arrays using Python pickles. Reasons for disallowing
- pickles include security (loading pickled data can execute arbitrary
- code) and portability (pickled objects may not be loadable on different
- Python installations, for example if the stored objects require libraries
- that are not available, and not all pickled data is compatible between
- Python 2 and Python 3).
- Default: True
- fix_imports : bool, optional
- Only useful in forcing objects in object arrays on Python 3 to be
- pickled in a Python 2 compatible way. If `fix_imports` is True, pickle
- will try to map the new Python 3 names to the old module names used in
- Python 2, so that the pickle data stream is readable with Python 2.
- See Also
- --------
- savez : Save several arrays into a ``.npz`` archive
- savetxt, load
- Notes
- -----
- For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
- Any data saved to the file is appended to the end of the file.
- Examples
- --------
- >>> from tempfile import TemporaryFile
- >>> outfile = TemporaryFile()
- >>> x = np.arange(10)
- >>> np.save(outfile, x)
- >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
- >>> np.load(outfile)
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- >>> with open('test.npy', 'wb') as f:
- ... np.save(f, np.array([1, 2]))
- ... np.save(f, np.array([1, 3]))
- >>> with open('test.npy', 'rb') as f:
- ... a = np.load(f)
- ... b = np.load(f)
- >>> print(a, b)
- [1 2] [1 3]
- """
- if hasattr(file, 'write'):
- file_ctx = contextlib.nullcontext(file)
- else:
- file = os_fspath(file)
- if not file.endswith('.npy'):
- file = file + '.npy'
- file_ctx = open(file, "wb")
- with file_ctx as fid:
- arr = np.asanyarray(arr)
- format.write_array(fid, arr, allow_pickle=allow_pickle,
- pickle_kwargs=dict(fix_imports=fix_imports))
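- # Illustrative sketch (hypothetical helper and path): string and Path
- # inputs to `save` get a '.npy' suffix appended, while open file objects
- # are written to as-is.
- def _example_save_appends_extension(tmpdir="/tmp"):
-     path = os.path.join(tmpdir, "example_arr")  # no extension given
-     np.save(path, np.arange(3))
-     return os.path.exists(path + ".npy")  # True: '.npy' was appended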
- def _savez_dispatcher(file, *args, **kwds):
- yield from args
- yield from kwds.values()
- @array_function_dispatch(_savez_dispatcher)
- def savez(file, *args, **kwds):
- """Save several arrays into a single file in uncompressed ``.npz`` format.
- Provide arrays as keyword arguments to store them under the
- corresponding name in the output file: ``savez(fn, x=x, y=y)``.
- If arrays are specified as positional arguments, i.e., ``savez(fn,
- x, y)``, their names will be `arr_0`, `arr_1`, etc.
- Parameters
- ----------
- file : str or file
- Either the filename (string) or an open file (file-like object)
- where the data will be saved. If file is a string or a Path, the
- ``.npz`` extension will be appended to the filename if it is not
- already there.
- args : Arguments, optional
- Arrays to save to the file. Please use keyword arguments (see
- `kwds` below) to assign names to arrays. Arrays specified as
- args will be named "arr_0", "arr_1", and so on.
- kwds : Keyword arguments, optional
- Arrays to save to the file. Each array will be saved to the
- output file with its corresponding keyword name.
- Returns
- -------
- None
- See Also
- --------
- save : Save a single array to a binary file in NumPy format.
- savetxt : Save an array to a file as plain text.
- savez_compressed : Save several arrays into a compressed ``.npz`` archive
- Notes
- -----
- The ``.npz`` file format is a zipped archive of files named after the
- variables they contain. The archive is not compressed and each file
- in the archive contains one variable in ``.npy`` format. For a
- description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
- When opening the saved ``.npz`` file with `load` a `NpzFile` object is
- returned. This is a dictionary-like object which can be queried for
- its list of arrays (with the ``.files`` attribute), and for the arrays
- themselves.
- Keys passed in `kwds` are used as filenames inside the ZIP archive.
- Therefore, keys should be valid filenames; e.g., avoid keys that begin with
- ``/`` or contain ``.``.
- When naming variables with keyword arguments, it is not possible to name a
- variable ``file``, as this would cause the ``file`` argument to be defined
- twice in the call to ``savez``.
- Examples
- --------
- >>> from tempfile import TemporaryFile
- >>> outfile = TemporaryFile()
- >>> x = np.arange(10)
- >>> y = np.sin(x)
- Using `savez` with \\*args, the arrays are saved with default names.
- >>> np.savez(outfile, x, y)
- >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
- >>> npzfile = np.load(outfile)
- >>> npzfile.files
- ['arr_0', 'arr_1']
- >>> npzfile['arr_0']
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- Using `savez` with \\**kwds, the arrays are saved with the keyword names.
- >>> outfile = TemporaryFile()
- >>> np.savez(outfile, x=x, y=y)
- >>> _ = outfile.seek(0)
- >>> npzfile = np.load(outfile)
- >>> sorted(npzfile.files)
- ['x', 'y']
- >>> npzfile['x']
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- """
- _savez(file, args, kwds, False)
- def _savez_compressed_dispatcher(file, *args, **kwds):
- yield from args
- yield from kwds.values()
- @array_function_dispatch(_savez_compressed_dispatcher)
- def savez_compressed(file, *args, **kwds):
- """
- Save several arrays into a single file in compressed ``.npz`` format.
- Provide arrays as keyword arguments to store them under the
- corresponding name in the output file: ``savez_compressed(fn, x=x, y=y)``.
- If arrays are specified as positional arguments, i.e.,
- ``savez_compressed(fn, x, y)``, their names will be `arr_0`, `arr_1`, etc.
- Parameters
- ----------
- file : str or file
- Either the filename (string) or an open file (file-like object)
- where the data will be saved. If file is a string or a Path, the
- ``.npz`` extension will be appended to the filename if it is not
- already there.
- args : Arguments, optional
- Arrays to save to the file. Please use keyword arguments (see
- `kwds` below) to assign names to arrays. Arrays specified as
- args will be named "arr_0", "arr_1", and so on.
- kwds : Keyword arguments, optional
- Arrays to save to the file. Each array will be saved to the
- output file with its corresponding keyword name.
- Returns
- -------
- None
- See Also
- --------
- numpy.save : Save a single array to a binary file in NumPy format.
- numpy.savetxt : Save an array to a file as plain text.
- numpy.savez : Save several arrays into an uncompressed ``.npz`` file format
- numpy.load : Load the files created by savez_compressed.
- Notes
- -----
- The ``.npz`` file format is a zipped archive of files named after the
- variables they contain. The archive is compressed with
- ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable
- in ``.npy`` format. For a description of the ``.npy`` format, see
- :py:mod:`numpy.lib.format`.
- When opening the saved ``.npz`` file with `load` a `NpzFile` object is
- returned. This is a dictionary-like object which can be queried for
- its list of arrays (with the ``.files`` attribute), and for the arrays
- themselves.
- Examples
- --------
- >>> test_array = np.random.rand(3, 2)
- >>> test_vector = np.random.rand(4)
- >>> np.savez_compressed('/tmp/123', a=test_array, b=test_vector)
- >>> loaded = np.load('/tmp/123.npz')
- >>> print(np.array_equal(test_array, loaded['a']))
- True
- >>> print(np.array_equal(test_vector, loaded['b']))
- True
- """
- _savez(file, args, kwds, True)
- def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
- # Import is postponed to here since zipfile depends on gzip, an optional
- # component of the so-called standard library.
- import zipfile
- if not hasattr(file, 'write'):
- file = os_fspath(file)
- if not file.endswith('.npz'):
- file = file + '.npz'
- namedict = kwds
- for i, val in enumerate(args):
- key = 'arr_%d' % i
- if key in namedict.keys():
- raise ValueError(
- "Cannot use un-named variables and keyword %s" % key)
- namedict[key] = val
- if compress:
- compression = zipfile.ZIP_DEFLATED
- else:
- compression = zipfile.ZIP_STORED
- zipf = zipfile_factory(file, mode="w", compression=compression)
- for key, val in namedict.items():
- fname = key + '.npy'
- val = np.asanyarray(val)
- # always force zip64, gh-10776
- with zipf.open(fname, 'w', force_zip64=True) as fid:
- format.write_array(fid, val,
- allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs)
- zipf.close()
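- # Illustrative sketch (hypothetical helper): positional arrays are stored
- # as 'arr_0', 'arr_1', ..., so mixing a positional array with an explicit
- # 'arr_0' keyword collides in `_savez`.
- def _example_savez_name_collision():
-     import io
-     buf = io.BytesIO()
-     try:
-         np.savez(buf, np.arange(3), arr_0=np.arange(3))
-     except ValueError as exc:
-         return str(exc)  # "Cannot use un-named variables and keyword arr_0"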
- def _ensure_ndmin_ndarray_check_param(ndmin):
- """Just checks if the param ndmin is supported on
- _ensure_ndmin_ndarray. It is intended to be used as
- verification before running anything expensive.
- e.g. loadtxt, genfromtxt
- """
- # Check correctness of the values of `ndmin`
- if ndmin not in [0, 1, 2]:
- raise ValueError(f"Illegal value of ndmin keyword: {ndmin}")
- def _ensure_ndmin_ndarray(a, *, ndmin: int):
- """This is a helper function of loadtxt and genfromtxt to ensure
- proper minimum dimension as requested
- ndim : int. Supported values 1, 2, 3
- ^^ whenever this changes, keep in sync with
- _ensure_ndmin_ndarray_check_param
- """
- # Verify that the array has at least dimensions `ndmin`.
- # Tweak the size and shape of the arrays - remove extraneous dimensions
- if a.ndim > ndmin:
- a = np.squeeze(a)
- # and ensure we have the minimum number of dimensions asked for
- # - has to be in this order for the odd case ndmin=1, a.squeeze().ndim=0
- if a.ndim < ndmin:
- if ndmin == 1:
- a = np.atleast_1d(a)
- elif ndmin == 2:
- a = np.atleast_2d(a).T
- return a
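- # Illustrative sketch (hypothetical helper) of the squeeze-then-pad rule:
- # extra dimensions are squeezed away first, and ndmin=2 transposes so a
- # 1-D input becomes a column vector rather than a single row.
- def _example_ensure_ndmin():
-     squeezed = _ensure_ndmin_ndarray(np.ones((1, 3)), ndmin=1)
-     column = _ensure_ndmin_ndarray(np.ones(3), ndmin=2)
-     return squeezed.shape, column.shape  # (3,) and (3, 1)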
- # amount of lines loadtxt reads in one chunk, can be overridden for testing
- _loadtxt_chunksize = 50000
- def _check_nonneg_int(value, name="argument"):
- try:
- operator.index(value)
- except TypeError:
- raise TypeError(f"{name} must be an integer") from None
- if value < 0:
- raise ValueError(f"{name} must be nonnegative")
- def _preprocess_comments(iterable, comments, encoding):
- """
- Generator that consumes an iterable of lines and strips out the
- multiple (or multi-character) comments from those lines.
- This is a pre-processing step to achieve feature parity with loadtxt
- (we assume that this is a niche feature).
- """
- for line in iterable:
- if isinstance(line, bytes):
- # Need to handle conversion here, or the splitting would fail
- line = line.decode(encoding)
- for c in comments:
- line = line.split(c, 1)[0]
- yield line
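- # Illustrative sketch (hypothetical helper): each comment marker splits a
- # line once and keeps the left part, so multiple markers ('#' and '//'
- # here) compose.
- def _example_preprocess_comments():
-     lines = ["1 2 # trailing", "3 4 // other style", "// whole line"]
-     return list(_preprocess_comments(lines, ("#", "//"), encoding=None))
-     # -> ['1 2 ', '3 4 ', '']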
- def _read(fname, *, delimiter=',', comment='#', quote='"',
- imaginary_unit='j', usecols=None, skiplines=0,
- max_rows=None, converters=None, ndmin=None, unpack=False,
- dtype=np.float64, encoding="bytes"):
- r"""
- Read a NumPy array from a text file.
- Parameters
- ----------
- fname : str or file object
- The filename or the file to be read.
- delimiter : str, optional
- Field delimiter separating the fields in a line of the file.
- Default is a comma, ','. If None, any sequence of whitespace is
- considered a delimiter.
- comment : str or sequence of str or None, optional
- Character that begins a comment. All text from the comment
- character to the end of the line is ignored.
- Multiple comments or multiple-character comment strings are supported,
- but may be slower and `quote` must be empty if used.
- Use None to disable all use of comments.
- quote : str or None, optional
- Character that is used to quote string fields. Default is '"'
- (a double quote). Use None to disable quote support.
- imaginary_unit : str, optional
- Character that represents the imaginary unit `sqrt(-1)`.
- Default is 'j'.
- usecols : array_like, optional
- A one-dimensional array of integer column numbers. These are the
- columns from the file to be included in the array. If this value
- is not given, all the columns are used.
- skiplines : int, optional
- Number of lines to skip before interpreting the data in the file.
- max_rows : int, optional
- Maximum number of rows of data to read. Default is to read the
- entire file.
- converters : dict or callable, optional
- A function to parse all column strings into the desired value, or
- a dictionary mapping column number to a parser function.
- E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
- Converters can also be used to provide a default value for missing
- data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
- convert empty fields to 0.
- Default: None
- ndmin : int, optional
- Minimum dimension of the array returned.
- Allowed values are 0, 1 or 2. Default is 0.
- unpack : bool, optional
- If True, the returned array is transposed, so that arguments may be
- unpacked using ``x, y, z = read(...)``. When used with a structured
- data-type, arrays are returned for each field. Default is False.
- dtype : numpy data type
- A NumPy dtype instance, can be a structured dtype to map to the
- columns of the file.
- encoding : str, optional
- Encoding used to decode the input file. The special value 'bytes'
- (the default) enables backwards-compatible behavior for `converters`,
- ensuring that inputs to the converter functions are encoded
- bytes objects. The special value 'bytes' has no additional effect if
- ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
- default system encoding is used.
- Returns
- -------
- ndarray
- NumPy array.
- Examples
- --------
- First we create a file for the example.
- >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
- >>> with open('example1.csv', 'w') as f:
- ... f.write(s1)
- >>> a1 = _read('example1.csv')
- >>> a1
- array([[1., 2., 3.],
- [4., 5., 6.]])
- The second example has columns with different data types, so a
- one-dimensional array with a structured data type is returned.
- The tab character is used as the field delimiter.
- >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
- >>> with open('example2.tsv', 'w') as f:
- ... f.write(s2)
- >>> a2 = _read('example2.tsv', delimiter='\t')
- >>> a2
- array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
- dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
- """
- # Handle special 'bytes' keyword for encoding
- byte_converters = False
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
- if dtype is None:
- raise TypeError("a dtype must be provided.")
- dtype = np.dtype(dtype)
- read_dtype_via_object_chunks = None
- if dtype.kind in 'SUM' and (
- dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
- # This is a legacy "flexible" dtype. We do not truly support
- # parametric dtypes currently (no dtype discovery step in the core),
- # but have to support these for backward compatibility.
- read_dtype_via_object_chunks = dtype
- dtype = np.dtype(object)
- if usecols is not None:
- # Allow usecols to be a single int or a sequence of ints; the C code
- # handles the rest
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols]
- _ensure_ndmin_ndarray_check_param(ndmin)
- if comment is None:
- comments = None
- else:
- # assume comments are a sequence of strings
- if "" in comment:
- raise ValueError(
- "comments cannot be an empty string. Use comments=None to "
- "disable comments."
- )
- comments = tuple(comment)
- comment = None
- if len(comments) == 0:
- comments = None # No comments at all
- elif len(comments) == 1:
- # If there is only one comment, and that comment has one character,
- # the normal parsing can deal with it just fine.
- if isinstance(comments[0], str) and len(comments[0]) == 1:
- comment = comments[0]
- comments = None
- else:
- # Input validation if there are multiple comment characters
- if delimiter in comments:
- raise TypeError(
- f"Comment characters '{comments}' cannot include the "
- f"delimiter '{delimiter}'"
- )
- # comment is now either a 1 or 0 character string or a tuple:
- if comments is not None:
- # Note: An earlier version supported two-character comments (and could
- # have been extended to multiple characters); we assume this is
- # rare enough not to optimize for.
- if quote is not None:
- raise ValueError(
- "when multiple comments or a multi-character comment is "
- "given, quotes are not supported. In this case quotechar "
- "must be set to None.")
- if len(imaginary_unit) != 1:
- raise ValueError('len(imaginary_unit) must be 1.')
- _check_nonneg_int(skiplines)
- if max_rows is not None:
- _check_nonneg_int(max_rows)
- else:
- # Passing -1 to the C code means "read the entire file".
- max_rows = -1
- fh_closing_ctx = contextlib.nullcontext()
- filelike = False
- try:
- if isinstance(fname, os.PathLike):
- fname = os.fspath(fname)
- if isinstance(fname, str):
- fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- if encoding is None:
- encoding = getattr(fh, 'encoding', 'latin1')
- fh_closing_ctx = contextlib.closing(fh)
- data = fh
- filelike = True
- else:
- if encoding is None:
- encoding = getattr(fname, 'encoding', 'latin1')
- data = iter(fname)
- except TypeError as e:
- raise ValueError(
- f"fname must be a string, filehandle, list of strings,\n"
- f"or generator. Got {type(fname)} instead.") from e
- with fh_closing_ctx:
- if comments is not None:
- if filelike:
- data = iter(data)
- filelike = False
- data = _preprocess_comments(data, comments, encoding)
- if read_dtype_via_object_chunks is None:
- arr = _load_from_filelike(
- data, delimiter=delimiter, comment=comment, quote=quote,
- imaginary_unit=imaginary_unit,
- usecols=usecols, skiplines=skiplines, max_rows=max_rows,
- converters=converters, dtype=dtype,
- encoding=encoding, filelike=filelike,
- byte_converters=byte_converters)
- else:
- # This branch reads the file into chunks of object arrays and then
- # casts them to the desired actual dtype. This ensures correct
- # string-length and datetime-unit discovery (like `arr.astype()`).
- # Due to chunking, certain error reports are less clear, currently.
- if filelike:
- data = iter(data) # cannot chunk when reading from file
- c_byte_converters = False
- if read_dtype_via_object_chunks == "S":
- c_byte_converters = True # Use latin1 rather than ascii
- chunks = []
- while max_rows != 0:
- if max_rows < 0:
- chunk_size = _loadtxt_chunksize
- else:
- chunk_size = min(_loadtxt_chunksize, max_rows)
- next_arr = _load_from_filelike(
- data, delimiter=delimiter, comment=comment, quote=quote,
- imaginary_unit=imaginary_unit,
- usecols=usecols, skiplines=skiplines, max_rows=chunk_size,
- converters=converters, dtype=dtype,
- encoding=encoding, filelike=filelike,
- byte_converters=byte_converters,
- c_byte_converters=c_byte_converters)
- # Cast here already. We hope that this is better even for
- # large files because the storage is more compact. It could
- # be adapted (in principle the concatenate could cast).
- chunks.append(next_arr.astype(read_dtype_via_object_chunks))
- skiplines = 0 # Only have to skip lines for the first chunk
- if max_rows >= 0:
- max_rows -= chunk_size
- if len(next_arr) < chunk_size:
- # There was less data than requested, so we are done.
- break
- # Need at least one chunk, but if empty, the last one may have
- # the wrong shape.
- if len(chunks) > 1 and len(chunks[-1]) == 0:
- del chunks[-1]
- if len(chunks) == 1:
- arr = chunks[0]
- else:
- arr = np.concatenate(chunks, axis=0)
- # NOTE: ndmin works as advertised for structured dtypes, but normally
- # these would return a 1D result plus the structured dimension,
- # so ndmin=2 adds a third dimension even when no squeezing occurs.
- # A `squeeze=False` could be a better solution (pandas uses squeeze).
- arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
- if arr.shape:
- if arr.shape[0] == 0:
- warnings.warn(
- f'loadtxt: input contained no data: "{fname}"',
- category=UserWarning,
- stacklevel=3
- )
- if unpack:
- # Unpack structured dtypes if requested:
- dt = arr.dtype
- if dt.names is not None:
- # For structured arrays, return an array for each field.
- return [arr[field] for field in dt.names]
- else:
- return arr.T
- else:
- return arr
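- # Illustrative sketch (hypothetical helper) of the chunked-cast strategy
- # used above for parametric dtypes such as "S0"/"U0": parse rows into
- # object arrays, cast each chunk, and let concatenate promote to the
- # widest discovered string length. Assumes `rows` is non-empty, e.g.
- # _example_chunked_cast(["a", "bb", "ccc"]) -> array with dtype '<U3'.
- def _example_chunked_cast(rows, target_dtype="U", chunk_size=2):
-     chunks = [
-         np.array(rows[i:i + chunk_size], dtype=object).astype(target_dtype)
-         for i in range(0, len(rows), chunk_size)
-     ]
-     return chunks[0] if len(chunks) == 1 else np.concatenate(chunks, axis=0)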
- @set_array_function_like_doc
- @set_module('numpy')
- def loadtxt(fname, dtype=float, comments='#', delimiter=None,
- converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None,
- like=None):
- r"""
- Load data from a text file.
- Parameters
- ----------
- fname : file, str, pathlib.Path, list of str, generator
- File, filename, list, or generator to read. If the filename
- extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
- that generators must return bytes or strings. The strings
- in a list or produced by a generator are treated as lines.
- dtype : data-type, optional
- Data-type of the resulting array; default: float. If this is a
- structured data-type, the resulting array will be 1-dimensional, and
- each row will be interpreted as an element of the array. In this
- case, the number of columns used must match the number of fields in
- the data-type.
- comments : str or sequence of str or None, optional
- The characters or list of characters used to indicate the start of a
- comment. None implies no comments. For backwards compatibility, byte
- strings will be decoded as 'latin1'. The default is '#'.
- delimiter : str, optional
- The character used to separate the values. For backwards compatibility,
- byte strings will be decoded as 'latin1'. The default is whitespace.
- .. versionchanged:: 1.23.0
- Only single character delimiters are supported. Newline characters
- cannot be used as the delimiter.
- converters : dict or callable, optional
- Converter functions to customize value parsing. If `converters` is
- callable, the function is applied to all columns, else it must be a
- dict that maps column number to a parser function.
- See examples for further details.
- Default: None.
- .. versionchanged:: 1.23.0
- The ability to pass a single callable to be applied to all columns
- was added.
- skiprows : int, optional
- Skip the first `skiprows` lines, including comments; default: 0.
- usecols : int or sequence, optional
- Which columns to read, with 0 being the first. For example,
- ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
- The default, None, results in all columns being read.
- .. versionchanged:: 1.11.0
- When a single column has to be read it is possible to use
- an integer instead of a tuple. E.g ``usecols = 3`` reads the
- fourth column the same way as ``usecols = (3,)`` would.
- unpack : bool, optional
- If True, the returned array is transposed, so that arguments may be
- unpacked using ``x, y, z = loadtxt(...)``. When used with a
- structured data-type, arrays are returned for each field.
- Default is False.
- ndmin : int, optional
- The returned array will have at least `ndmin` dimensions.
- Otherwise mono-dimensional axes will be squeezed.
- Legal values: 0 (default), 1 or 2.
- .. versionadded:: 1.6.0
- encoding : str, optional
- Encoding used to decode the input file. Does not apply to input streams.
- The special value 'bytes' enables backward compatibility workarounds
- that ensure you receive byte arrays as results if possible and pass
- 'latin1' encoded strings to converters. Override this value to receive
- unicode arrays and pass strings as input to converters. If set to None
- the system default is used. The default value is 'bytes'.
- .. versionadded:: 1.14.0
- max_rows : int, optional
- Read `max_rows` rows of content after `skiprows` lines. The default is
- to read all the rows. Note that empty rows containing no data such as
- empty lines and comment lines are not counted towards `max_rows`,
- while such lines are counted in `skiprows`.
- .. versionadded:: 1.16.0
- .. versionchanged:: 1.23.0
- Lines containing no data, including comment lines (e.g., lines
- starting with '#' or as specified via `comments`) are not counted
- towards `max_rows`.
- quotechar : unicode character or None, optional
- The character used to denote the start and end of a quoted item.
- Occurrences of the delimiter or comment characters are ignored within
- a quoted item. The default value is ``quotechar=None``, which means
- quoting support is disabled.
- If two consecutive instances of `quotechar` are found within a quoted
- field, the first is treated as an escape character. See examples.
- .. versionadded:: 1.23.0
- ${ARRAY_FUNCTION_LIKE}
- .. versionadded:: 1.20.0
- Returns
- -------
- out : ndarray
- Data read from the text file.
- See Also
- --------
- load, fromstring, fromregex
- genfromtxt : Load data with missing values handled as specified.
- scipy.io.loadmat : reads MATLAB data files
- Notes
- -----
- This function aims to be a fast reader for simply formatted files. The
- `genfromtxt` function provides more sophisticated handling of, e.g.,
- lines with missing values.
- Each row in the input text file must have the same number of values to be
- able to read all values. If all rows do not have the same number of values, a
- subset of up to n columns (where n is the least number of values present
- in all rows) can be read by specifying the columns via `usecols`.
- .. versionadded:: 1.10.0
- The strings produced by the Python float.hex method can be used as
- input for floats.
- Examples
- --------
- >>> from io import StringIO # StringIO behaves like a file object
- >>> c = StringIO("0 1\n2 3")
- >>> np.loadtxt(c)
- array([[0., 1.],
- [2., 3.]])
- >>> d = StringIO("M 21 72\nF 35 58")
- >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
- ... 'formats': ('S1', 'i4', 'f4')})
- array([(b'M', 21, 72.), (b'F', 35, 58.)],
- dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')])
- >>> c = StringIO("1,0,2\n3,0,4")
- >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
- >>> x
- array([1., 3.])
- >>> y
- array([2., 4.])
- The `converters` argument is used to specify functions to preprocess the
- text prior to parsing. `converters` can be a dictionary that maps
- preprocessing functions to each column:
- >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n")
- >>> conv = {
- ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0
- ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1
- ... }
- >>> np.loadtxt(s, delimiter=",", converters=conv)
- array([[1., 3.],
- [3., 5.]])
- `converters` can be a callable instead of a dictionary, in which case it
- is applied to all columns:
- >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE")
- >>> import functools
- >>> conv = functools.partial(int, base=16)
- >>> np.loadtxt(s, converters=conv)
- array([[222., 173.],
- [192., 222.]])
- This example shows how `converters` can be used to convert a field
- with a trailing minus sign into a negative number.
- >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94')
- >>> def conv(fld):
- ... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld)
- ...
- >>> np.loadtxt(s, converters=conv)
- array([[ 10.01, -31.25],
- [ 19.22, 64.31],
- [-17.57, 63.94]])
- Using a callable as the converter can be particularly useful for handling
- values with different formatting, e.g. floats with underscores:
- >>> s = StringIO("1 2.7 100_000")
- >>> np.loadtxt(s, converters=float)
- array([1.e+00, 2.7e+00, 1.e+05])
- This idea can be extended to automatically handle values specified in
- many different formats:
- >>> def conv(val):
- ... try:
- ... return float(val)
- ... except ValueError:
- ... return float.fromhex(val)
- >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2")
- >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None)
- array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00])
- Note that with the default ``encoding="bytes"``, the inputs to the
- converter function are latin-1 encoded byte strings. To deactivate the
- implicit encoding prior to conversion, use ``encoding=None``
- >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94')
- >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x)
- >>> np.loadtxt(s, converters=conv, encoding=None)
- array([[ 10.01, -31.25],
- [ 19.22, 64.31],
- [-17.57, 63.94]])
- Support for quoted fields is enabled with the `quotechar` parameter.
- Comment and delimiter characters are ignored when they appear within a
- quoted item delineated by `quotechar`:
- >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n')
- >>> dtype = np.dtype([("label", "U12"), ("value", float)])
- >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"')
- array([('alpha, #42', 10.), ('beta, #64', 2.)],
- dtype=[('label', '<U12'), ('value', '<f8')])
- Quoted fields can be separated by multiple whitespace characters:
- >>> s = StringIO('"alpha, #42" 10.0\n"beta, #64" 2.0\n')
- >>> dtype = np.dtype([("label", "U12"), ("value", float)])
- >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"')
- array([('alpha, #42', 10.), ('beta, #64', 2.)],
- dtype=[('label', '<U12'), ('value', '<f8')])
- Two consecutive quote characters within a quoted field are treated as a
- single escaped character:
- >>> s = StringIO('"Hello, my name is ""Monty""!"')
- >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"')
- array('Hello, my name is "Monty"!', dtype='<U26')
- Read subset of columns when all rows do not contain equal number of values:
- >>> d = StringIO("1 2\n2 4\n3 9 12\n4 16 20")
- >>> np.loadtxt(d, usecols=(0, 1))
- array([[ 1., 2.],
- [ 2., 4.],
- [ 3., 9.],
- [ 4., 16.]])
- """
- if like is not None:
- return _loadtxt_with_like(
- like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
- converters=converters, skiprows=skiprows, usecols=usecols,
- unpack=unpack, ndmin=ndmin, encoding=encoding,
- max_rows=max_rows
- )
- if dtype is None:
- dtype = np.float64
- comment = comments
- # Control character type conversions for Py3 convenience
- if comment is not None:
- if isinstance(comment, (str, bytes)):
- comment = [comment]
- comment = [
- x.decode('latin1') if isinstance(x, bytes) else x for x in comment]
- if isinstance(delimiter, bytes):
- delimiter = delimiter.decode('latin1')
- arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
- converters=converters, skiplines=skiprows, usecols=usecols,
- unpack=unpack, ndmin=ndmin, encoding=encoding,
- max_rows=max_rows, quote=quotechar)
- return arr
- _loadtxt_with_like = array_function_dispatch()(loadtxt)
- def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
- header=None, footer=None, comments=None,
- encoding=None):
- return (X,)
- @array_function_dispatch(_savetxt_dispatcher)
- def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
- footer='', comments='# ', encoding=None):
- """
- Save an array to a text file.
- Parameters
- ----------
- fname : filename or file handle
- If the filename ends in ``.gz``, the file is automatically saved in
- compressed gzip format. `loadtxt` understands gzipped files
- transparently.
- X : 1D or 2D array_like
- Data to be saved to a text file.
- fmt : str or sequence of strs, optional
- A single format (%10.5f), a sequence of formats, or a
- multi-format string, e.g. 'Iteration %d -- %10.5f', in which
- case `delimiter` is ignored. For complex `X`, the legal options
- for `fmt` are:
- * a single specifier, `fmt='%.4e'`, resulting in numbers formatted
- like `' (%s+%sj)' % (fmt, fmt)`
- * a full string specifying every real and imaginary part, e.g.
- `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns
- * a list of specifiers, one per column - in this case, the real
- and imaginary part must have separate specifiers,
- e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns
- delimiter : str, optional
- String or character separating columns.
- newline : str, optional
- String or character separating lines.
- .. versionadded:: 1.5.0
- header : str, optional
- String that will be written at the beginning of the file.
- .. versionadded:: 1.7.0
- footer : str, optional
- String that will be written at the end of the file.
- .. versionadded:: 1.7.0
- comments : str, optional
- String that will be prepended to the ``header`` and ``footer`` strings,
- to mark them as comments. Default: '# ', as expected by e.g.
- ``numpy.loadtxt``.
- .. versionadded:: 1.7.0
- encoding : {None, str}, optional
- Encoding used to encode the output file. Does not apply to output
- streams. If the encoding is something other than 'bytes' or 'latin1'
- you will not be able to load the file in NumPy versions < 1.14. Default
- is 'latin1'.
- .. versionadded:: 1.14.0
- See Also
- --------
- save : Save an array to a binary file in NumPy ``.npy`` format
- savez : Save several arrays into an uncompressed ``.npz`` archive
- savez_compressed : Save several arrays into a compressed ``.npz`` archive
- Notes
- -----
- Further explanation of the `fmt` parameter
- (``%[flag]width[.precision]specifier``):
- flags:
- ``-`` : left justify
- ``+`` : Forces to precede result with + or -.
- ``0`` : Left pad the number with zeros instead of space (see width).
- width:
- Minimum number of characters to be printed. The value is not truncated
- if it has more characters.
- precision:
- - For integer specifiers (eg. ``d,i,o,x``), the minimum number of
- digits.
- - For ``e, E`` and ``f`` specifiers, the number of digits to print
- after the decimal point.
- - For ``g`` and ``G``, the maximum number of significant digits.
- - For ``s``, the maximum number of characters.
- specifiers:
- ``c`` : character
- ``d`` or ``i`` : signed decimal integer
- ``e`` or ``E`` : scientific notation with ``e`` or ``E``.
- ``f`` : decimal floating point
- ``g,G`` : use the shorter of ``e,E`` or ``f``
- ``o`` : signed octal
- ``s`` : string of characters
- ``u`` : unsigned decimal integer
- ``x,X`` : unsigned hexadecimal integer
- This explanation of ``fmt`` is not complete; for an exhaustive
- specification see [1]_.
- References
- ----------
- .. [1] `Format Specification Mini-Language
- <https://docs.python.org/library/string.html#format-specification-mini-language>`_,
- Python Documentation.
- Examples
- --------
- >>> x = y = z = np.arange(0.0,5.0,1.0)
- >>> np.savetxt('test.out', x, delimiter=',') # X is an array
- >>> np.savetxt('test.out', (x,y,z)) # x,y,z equal sized 1D arrays
- >>> np.savetxt('test.out', x, fmt='%1.4e') # use exponential notation
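- For complex data, a single specifier is expanded to cover both the real
- and imaginary parts, as described under `fmt` above (an illustrative
- sketch, reusing the array `x` defined above):
- >>> np.savetxt('test.out', x + 1j*x, fmt='%1.4e') # writes (re+imj) pairs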
- """
- # Py3 conversions first
- if isinstance(fmt, bytes):
- fmt = asstr(fmt)
- delimiter = asstr(delimiter)
- class WriteWrap:
- """Convert to bytes on bytestream inputs.
- """
- def __init__(self, fh, encoding):
- self.fh = fh
- self.encoding = encoding
- self.do_write = self.first_write
- def close(self):
- self.fh.close()
- def write(self, v):
- self.do_write(v)
- def write_bytes(self, v):
- if isinstance(v, bytes):
- self.fh.write(v)
- else:
- self.fh.write(v.encode(self.encoding))
- def write_normal(self, v):
- self.fh.write(asunicode(v))
- def first_write(self, v):
- try:
- self.write_normal(v)
- self.write = self.write_normal
- except TypeError:
- # input is probably a bytestream
- self.write_bytes(v)
- self.write = self.write_bytes
- own_fh = False
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if _is_string_like(fname):
- # datasource doesn't support creating a new file ...
- open(fname, 'wt').close()
- fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
- own_fh = True
- elif hasattr(fname, 'write'):
- # wrap to handle byte output streams
- fh = WriteWrap(fname, encoding or 'latin1')
- else:
- raise ValueError('fname must be a string or file handle')
- try:
- X = np.asarray(X)
- # Handle 1-dimensional arrays
- if X.ndim == 0 or X.ndim > 2:
- raise ValueError(
- "Expected 1D or 2D array, got %dD array instead" % X.ndim)
- elif X.ndim == 1:
- # Common case -- 1d array of numbers
- if X.dtype.names is None:
- X = np.atleast_2d(X).T
- ncol = 1
- # Complex dtype -- each field indicates a separate column
- else:
- ncol = len(X.dtype.names)
- else:
- ncol = X.shape[1]
- iscomplex_X = np.iscomplexobj(X)
- # `fmt` can be a string with multiple insertion points or a
- # list of formats. E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
- if type(fmt) in (list, tuple):
- if len(fmt) != ncol:
- raise AttributeError('fmt has wrong shape. %s' % str(fmt))
- format = asstr(delimiter).join(map(asstr, fmt))
- elif isinstance(fmt, str):
- n_fmt_chars = fmt.count('%')
- error = ValueError('fmt has wrong number of %% formats: %s' % fmt)
- if n_fmt_chars == 1:
- if iscomplex_X:
- fmt = [' (%s+%sj)' % (fmt, fmt), ] * ncol
- else:
- fmt = [fmt, ] * ncol
- format = delimiter.join(fmt)
- elif iscomplex_X and n_fmt_chars != (2 * ncol):
- raise error
- elif ((not iscomplex_X) and n_fmt_chars != ncol):
- raise error
- else:
- format = fmt
- else:
- raise ValueError('invalid fmt: %r' % (fmt,))
- if len(header) > 0:
- header = header.replace('\n', '\n' + comments)
- fh.write(comments + header + newline)
- if iscomplex_X:
- for row in X:
- row2 = []
- for number in row:
- row2.append(number.real)
- row2.append(number.imag)
- s = format % tuple(row2) + newline
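- # A negative imaginary part renders as '+-' under the '+%sj' pattern
- # built above; collapse it to a plain '-' before writing.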
- fh.write(s.replace('+-', '-'))
- else:
- for row in X:
- try:
- v = format % tuple(row) + newline
- except TypeError as e:
- raise TypeError("Mismatch between array dtype ('%s') and "
- "format specifier ('%s')"
- % (str(X.dtype), format)) from e
- fh.write(v)
- if len(footer) > 0:
- footer = footer.replace('\n', '\n' + comments)
- fh.write(comments + footer + newline)
- finally:
- if own_fh:
- fh.close()
- @set_module('numpy')
- def fromregex(file, regexp, dtype, encoding=None):
- r"""
- Construct an array from a text file, using regular expression parsing.
- The returned array is always a structured array, and is constructed from
- all matches of the regular expression in the file. Groups in the regular
- expression are converted to fields of the structured array.
- Parameters
- ----------
- file : path or file
- Filename or file object to read.
- .. versionchanged:: 1.22.0
- Now accepts `os.PathLike` implementations.
- regexp : str or regexp
- Regular expression used to parse the file.
- Groups in the regular expression correspond to fields in the dtype.
- dtype : dtype or list of dtypes
- Dtype for the structured array; must be a structured datatype.
- encoding : str, optional
- Encoding used to decode the input file. Does not apply to input streams.
- .. versionadded:: 1.14.0
- Returns
- -------
- output : ndarray
- The output array, containing the part of the content of `file` that
- was matched by `regexp`. `output` is always a structured array.
- Raises
- ------
- TypeError
- When `dtype` is not a valid dtype for a structured array.
- See Also
- --------
- fromstring, loadtxt
- Notes
- -----
- Dtypes for structured arrays can be specified in several forms, but all
- forms specify at least the data type and field name. For details see
- `basics.rec`.
- Examples
- --------
- >>> from io import StringIO
- >>> text = StringIO("1312 foo\n1534 bar\n444 qux")
- >>> regexp = r"(\d+)\s+(...)" # match [digits, whitespace, anything]
- >>> output = np.fromregex(text, regexp,
- ... [('num', np.int64), ('key', 'S3')])
- >>> output
- array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')],
- dtype=[('num', '<i8'), ('key', 'S3')])
- >>> output['num']
- array([1312, 1534, 444])
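- If the pattern contains a single group, the result is still a one-field
- structured array. A sketch reusing `text` from above (the output shown
- is representative):
- >>> _ = text.seek(0)
- >>> np.fromregex(text, r"(\d+)", [('num', np.int64)])
- array([(1312,), (1534,), ( 444,)], dtype=[('num', '<i8')])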
- """
- own_fh = False
- if not hasattr(file, "read"):
- file = os.fspath(file)
- file = np.lib._datasource.open(file, 'rt', encoding=encoding)
- own_fh = True
- try:
- if not isinstance(dtype, np.dtype):
- dtype = np.dtype(dtype)
- if dtype.names is None:
- raise TypeError('dtype must be a structured datatype.')
- content = file.read()
- if isinstance(content, bytes) and isinstance(regexp, str):
- regexp = asbytes(regexp)
- elif isinstance(content, str) and isinstance(regexp, bytes):
- regexp = asstr(regexp)
- if not hasattr(regexp, 'match'):
- regexp = re.compile(regexp)
- seq = regexp.findall(content)
- if seq and not isinstance(seq[0], tuple):
- # Only one group is in the regexp.
- # Create the new array as a single data-type and then
- # re-interpret as a single-field structured array.
- newdtype = np.dtype(dtype[dtype.names[0]])
- output = np.array(seq, dtype=newdtype)
- output.dtype = dtype
- else:
- output = np.array(seq, dtype=dtype)
- return output
- finally:
- if own_fh:
- file.close()
- #####--------------------------------------------------------------------------
- #---- --- ASCII functions ---
- #####--------------------------------------------------------------------------
- @set_array_function_like_doc
- @set_module('numpy')
- def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
- skip_header=0, skip_footer=0, converters=None,
- missing_values=None, filling_values=None, usecols=None,
- names=None, excludelist=None,
- deletechars=''.join(sorted(NameValidator.defaultdeletechars)),
- replace_space='_', autostrip=False, case_sensitive=True,
- defaultfmt="f%i", unpack=None, usemask=False, loose=True,
- invalid_raise=True, max_rows=None, encoding='bytes',
- *, ndmin=0, like=None):
- """
- Load data from a text file, with missing values handled as specified.
- Each line past the first `skip_header` lines is split at the `delimiter`
- character, and characters following the `comments` character are discarded.
- Parameters
- ----------
- fname : file, str, pathlib.Path, list of str, generator
- File, filename, list, or generator to read. If the filename
- extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
- that generators must return bytes or strings. The strings
- in a list or produced by a generator are treated as lines.
- dtype : dtype, optional
- Data type of the resulting array.
- If None, the dtypes will be determined by the contents of each
- column, individually.
- comments : str, optional
- The character used to indicate the start of a comment.
- All the characters occurring on a line after a comment are discarded.
- delimiter : str, int, or sequence, optional
- The string used to separate values. By default, any consecutive
- whitespace acts as the delimiter. An integer or sequence of integers
- can also be provided as width(s) of each field.
- skiprows : int, optional
- `skiprows` was removed in numpy 1.10. Please use `skip_header` instead.
- skip_header : int, optional
- The number of lines to skip at the beginning of the file.
- skip_footer : int, optional
- The number of lines to skip at the end of the file.
- converters : variable, optional
- The set of functions that convert the data of a column to a value.
- The converters can also be used to provide a default value
- for missing data: ``converters = {3: lambda s: float(s or 0)}``.
- missing : variable, optional
- `missing` was removed in numpy 1.10. Please use `missing_values`
- instead.
- missing_values : variable, optional
- The set of strings corresponding to missing data.
- filling_values : variable, optional
- The set of values to be used as default when the data are missing.
- usecols : sequence, optional
- Which columns to read, with 0 being the first. For example,
- ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
- names : {None, True, str, sequence}, optional
- If `names` is True, the field names are read from the first line after
- the first `skip_header` lines. This line can optionally be preceded
- by a comment delimiter. If `names` is a sequence or a single string of
- comma-separated names, the names will be used to define the field names
- in a structured dtype. If `names` is None, the names of the dtype
- fields will be used, if any.
- excludelist : sequence, optional
- A list of names to exclude. This list is appended to the default list
- ['return','file','print']. Excluded names are appended with an
- underscore: for example, `file` would become `file_`.
- deletechars : str, optional
- A string combining invalid characters that must be deleted from the
- names.
- defaultfmt : str, optional
- A format used to define default field names, such as "f%i" or "f_%02i".
- autostrip : bool, optional
- Whether to automatically strip white spaces from the variables.
- replace_space : char, optional
- Character(s) used to replace white spaces in the variable
- names. By default, '_' is used.
- case_sensitive : {True, False, 'upper', 'lower'}, optional
- If True, field names are case sensitive.
- If False or 'upper', field names are converted to upper case.
- If 'lower', field names are converted to lower case.
- unpack : bool, optional
- If True, the returned array is transposed, so that arguments may be
- unpacked using ``x, y, z = genfromtxt(...)``. When used with a
- structured data-type, arrays are returned for each field.
- Default is False.
- usemask : bool, optional
- If True, return a masked array.
- If False, return a regular array.
- loose : bool, optional
- If True, do not raise errors for invalid values.
- invalid_raise : bool, optional
- If True, an exception is raised if an inconsistency is detected in the
- number of columns.
- If False, a warning is emitted and the offending lines are skipped.
- max_rows : int, optional
- The maximum number of rows to read. Must not be used with skip_footer
- at the same time. If given, the value must be at least 1. Default is
- to read the entire file.
- .. versionadded:: 1.10.0
- encoding : str, optional
- Encoding used to decode the input file. Does not apply when `fname` is
- a file object. The special value 'bytes' enables backward-compatibility
- workarounds that ensure you receive byte arrays when possible, and that
- latin1-encoded strings are passed to converters. Override this value to
- receive unicode arrays and pass strings as input to converters. If set
- to None, the system default is used. The default value is 'bytes'.
- .. versionadded:: 1.14.0
- ndmin : int, optional
- Same parameter as `loadtxt`.
- .. versionadded:: 1.23.0
- ${ARRAY_FUNCTION_LIKE}
- .. versionadded:: 1.20.0
- Returns
- -------
- out : ndarray
- Data read from the text file. If `usemask` is True, this is a
- masked array.
- See Also
- --------
- numpy.loadtxt : equivalent function when no data is missing.
- Notes
- -----
- * When spaces are used as delimiters, or when no delimiter has been given
- as input, there should not be any missing data between two fields.
- * When the variables are named (either by a flexible dtype or with `names`),
- there must not be any header in the file (else a ValueError
- exception is raised).
- * Individual values are not stripped of spaces by default.
- When using a custom converter, make sure the function removes spaces.
- References
- ----------
- .. [1] NumPy User Guide, section `I/O with NumPy
- <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.
- Examples
- --------
- >>> from io import StringIO
- >>> import numpy as np
- Comma delimited file with mixed dtype
- >>> s = StringIO(u"1,1.3,abcde")
- >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'),
- ... ('mystring','S5')], delimiter=",")
- >>> data
- array((1, 1.3, b'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
- Using dtype = None
- >>> _ = s.seek(0) # needed for StringIO example only
- >>> data = np.genfromtxt(s, dtype=None,
- ... names = ['myint','myfloat','mystring'], delimiter=",")
- >>> data
- array((1, 1.3, b'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
- Specifying dtype and names
- >>> _ = s.seek(0)
- >>> data = np.genfromtxt(s, dtype="i8,f8,S5",
- ... names=['myint','myfloat','mystring'], delimiter=",")
- >>> data
- array((1, 1.3, b'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
- An example with fixed-width columns
- >>> s = StringIO(u"11.3abcde")
- >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'],
- ... delimiter=[1,3,5])
- >>> data
- array((1, 1.3, b'abcde'),
- dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', 'S5')])
- An example to show comments
- >>> f = StringIO('''
- ... text,# of chars
- ... hello world,11
- ... numpy,5''')
- >>> np.genfromtxt(f, dtype='S12,S12', delimiter=',')
- array([(b'text', b''), (b'hello world', b'11'), (b'numpy', b'5')],
- dtype=[('f0', 'S12'), ('f1', 'S12')])
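- An example handling missing values (a minimal sketch with illustrative
- data; empty fields are treated as missing by default and replaced by
- `filling_values`):
- >>> s = StringIO('1,,3\n4,5,6')
- >>> np.genfromtxt(s, delimiter=',', filling_values=-1)
- array([[ 1., -1., 3.],
- [ 4., 5., 6.]])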
- """
- if like is not None:
- return _genfromtxt_with_like(
- like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
- skip_header=skip_header, skip_footer=skip_footer,
- converters=converters, missing_values=missing_values,
- filling_values=filling_values, usecols=usecols, names=names,
- excludelist=excludelist, deletechars=deletechars,
- replace_space=replace_space, autostrip=autostrip,
- case_sensitive=case_sensitive, defaultfmt=defaultfmt,
- unpack=unpack, usemask=usemask, loose=loose,
- invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding,
- ndmin=ndmin,
- )
- _ensure_ndmin_ndarray_check_param(ndmin)
- if max_rows is not None:
- if skip_footer:
- raise ValueError(
- "The keywords 'skip_footer' and 'max_rows' can not be "
- "specified at the same time.")
- if max_rows < 1:
- raise ValueError("'max_rows' must be at least 1.")
- if usemask:
- from numpy.ma import MaskedArray, make_mask_descr
- # Check the input dictionary of converters
- user_converters = converters or {}
- if not isinstance(user_converters, dict):
- raise TypeError(
- "The input argument 'converters' should be a valid dictionary "
- "(got '%s' instead)" % type(user_converters))
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
- else:
- byte_converters = False
- # Initialize the filehandle, the LineSplitter and the NameValidator
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if isinstance(fname, str):
- fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- fid_ctx = contextlib.closing(fid)
- else:
- fid = fname
- fid_ctx = contextlib.nullcontext(fid)
- try:
- fhd = iter(fid)
- except TypeError as e:
- raise TypeError(
- "fname must be a string, a filehandle, a sequence of strings,\n"
- f"or an iterator of strings. Got {type(fname)} instead."
- ) from e
- with fid_ctx:
- split_line = LineSplitter(delimiter=delimiter, comments=comments,
- autostrip=autostrip, encoding=encoding)
- validate_names = NameValidator(excludelist=excludelist,
- deletechars=deletechars,
- case_sensitive=case_sensitive,
- replace_space=replace_space)
- # Skip the first `skip_header` rows
- try:
- for i in range(skip_header):
- next(fhd)
- # Keep on until we find the first valid values
- first_values = None
- while not first_values:
- first_line = _decode_line(next(fhd), encoding)
- if (names is True) and (comments is not None):
- if comments in first_line:
- first_line = (
- ''.join(first_line.split(comments)[1:]))
- first_values = split_line(first_line)
- except StopIteration:
- # return an empty array if the datafile is empty
- first_line = ''
- first_values = []
- warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
- # Should we take the first values as names?
- if names is True:
- fval = first_values[0].strip()
- if comments is not None:
- if fval in comments:
- del first_values[0]
- # Check the columns to use: make sure `usecols` is a list
- if usecols is not None:
- try:
- usecols = [_.strip() for _ in usecols.split(",")]
- except AttributeError:
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols, ]
- nbcols = len(usecols or first_values)
- # Check the names and overwrite the dtype.names if needed
- if names is True:
- names = validate_names([str(_.strip()) for _ in first_values])
- first_line = ''
- elif _is_string_like(names):
- names = validate_names([_.strip() for _ in names.split(',')])
- elif names:
- names = validate_names(names)
- # Get the dtype
- if dtype is not None:
- dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names,
- excludelist=excludelist,
- deletechars=deletechars,
- case_sensitive=case_sensitive,
- replace_space=replace_space)
- # Make sure the names is a list (for 2.5)
- if names is not None:
- names = list(names)
- if usecols:
- for (i, current) in enumerate(usecols):
- # if usecols is a list of names, convert to a list of indices
- if _is_string_like(current):
- usecols[i] = names.index(current)
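- # negative column indices count from the end of the row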
- elif current < 0:
- usecols[i] = current + len(first_values)
- # If the dtype is not None, make sure we update it
- if (dtype is not None) and (len(dtype) > nbcols):
- descr = dtype.descr
- dtype = np.dtype([descr[_] for _ in usecols])
- names = list(dtype.names)
- # If `names` is not None, update the names
- elif (names is not None) and (len(names) > nbcols):
- names = [names[_] for _ in usecols]
- elif (names is not None) and (dtype is not None):
- names = list(dtype.names)
- # Process the missing values ...............................
- # Rename missing_values for convenience
- user_missing_values = missing_values or ()
- if isinstance(user_missing_values, bytes):
- user_missing_values = user_missing_values.decode('latin1')
- # Define the list of missing_values (one column: one list)
- missing_values = [list(['']) for _ in range(nbcols)]
- # We have a dictionary: process it field by field
- if isinstance(user_missing_values, dict):
- # Loop on the items
- for (key, val) in user_missing_values.items():
- # Is the key a string?
- if _is_string_like(key):
- try:
- # Transform it into an integer
- key = names.index(key)
- except ValueError:
- # We couldn't find it: the name must have been dropped
- continue
- # Redefine the key as needed if it's a column number
- if usecols:
- try:
- key = usecols.index(key)
- except ValueError:
- pass
- # Transform the value into a list of strings
- if isinstance(val, (list, tuple)):
- val = [str(_) for _ in val]
- else:
- val = [str(val), ]
- # Add the value(s) to the current list of missing
- if key is None:
- # None acts as default
- for miss in missing_values:
- miss.extend(val)
- else:
- missing_values[key].extend(val)
- # We have a sequence: each item matches a column
- elif isinstance(user_missing_values, (list, tuple)):
- for (value, entry) in zip(user_missing_values, missing_values):
- value = str(value)
- if value not in entry:
- entry.append(value)
- # We have a string: apply it to all entries
- elif isinstance(user_missing_values, str):
- user_value = user_missing_values.split(",")
- for entry in missing_values:
- entry.extend(user_value)
- # We have something else: apply it to all entries
- else:
- for entry in missing_values:
- entry.extend([str(user_missing_values)])
- # Process the filling_values ...............................
- # Rename the input for convenience
- user_filling_values = filling_values
- if user_filling_values is None:
- user_filling_values = []
- # Define the default
- filling_values = [None] * nbcols
- # We have a dictionary: update each entry individually
- if isinstance(user_filling_values, dict):
- for (key, val) in user_filling_values.items():
- if _is_string_like(key):
- try:
- # Transform it into an integer
- key = names.index(key)
- except ValueError:
- # We couldn't find it: the name must have been dropped.
- continue
- # Redefine the key if it's a column number and usecols is defined
- if usecols:
- try:
- key = usecols.index(key)
- except ValueError:
- pass
- # Add the value to the list
- filling_values[key] = val
- # We have a sequence: update on a one-to-one basis
- elif isinstance(user_filling_values, (list, tuple)):
- n = len(user_filling_values)
- if (n <= nbcols):
- filling_values[:n] = user_filling_values
- else:
- filling_values = user_filling_values[:nbcols]
- # We have something else: use it for all entries
- else:
- filling_values = [user_filling_values] * nbcols
- # Initialize the converters ................................
- if dtype is None:
- # Note: we can't use a [...]*nbcols, as we would have 3 times the same
- # ... converter, instead of 3 different converters.
- converters = [StringConverter(None, missing_values=miss, default=fill)
- for (miss, fill) in zip(missing_values, filling_values)]
- else:
- dtype_flat = flatten_dtype(dtype, flatten_base=True)
- # Initialize the converters
- if len(dtype_flat) > 1:
- # Flexible type : get a converter from each dtype
- zipit = zip(dtype_flat, missing_values, filling_values)
- converters = [StringConverter(dt, locked=True,
- missing_values=miss, default=fill)
- for (dt, miss, fill) in zipit]
- else:
- # Set to a default converter (but w/ different missing values)
- zipit = zip(missing_values, filling_values)
- converters = [StringConverter(dtype, locked=True,
- missing_values=miss, default=fill)
- for (miss, fill) in zipit]
- # Update the converters to use the user-defined ones
- uc_update = []
- for (j, conv) in user_converters.items():
- # If the converter is specified by column names, use the index instead
- if _is_string_like(j):
- try:
- j = names.index(j)
- i = j
- except ValueError:
- continue
- elif usecols:
- try:
- i = usecols.index(j)
- except ValueError:
- # Unused converter specified
- continue
- else:
- i = j
- # Find the value to test - first_line is not filtered by usecols:
- if len(first_line):
- testing_value = first_values[j]
- else:
- testing_value = None
- if conv is bytes:
- user_conv = asbytes
- elif byte_converters:
- # converters may use decode to work around numpy's old behaviour,
- # so encode the string again before passing to the user converter
- def tobytes_first(x, conv):
- if type(x) is bytes:
- return conv(x)
- return conv(x.encode("latin1"))
- user_conv = functools.partial(tobytes_first, conv=conv)
- else:
- user_conv = conv
- converters[i].update(user_conv, locked=True,
- testing_value=testing_value,
- default=filling_values[i],
- missing_values=missing_values[i],)
- uc_update.append((i, user_conv))
- # Make sure we have the corrected keys in user_converters...
- user_converters.update(uc_update)
- # FIXME: possible error, as the following variable is never used.
- # miss_chars = [_.missing_values for _ in converters]
- # Initialize the output lists ...
- # ... rows
- rows = []
- append_to_rows = rows.append
- # ... masks
- if usemask:
- masks = []
- append_to_masks = masks.append
- # ... invalid
- invalid = []
- append_to_invalid = invalid.append
- # Parse each line
- for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
- values = split_line(line)
- nbvalues = len(values)
- # Skip an empty line
- if nbvalues == 0:
- continue
- if usecols:
- # Select only the columns we need
- try:
- values = [values[_] for _ in usecols]
- except IndexError:
- append_to_invalid((i + skip_header + 1, nbvalues))
- continue
- elif nbvalues != nbcols:
- append_to_invalid((i + skip_header + 1, nbvalues))
- continue
- # Store the values
- append_to_rows(tuple(values))
- if usemask:
- append_to_masks(tuple([v.strip() in m
- for (v, m) in zip(values,
- missing_values)]))
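- # Stop once `max_rows` valid rows have been stored; skipped or
- # invalid lines do not count towards the limit.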
- if len(rows) == max_rows:
- break
- # Upgrade the converters (if needed)
- if dtype is None:
- for (i, converter) in enumerate(converters):
- current_column = [itemgetter(i)(_m) for _m in rows]
- try:
- converter.iterupgrade(current_column)
- except ConverterLockError:
- errmsg = "Converter #%i is locked and cannot be upgraded: " % i
- current_column = map(itemgetter(i), rows)
- for (j, value) in enumerate(current_column):
- try:
- converter.upgrade(value)
- except (ConverterError, ValueError):
- errmsg += "(occurred line #%i for value '%s')"
- errmsg %= (j + 1 + skip_header, value)
- raise ConverterError(errmsg)
- # Check that we don't have invalid values
- nbinvalid = len(invalid)
- if nbinvalid > 0:
- nbrows = len(rows) + nbinvalid - skip_footer
- # Construct the error message
- template = " Line #%%i (got %%i columns instead of %i)" % nbcols
- if skip_footer > 0:
- nbinvalid_skipped = len([_ for _ in invalid
- if _[0] > nbrows + skip_header])
- invalid = invalid[:nbinvalid - nbinvalid_skipped]
- skip_footer -= nbinvalid_skipped
- #
- # nbrows -= skip_footer
- # errmsg = [template % (i, nb)
- # for (i, nb) in invalid if i < nbrows]
- # else:
- errmsg = [template % (i, nb)
- for (i, nb) in invalid]
- if len(errmsg):
- errmsg.insert(0, "Some errors were detected!")
- errmsg = "\n".join(errmsg)
- # Raise an exception?
- if invalid_raise:
- raise ValueError(errmsg)
- # Issue a warning?
- else:
- warnings.warn(errmsg, ConversionWarning, stacklevel=2)
- # Strip the last skip_footer data
- if skip_footer > 0:
- rows = rows[:-skip_footer]
- if usemask:
- masks = masks[:-skip_footer]
- # Convert each value according to the converter:
- # We want to modify the list in place to avoid creating a new one...
- if loose:
- rows = list(
- zip(*[[conv._loose_call(_r) for _r in map(itemgetter(i), rows)]
- for (i, conv) in enumerate(converters)]))
- else:
- rows = list(
- zip(*[[conv._strict_call(_r) for _r in map(itemgetter(i), rows)]
- for (i, conv) in enumerate(converters)]))
- # Reset the dtype
- data = rows
- if dtype is None:
- # Get the dtypes from the types of the converters
- column_types = [conv.type for conv in converters]
- # Find the columns with strings...
- strcolidx = [i for (i, v) in enumerate(column_types)
- if v == np.str_]
- if byte_converters and strcolidx:
- # convert strings back to bytes for backward compatibility
- warnings.warn(
- "Reading unicode strings without specifying the encoding "
- "argument is deprecated. Set the encoding, use None for the "
- "system default.",
- np.VisibleDeprecationWarning, stacklevel=2)
- def encode_unicode_cols(row_tup):
- row = list(row_tup)
- for i in strcolidx:
- row[i] = row[i].encode('latin1')
- return tuple(row)
- try:
- data = [encode_unicode_cols(r) for r in data]
- except UnicodeEncodeError:
- pass
- else:
- for i in strcolidx:
- column_types[i] = np.bytes_
- # Update string types to be the right length
- sized_column_types = column_types[:]
- for i, col_type in enumerate(column_types):
- if np.issubdtype(col_type, np.character):
- n_chars = max(len(row[i]) for row in data)
- sized_column_types[i] = (col_type, n_chars)
- if names is None:
- # If the dtype is uniform (before sizing strings)
- base = {
- c_type
- for c, c_type in zip(converters, column_types)
- if c._checked}
- if len(base) == 1:
- uniform_type, = base
- (ddtype, mdtype) = (uniform_type, bool)
- else:
- ddtype = [(defaultfmt % i, dt)
- for (i, dt) in enumerate(sized_column_types)]
- if usemask:
- mdtype = [(defaultfmt % i, bool)
- for (i, dt) in enumerate(sized_column_types)]
- else:
- ddtype = list(zip(names, sized_column_types))
- mdtype = list(zip(names, [bool] * len(sized_column_types)))
- output = np.array(data, dtype=ddtype)
- if usemask:
- outputmask = np.array(masks, dtype=mdtype)
- else:
- # Overwrite the initial dtype names if needed
- if names and dtype.names is not None:
- dtype.names = names
- # Case 1. We have a structured type
- if len(dtype_flat) > 1:
- # Nested dtype, eg [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
- # First, create the array using a flattened dtype:
- # [('a', int), ('b1', int), ('b2', float)]
- # Then, view the array using the specified dtype.
- if 'O' in (_.char for _ in dtype_flat):
- if has_nested_fields(dtype):
- raise NotImplementedError(
- "Nested fields involving objects are not supported...")
- else:
- output = np.array(data, dtype=dtype)
- else:
- rows = np.array(data, dtype=[('', _) for _ in dtype_flat])
- output = rows.view(dtype)
- # Now, process the rowmasks the same way
- if usemask:
- rowmasks = np.array(
- masks, dtype=np.dtype([('', bool) for t in dtype_flat]))
- # Construct the new dtype
- mdtype = make_mask_descr(dtype)
- outputmask = rowmasks.view(mdtype)
- # Case #2. We have a basic dtype
- else:
- # We used some user-defined converters
- if user_converters:
- ishomogeneous = True
- descr = []
- for i, ttype in enumerate([conv.type for conv in converters]):
- # Keep the dtype of the current converter
- if i in user_converters:
- ishomogeneous &= (ttype == dtype.type)
- if np.issubdtype(ttype, np.character):
- ttype = (ttype, max(len(row[i]) for row in data))
- descr.append(('', ttype))
- else:
- descr.append(('', dtype))
- # So, did we change the dtype?
- if not ishomogeneous:
- # We have more than one field
- if len(descr) > 1:
- dtype = np.dtype(descr)
- # We have only one field: drop the name if not needed.
- else:
- dtype = np.dtype(ttype)
- #
- output = np.array(data, dtype)
- if usemask:
- if dtype.names is not None:
- mdtype = [(_, bool) for _ in dtype.names]
- else:
- mdtype = bool
- outputmask = np.array(masks, dtype=mdtype)
- # Try to take care of the missing data we missed
- names = output.dtype.names
- if usemask and names:
- for (name, conv) in zip(names, converters):
- missing_values = [conv(_) for _ in conv.missing_values
- if _ != '']
- for mval in missing_values:
- outputmask[name] |= (output[name] == mval)
- # Construct the final array
- if usemask:
- output = output.view(MaskedArray)
- output._mask = outputmask
- output = _ensure_ndmin_ndarray(output, ndmin=ndmin)
- if unpack:
- if names is None:
- return output.T
- elif len(names) == 1:
- # squeeze single-name dtypes too
- return output[names[0]]
- else:
- # For structured arrays with multiple fields,
- # return an array for each field.
- return [output[field] for field in names]
- return output
- _genfromtxt_with_like = array_function_dispatch()(genfromtxt)
- def recfromtxt(fname, **kwargs):
- """
- Load ASCII data from a file and return it in a record array.
- If ``usemask=False`` a standard `recarray` is returned;
- if ``usemask=True`` a MaskedRecords array is returned.
- Parameters
- ----------
- fname, kwargs : For a description of input parameters, see `genfromtxt`.
- See Also
- --------
- numpy.genfromtxt : generic function
- Notes
- -----
- By default, `dtype` is None, which means that the data-type of the output
- array will be determined from the data.
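- Examples
- --------
- A minimal sketch (output is representative; ``encoding=None`` avoids the
- legacy 'bytes' compatibility mode, and field names default to ``f0``,
- ``f1``, ...):
- >>> from io import StringIO
- >>> rec = np.recfromtxt(StringIO(u"a 1\nb 2"), encoding=None)
- >>> rec.f1
- array([1, 2])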
- """
- kwargs.setdefault("dtype", None)
- usemask = kwargs.get('usemask', False)
- output = genfromtxt(fname, **kwargs)
- if usemask:
- from numpy.ma.mrecords import MaskedRecords
- output = output.view(MaskedRecords)
- else:
- output = output.view(np.recarray)
- return output
- def recfromcsv(fname, **kwargs):
- """
- Load ASCII data stored in a comma-separated file.
- The returned array is a record array (if ``usemask=False``, see
- `recarray`) or a masked record array (if ``usemask=True``,
- see `ma.mrecords.MaskedRecords`).
- Parameters
- ----------
- fname, kwargs : For a description of input parameters, see `genfromtxt`.
- See Also
- --------
- numpy.genfromtxt : generic function to load ASCII data.
- Notes
- -----
- By default, `dtype` is None, which means that the data-type of the output
- array will be determined from the data.
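- Examples
- --------
- A minimal sketch (output is representative; the first line supplies the
- field names, lower-cased by default, and ``encoding=None`` avoids the
- legacy 'bytes' compatibility mode):
- >>> from io import StringIO
- >>> rec = np.recfromcsv(StringIO(u"Name,Value\na,1\nb,2"), encoding=None)
- >>> rec.value
- array([1, 2])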
- """
- # Set default kwargs for genfromtxt as relevant to csv import.
- kwargs.setdefault("case_sensitive", "lower")
- kwargs.setdefault("names", True)
- kwargs.setdefault("delimiter", ",")
- kwargs.setdefault("dtype", None)
- output = genfromtxt(fname, **kwargs)
- usemask = kwargs.get("usemask", False)
- if usemask:
- from numpy.ma.mrecords import MaskedRecords
- output = output.view(MaskedRecords)
- else:
- output = output.view(np.recarray)
- return output
|