12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768 |
- import cython
- from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- PyDateTime_IMPORT,
- datetime,
- tzinfo,
- )
- # import datetime C API
- PyDateTime_IMPORT
- cimport numpy as cnp
- from numpy cimport (
- float64_t,
- int64_t,
- ndarray,
- )
- import numpy as np
- cnp.import_array()
- import pytz
- from pandas._libs.tslibs.np_datetime cimport (
- _string_to_dts,
- check_dts_bounds,
- dt64_to_dtstruct,
- dtstruct_to_dt64,
- get_datetime64_value,
- npy_datetimestruct,
- pydate_to_dt64,
- pydatetime_to_dt64,
- )
- from pandas._libs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
- )
- from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
- from pandas._libs.tslibs.parsing import parse_datetime_string
- from pandas._libs.tslibs.conversion cimport (
- _TSObject,
- cast_from_unit,
- convert_datetime_to_tsobject,
- get_datetime64_nanos,
- precision_from_unit,
- )
- from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
- )
- from pandas._libs.tslibs.timestamps cimport _Timestamp
- from pandas._libs.tslibs.timestamps import Timestamp
- # Note: this is the only non-tslibs intra-pandas dependency here
- from pandas._libs.missing cimport checknull_with_nat_and_na
- from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
def _test_parse_iso8601(ts: str):
    """
    TESTING ONLY: build a Timestamp from a string via the iso8601 parser.

    Real Timestamp construction goes through `convert_str_to_tsobject`;
    this entry point exists so tests can exercise the parser directly.

    Parameters
    ----------
    ts : str
        Either the keyword 'now' / 'today' or a string handed to
        `_string_to_dts`.

    Returns
    -------
    Timestamp
        tz-aware (pytz.FixedOffset) when the parser reports a local
        offset, otherwise naive.
    """
    cdef:
        _TSObject parsed
        int has_offset = 0, offset_minutes = 0

    parsed = _TSObject()

    # The two keywords are answered directly and never reach the parser.
    if ts == 'now':
        return Timestamp.utcnow()
    if ts == 'today':
        return Timestamp.now().normalize()

    _string_to_dts(ts, &parsed.dts, &has_offset, &offset_minutes, True)
    parsed.value = dtstruct_to_dt64(&parsed.dts)
    check_dts_bounds(&parsed.dts)

    if has_offset != 1:
        return Timestamp(parsed.value)

    # The parsed fields are wall-clock time at the given offset; shift the
    # i8 value to UTC before attaching the fixed-offset tz.
    parsed.tzinfo = pytz.FixedOffset(offset_minutes)
    parsed.value = tz_localize_to_utc_single(parsed.value, parsed.tzinfo)
    return Timestamp(parsed.value, tz=parsed.tzinfo)
@cython.wraparound(False)
@cython.boundscheck(False)
def format_array_from_datetime(
    ndarray[int64_t] values,
    tzinfo tz=None,
    str format=None,
    object na_rep=None
) -> np.ndarray:
    """
    Return a np object array of the string formatted values.

    When neither ``format`` nor ``tz`` is given, a fixed
    ``YYYY-MM-DD HH:MM:SS`` layout is used and the fractional-second
    suffix (none, .mmm, .uuuuuu or .nnnnnnnnn) is chosen once for the whole
    array from the finest non-zero precision among the non-NaT values.
    Otherwise each value is wrapped in a Timestamp and rendered with
    ``str`` or ``strftime``.

    Parameters
    ----------
    values : a 1-d i8 array
    tz : tzinfo or None, default None
    format : str or None, default None
          a strftime capable string
    na_rep : optional, default is None
          a nat format; 'NaT' is used when None

    Returns
    -------
    np.ndarray[object]
    """
    cdef:
        Py_ssize_t i
        int64_t val, ns, N = len(values)
        ndarray[int64_t] consider_values
        bint show_ms = False, show_us = False, show_ns = False
        bint basic_format = False
        ndarray[object] result = np.empty(N, dtype=object)
        object ts, res
        npy_datetimestruct dts

    if na_rep is None:
        na_rep = 'NaT'

    # if we don't have a format nor tz, then choose
    # a format based on precision
    basic_format = format is None and tz is None
    if basic_format:
        # boolean indexing yields a fresh array, so the in-place floor
        # divisions below never touch ``values``
        consider_values = values[values != NPY_NAT]
        show_ns = (consider_values % 1000).any()

        if not show_ns:
            consider_values //= 1000
            show_us = (consider_values % 1000).any()

            # only look for milliseconds once microseconds are ruled out;
            # a coarser flag is never consulted when a finer one is set
            if not show_us:
                consider_values //= 1000
                show_ms = (consider_values % 1000).any()

    for i in range(N):
        val = values[i]

        if val == NPY_NAT:
            result[i] = na_rep
        elif basic_format:

            dt64_to_dtstruct(val, &dts)
            res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} '
                   f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}')

            if show_ns:
                ns = dts.ps // 1000
                res += f'.{ns + dts.us * 1000:09d}'
            elif show_us:
                res += f'.{dts.us:06d}'
            elif show_ms:
                res += f'.{dts.us // 1000:03d}'

            result[i] = res

        else:

            ts = Timestamp(val, tz=tz)
            if format is None:
                result[i] = str(ts)
            else:

                # invalid format string
                # requires dates > 1900
                try:
                    result[i] = ts.strftime(format)
                except ValueError:
                    # fall back to the default rendering rather than fail
                    result[i] = str(ts)

    return result
def array_with_unit_to_datetime(
    ndarray values,
    str unit,
    str errors="coerce"
):
    """
    Convert the ndarray to datetime according to the time unit.

    This function converts an array of objects into a numpy array of
    datetime64[ns]. It returns the converted array together with a timezone,
    which is None except when ``unit == "ns"`` and the values are routed
    through ``array_to_datetime``.

    if errors:
      - raise: return converted values or raise OutOfBoundsDatetime
          if out of range on the conversion or
          ValueError for other conversions (e.g. a string)
      - ignore: return non-convertible values as the same unit
      - coerce: NaT for non-convertibles

    Parameters
    ----------
    values : ndarray
         Date-like objects to convert. Never modified in place.
    unit : str
         Time unit to use during conversion.
    errors : str, default 'coerce'
         Error behavior when parsing; one of 'raise', 'ignore', 'coerce'.

    Returns
    -------
    result : ndarray of m8 values, or of object dtype when errors='ignore'
        and at least one value could not be converted
    tz : parsed timezone offset or None

    Raises
    ------
    OutOfBoundsDatetime
        errors='raise' and a value does not fit in datetime64[ns].
    ValueError
        errors='raise' and a value is not numeric or a numeric string.
    """
    cdef:
        Py_ssize_t i, n=len(values)
        int64_t m
        ndarray[float64_t] fvalues
        bint is_ignore = errors=='ignore'
        bint is_coerce = errors=='coerce'
        bint is_raise = errors=='raise'
        bint need_to_iterate = True
        ndarray[int64_t] iresult
        ndarray[object] oresult
        ndarray mask
        object tz = None

    assert is_ignore or is_coerce or is_raise

    if unit == "ns":
        # np.float64 is the type formerly spelled np.float_
        if issubclass(values.dtype.type, (np.integer, np.float64)):
            result = values.astype("M8[ns]", copy=False)
        else:
            result, tz = array_to_datetime(
                values.astype(object, copy=False),
                errors=errors,
            )
        return result, tz

    # only the multiplier is needed here
    m, _ = precision_from_unit(unit)

    if is_raise:
        # try a quick conversion to i8/f8
        # if we have nulls that are not type-compat
        # then need to iterate

        if values.dtype.kind == "i" or values.dtype.kind == "f":
            iresult = values.astype("i8", copy=False)
            # fill missing values by comparing to NPY_NAT
            mask = iresult == NPY_NAT
            if mask.any():
                # astype(copy=False) hands back the caller's own buffer when
                # it is already int64, so zero the sentinels on a private
                # copy instead of overwriting the input
                iresult = iresult.copy()
                iresult[mask] = 0
            fvalues = iresult.astype("f8") * m
            need_to_iterate = False

        if not need_to_iterate:
            # check the bounds
            if (fvalues < Timestamp.min.value).any() or (
                (fvalues > Timestamp.max.value).any()
            ):
                raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

            if values.dtype.kind == "i":
                result = (iresult * m).astype("M8[ns]")

            elif values.dtype.kind == "f":
                fresult = (values * m).astype("f8")
                fresult[mask] = 0
                result = fresult.astype("M8[ns]", copy=False)

            # put the missing-value sentinel back into the converted output
            iresult = result.view("i8")
            iresult[mask] = NPY_NAT

            return result, tz

    result = np.empty(n, dtype='M8[ns]')
    iresult = result.view('i8')

    # AssertionError is used as an internal signal: in 'ignore' mode the
    # first non-convertible value aborts this pass and triggers the
    # object-dtype pass below.
    try:
        for i in range(n):
            val = values[i]

            if checknull_with_nat_and_na(val):
                iresult[i] = NPY_NAT

            elif is_integer_object(val) or is_float_object(val):

                if val != val or val == NPY_NAT:
                    iresult[i] = NPY_NAT
                else:
                    try:
                        iresult[i] = cast_from_unit(val, unit)
                    except OverflowError:
                        if is_raise:
                            raise OutOfBoundsDatetime(
                                f"cannot convert input {val} with the unit '{unit}'"
                            )
                        elif is_ignore:
                            raise AssertionError
                        iresult[i] = NPY_NAT

            elif isinstance(val, str):
                if len(val) == 0 or val in nat_strings:
                    iresult[i] = NPY_NAT

                else:
                    try:
                        iresult[i] = cast_from_unit(float(val), unit)
                    except ValueError:
                        if is_raise:
                            raise ValueError(
                                f"non convertible value {val} with the unit '{unit}'"
                            )
                        elif is_ignore:
                            raise AssertionError
                        iresult[i] = NPY_NAT
                    except OverflowError:
                        if is_raise:
                            raise OutOfBoundsDatetime(
                                f"cannot convert input {val} with the unit '{unit}'"
                            )
                        elif is_ignore:
                            raise AssertionError
                        iresult[i] = NPY_NAT

            else:

                if is_raise:
                    raise ValueError(
                        f"unit='{unit}' not valid with non-numerical val='{val}'"
                    )
                if is_ignore:
                    raise AssertionError

                iresult[i] = NPY_NAT

        return result, tz

    except AssertionError:
        pass

    # we have hit an exception
    # and are in ignore mode
    # redo as object

    oresult = np.empty(n, dtype=object)
    for i in range(n):
        val = values[i]

        if checknull_with_nat_and_na(val):
            oresult[i] = <object>NaT
        elif is_integer_object(val) or is_float_object(val):

            if val != val or val == NPY_NAT:
                oresult[i] = <object>NaT
            else:
                try:
                    oresult[i] = Timestamp(cast_from_unit(val, unit))
                except OverflowError:
                    oresult[i] = val

        elif isinstance(val, str):
            if len(val) == 0 or val in nat_strings:
                oresult[i] = <object>NaT

            else:
                oresult[i] = val

        else:
            # any other type is non-convertible: hand it back untouched
            # rather than leaving the slot as the None from np.empty
            oresult[i] = val

    return oresult, tz
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef array_to_datetime(
    ndarray[object] values,
    str errors='raise',
    bint dayfirst=False,
    bint yearfirst=False,
    bint utc=False,
    bint require_iso8601=False,
    bint allow_mixed=False,
):
    """
    Converts a 1D array of date-like values to a numpy array of either:
        1) datetime64[ns] data
        2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
           is encountered

    Also returns a pytz.FixedOffset if an array of strings with the same
    timezone offset is passed and utc=True is not passed. Otherwise, None
    is returned

    Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
    strings

    Parameters
    ----------
    values : ndarray of object
         date-like objects to convert
    errors : str, default 'raise'
         error behavior when parsing; one of 'raise', 'ignore', 'coerce'
    dayfirst : bool, default False
         dayfirst parsing behavior when encountering datetime strings
    yearfirst : bool, default False
         yearfirst parsing behavior when encountering datetime strings
    utc : bool, default False
         indicator whether the dates should be UTC
    require_iso8601 : bool, default False
         indicator whether the datetime string should be iso8601
    allow_mixed : bool, default False
        Whether to allow mixed datetimes and integers.

    Returns
    -------
    np.ndarray
        May be datetime64[ns] or object dtype
    tzinfo or None

    Raises
    ------
    ValueError
        errors='raise' and a tz-aware datetime is passed without utc=True,
        a string fails the iso8601 requirement, or datetimes and integers
        are mixed without allow_mixed.
    OutOfBoundsDatetime
        errors='raise' and a value does not fit in datetime64[ns].
    """
    cdef:
        Py_ssize_t i, n = len(values)
        object val, py_dt, tz, tz_out = None
        ndarray[int64_t] iresult
        npy_datetimestruct dts
        bint utc_convert = bool(utc)
        bint seen_integer = False
        bint seen_datetime = False
        bint seen_datetime_offset = False
        bint is_raise = errors=='raise'
        bint is_ignore = errors=='ignore'
        bint is_coerce = errors=='coerce'
        bint is_same_offsets
        _TSObject _ts
        int64_t value
        int out_local = 0, out_tzoffset = 0
        float tz_offset
        set out_tzoffset_vals = set()
        bint string_to_dts_failed

    # specify error conditions
    assert is_raise or is_ignore or is_coerce

    # ``iresult`` is an i8 view onto ``result``: writes through either name
    # land in the same buffer
    result = np.empty(n, dtype='M8[ns]')
    iresult = result.view('i8')

    try:
        for i in range(n):
            val = values[i]

            try:
                if checknull_with_nat_and_na(val):
                    iresult[i] = NPY_NAT

                elif PyDateTime_Check(val):
                    seen_datetime = True
                    if val.tzinfo is not None:
                        if utc_convert:
                            _ts = convert_datetime_to_tsobject(val, None)
                            iresult[i] = _ts.value
                        else:
                            raise ValueError('Tz-aware datetime.datetime '
                                             'cannot be converted to '
                                             'datetime64 unless utc=True')
                    elif isinstance(val, _Timestamp):
                        iresult[i] = val.value
                    else:
                        iresult[i] = pydatetime_to_dt64(val, &dts)
                        check_dts_bounds(&dts)

                elif PyDate_Check(val):
                    seen_datetime = True
                    iresult[i] = pydate_to_dt64(val, &dts)
                    check_dts_bounds(&dts)

                elif is_datetime64_object(val):
                    seen_datetime = True
                    iresult[i] = get_datetime64_nanos(val)

                elif is_integer_object(val) or is_float_object(val):
                    # these must be ns unit by-definition
                    seen_integer = True

                    if val != val or val == NPY_NAT:
                        iresult[i] = NPY_NAT
                    elif is_raise or is_ignore:
                        iresult[i] = val
                    else:
                        # coerce
                        # we now need to parse this as if unit='ns'
                        # we can ONLY accept integers at this point
                        # if we have previously (or in future accept
                        # datetimes/strings, then we must coerce)
                        try:
                            iresult[i] = cast_from_unit(val, 'ns')
                        except OverflowError:
                            iresult[i] = NPY_NAT

                elif isinstance(val, str):
                    # string
                    if len(val) == 0 or val in nat_strings:
                        iresult[i] = NPY_NAT
                        continue

                    string_to_dts_failed = _string_to_dts(
                        val, &dts, &out_local,
                        &out_tzoffset, False
                    )
                    if string_to_dts_failed:
                        # An error at this point is a _parsing_ error
                        # specifically _not_ OutOfBoundsDatetime
                        if _parse_today_now(val, &iresult[i]):
                            continue
                        elif require_iso8601:
                            # if requiring iso8601 strings, skip trying
                            # other formats
                            if is_coerce:
                                iresult[i] = NPY_NAT
                                continue
                            elif is_raise:
                                raise ValueError(
                                    f"time data {val} doesn't match format specified"
                                )
                            # ignore: hand the input back unconverted
                            return values, tz_out

                        try:
                            py_dt = parse_datetime_string(val,
                                                          dayfirst=dayfirst,
                                                          yearfirst=yearfirst)
                            # If the dateutil parser returned tzinfo, capture it
                            # to check if all arguments have the same tzinfo
                            tz = py_dt.utcoffset()

                        except (ValueError, OverflowError):
                            if is_coerce:
                                iresult[i] = NPY_NAT
                                continue
                            # TypeError is caught by the outer handler, which
                            # reroutes the whole array to the object fallback
                            raise TypeError("invalid string coercion to datetime")

                        if tz is not None:
                            seen_datetime_offset = True
                            # dateutil timezone objects cannot be hashed, so
                            # store the UTC offsets in seconds instead
                            out_tzoffset_vals.add(tz.total_seconds())
                        else:
                            # Add a marker for naive string, to track if we are
                            # parsing mixed naive and aware strings
                            out_tzoffset_vals.add('naive')

                        _ts = convert_datetime_to_tsobject(py_dt, None)
                        iresult[i] = _ts.value
                    else:
                        # No error reported by string_to_dts, pick back up
                        # where we left off
                        value = dtstruct_to_dt64(&dts)
                        if out_local == 1:
                            seen_datetime_offset = True
                            # Store the out_tzoffset in seconds
                            # since we store the total_seconds of
                            # dateutil.tz.tzoffset objects
                            out_tzoffset_vals.add(out_tzoffset * 60.)
                            tz = pytz.FixedOffset(out_tzoffset)
                            value = tz_localize_to_utc_single(value, tz)
                            # reset the out-parameters before the next string
                            out_local = 0
                            out_tzoffset = 0
                        else:
                            # Add a marker for naive string, to track if we are
                            # parsing mixed naive and aware strings
                            out_tzoffset_vals.add('naive')
                        iresult[i] = value
                        check_dts_bounds(&dts)

                else:
                    if is_coerce:
                        iresult[i] = NPY_NAT
                    else:
                        raise TypeError(f"{type(val)} is not convertible to datetime")

            except OutOfBoundsDatetime:
                if is_coerce:
                    iresult[i] = NPY_NAT
                    continue
                elif require_iso8601 and isinstance(val, str):
                    # GH#19382 for just-barely-OutOfBounds falling back to
                    # dateutil parser will return incorrect result because
                    # it will ignore nanoseconds
                    if is_raise:

                        # Still raise OutOfBoundsDatetime,
                        # as error message is informative.
                        raise

                    assert is_ignore
                    return values, tz_out
                raise

    except OutOfBoundsDatetime:
        if is_raise:
            raise

        return ignore_errors_out_of_bounds_fallback(values), tz_out

    except TypeError:
        return _array_to_datetime_object(values, errors, dayfirst, yearfirst)

    if seen_datetime and seen_integer:
        # we have mixed datetimes & integers

        if is_coerce:
            # coerce all of the integers/floats to NaT, preserve
            # the datetimes and other convertibles
            for i in range(n):
                val = values[i]
                if is_integer_object(val) or is_float_object(val):
                    iresult[i] = NPY_NAT
        elif allow_mixed:
            pass
        elif is_raise:
            raise ValueError("mixed datetimes and integers in passed array")
        else:
            return _array_to_datetime_object(values, errors, dayfirst, yearfirst)

    if seen_datetime_offset and not utc_convert:
        # GH#17697
        # 1) If all the offsets are equal, return one offset for
        #    the parsed dates to (maybe) pass to DatetimeIndex
        # 2) If the offsets are different, then force the parsing down the
        #    object path where an array of datetimes
        #    (with individual dateutil.tzoffsets) are returned
        is_same_offsets = len(out_tzoffset_vals) == 1
        if not is_same_offsets:
            return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
        else:
            # offsets were collected in seconds; FixedOffset takes minutes
            tz_offset = out_tzoffset_vals.pop()
            tz_out = pytz.FixedOffset(tz_offset / 60.)
    return result, tz_out
cdef ndarray[object] ignore_errors_out_of_bounds_fallback(ndarray[object] values):
    """
    Fallback for array_to_datetime if an OutOfBoundsDatetime is raised
    and errors == "ignore"

    Builds an object array of the same length in which null-likes are
    normalised (float nulls to np.nan, all others to NaT), np.datetime64
    scalars are unwrapped with ``.item()`` (NaT for the missing sentinel),
    and every other value is carried over unchanged.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    ndarray[object]
    """
    cdef:
        Py_ssize_t idx, count = len(values)
        object item

    oresult = np.empty(count, dtype=object)

    for idx in range(count):
        item = values[idx]

        if checknull_with_nat_and_na(item):
            # set as nan except if its a NaT
            oresult[idx] = np.nan if isinstance(item, float) else NaT
        elif not is_datetime64_object(item):
            oresult[idx] = item
        elif get_datetime64_value(item) == NPY_NAT:
            oresult[idx] = NaT
        else:
            oresult[idx] = item.item()

    return oresult
@cython.wraparound(False)
@cython.boundscheck(False)
cdef _array_to_datetime_object(
    ndarray[object] values,
    str errors,
    bint dayfirst=False,
    bint yearfirst=False,
):
    """
    Fall back function for array_to_datetime

    Attempts to parse datetime strings with dateutil to return an array
    of datetime objects

    Parameters
    ----------
    values : ndarray[object]
         date-like objects to convert
    errors : str
         error behavior when parsing; one of 'raise', 'ignore', 'coerce'
    dayfirst : bool, default False
         dayfirst parsing behavior when encountering datetime strings
    yearfirst : bool, default False
         yearfirst parsing behavior when encountering datetime strings

    Returns
    -------
    np.ndarray[object]
        The parsed objects, or ``values`` itself when a value cannot be
        handled and errors is not 'raise'.
    Literal[None]

    Raises
    ------
    ValueError, OverflowError
        errors='raise' and a string cannot be parsed or is out of bounds.
    TypeError
        errors='raise' and a value is neither null-like, a datetime, nor a
        string. If the caller is already handling an exception, that
        exception is re-raised instead.
    """
    import sys

    cdef:
        Py_ssize_t i, n = len(values)
        object val
        bint is_ignore = errors == 'ignore'
        bint is_coerce = errors == 'coerce'
        bint is_raise = errors == 'raise'
        ndarray[object] oresult
        npy_datetimestruct dts

    assert is_raise or is_ignore or is_coerce

    oresult = np.empty(n, dtype=object)

    # We return an object array and only attempt to parse:
    # 1) NaT or NaT-like values
    # 2) datetime strings, which we return as datetime.datetime
    for i in range(n):
        val = values[i]
        if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
            # GH 25978. No need to parse NaT-like or datetime-like vals
            oresult[i] = val
        elif isinstance(val, str):
            if len(val) == 0 or val in nat_strings:
                oresult[i] = 'NaT'
                continue
            try:
                oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
                                                   yearfirst=yearfirst)
                # run the conversion only to populate ``dts`` for the
                # bounds check; the stored object stays a datetime
                pydatetime_to_dt64(oresult[i], &dts)
                check_dts_bounds(&dts)
            except (ValueError, OverflowError):
                if is_coerce:
                    oresult[i] = <object>NaT
                    continue
                if is_raise:
                    raise
                return values, None
        else:
            if is_raise:
                if sys.exc_info()[0] is not None:
                    # invoked from an ``except`` block: propagate the error
                    # the caller is currently handling
                    raise
                # no exception in flight, so a bare ``raise`` would fail
                # with RuntimeError; report the offending value instead
                raise TypeError(f"{type(val)} is not convertible to datetime")
            return values, None
    return oresult, None
cdef inline bint _parse_today_now(str val, int64_t* iresult):
    """
    Resolve the keywords 'now' and 'today' to an i8 timestamp value.

    Writes the value through ``iresult`` and returns True when ``val`` is one
    of the two keywords; otherwise leaves ``iresult`` untouched and returns
    False.
    """
    # We delay this check for as long as possible
    # because it catches relatively rare cases
    if val == 'now':
        # Note: this is *not* the same as Timestamp('now')
        iresult[0] = Timestamp.utcnow().value
    elif val == 'today':
        iresult[0] = Timestamp.today().value
    else:
        return False
    return True
|