missing.pyx 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. from decimal import Decimal
  2. import numbers
  3. from sys import maxsize
  4. import cython
  5. from cython import Py_ssize_t
  6. import numpy as np
  7. cimport numpy as cnp
  8. from numpy cimport (
  9. float64_t,
  10. int64_t,
  11. ndarray,
  12. uint8_t,
  13. )
  14. cnp.import_array()
  15. from pandas._libs cimport util
  16. from pandas._libs.tslibs.nattype cimport (
  17. c_NaT as NaT,
  18. checknull_with_nat,
  19. is_null_datetimelike,
  20. )
  21. from pandas._libs.tslibs.np_datetime cimport (
  22. get_datetime64_value,
  23. get_timedelta64_value,
  24. )
  25. from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
cdef:
    # Positive infinity as a C double; the *_old and is*inf_scalar helpers
    # compare values against it.
    float64_t INF = <float64_t>np.inf
    # Negative infinity, derived from INF.
    float64_t NEGINF = -INF

    # Integer value returned by util.get_nat(); datetime64/timedelta64
    # payloads are compared against it to detect NaT.
    int64_t NPY_NAT = util.get_nat()

    # True when sys.maxsize is at most 2 ** 32; read by NAType.__hash__ to
    # choose the size of the hash value.
    bint is_32bit = maxsize <= 2 ** 32

    type cDecimal = Decimal  # for faster isinstance checks
  32. cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
  33. """
  34. Check if two scalars are both NA of matching types.
  35. Parameters
  36. ----------
  37. left : Any
  38. right : Any
  39. nan_matches_none : bool, default False
  40. For backwards compatibility, consider NaN as matching None.
  41. Returns
  42. -------
  43. bool
  44. """
  45. if left is None:
  46. if nan_matches_none and util.is_nan(right):
  47. return True
  48. return right is None
  49. elif left is C_NA:
  50. return right is C_NA
  51. elif left is NaT:
  52. return right is NaT
  53. elif util.is_float_object(left):
  54. if nan_matches_none and right is None:
  55. return True
  56. return (
  57. util.is_nan(left)
  58. and util.is_float_object(right)
  59. and util.is_nan(right)
  60. )
  61. elif util.is_complex_object(left):
  62. return (
  63. util.is_nan(left)
  64. and util.is_complex_object(right)
  65. and util.is_nan(right)
  66. )
  67. elif util.is_datetime64_object(left):
  68. return (
  69. get_datetime64_value(left) == NPY_NAT
  70. and util.is_datetime64_object(right)
  71. and get_datetime64_value(right) == NPY_NAT
  72. )
  73. elif util.is_timedelta64_object(left):
  74. return (
  75. get_timedelta64_value(left) == NPY_NAT
  76. and util.is_timedelta64_object(right)
  77. and get_timedelta64_value(right) == NPY_NAT
  78. )
  79. elif is_decimal_na(left):
  80. return is_decimal_na(right)
  81. return False
  82. cpdef bint checknull(object val):
  83. """
  84. Return boolean describing of the input is NA-like, defined here as any
  85. of:
  86. - None
  87. - nan
  88. - NaT
  89. - np.datetime64 representation of NaT
  90. - np.timedelta64 representation of NaT
  91. - NA
  92. - Decimal("NaN")
  93. Parameters
  94. ----------
  95. val : object
  96. Returns
  97. -------
  98. bool
  99. Notes
  100. -----
  101. The difference between `checknull` and `checknull_old` is that `checknull`
  102. does *not* consider INF or NEGINF to be NA.
  103. """
  104. return (
  105. val is C_NA
  106. or is_null_datetimelike(val, inat_is_null=False)
  107. or is_decimal_na(val)
  108. )
  109. cdef inline bint is_decimal_na(object val):
  110. """
  111. Is this a decimal.Decimal object Decimal("NAN").
  112. """
  113. return isinstance(val, cDecimal) and val != val
  114. cpdef bint checknull_old(object val):
  115. """
  116. Return boolean describing of the input is NA-like, defined here as any
  117. of:
  118. - None
  119. - nan
  120. - INF
  121. - NEGINF
  122. - NaT
  123. - np.datetime64 representation of NaT
  124. - np.timedelta64 representation of NaT
  125. - NA
  126. - Decimal("NaN")
  127. Parameters
  128. ----------
  129. val : object
  130. Returns
  131. -------
  132. result : bool
  133. Notes
  134. -----
  135. The difference between `checknull` and `checknull_old` is that `checknull`
  136. does *not* consider INF or NEGINF to be NA.
  137. """
  138. if checknull(val):
  139. return True
  140. elif util.is_float_object(val) or util.is_complex_object(val):
  141. return val == INF or val == NEGINF
  142. return False
  143. @cython.wraparound(False)
  144. @cython.boundscheck(False)
  145. cpdef ndarray[uint8_t] isnaobj(ndarray arr):
  146. """
  147. Return boolean mask denoting which elements of a 1-D array are na-like,
  148. according to the criteria defined in `checknull`:
  149. - None
  150. - nan
  151. - NaT
  152. - np.datetime64 representation of NaT
  153. - np.timedelta64 representation of NaT
  154. - NA
  155. - Decimal("NaN")
  156. Parameters
  157. ----------
  158. arr : ndarray
  159. Returns
  160. -------
  161. result : ndarray (dtype=np.bool_)
  162. """
  163. cdef:
  164. Py_ssize_t i, n
  165. object val
  166. ndarray[uint8_t] result
  167. assert arr.ndim == 1, "'arr' must be 1-D."
  168. n = len(arr)
  169. result = np.empty(n, dtype=np.uint8)
  170. for i in range(n):
  171. val = arr[i]
  172. result[i] = checknull(val)
  173. return result.view(np.bool_)
  174. @cython.wraparound(False)
  175. @cython.boundscheck(False)
  176. def isnaobj_old(arr: ndarray) -> ndarray:
  177. """
  178. Return boolean mask denoting which elements of a 1-D array are na-like,
  179. defined as being any of:
  180. - None
  181. - nan
  182. - INF
  183. - NEGINF
  184. - NaT
  185. - NA
  186. - Decimal("NaN")
  187. Parameters
  188. ----------
  189. arr : ndarray
  190. Returns
  191. -------
  192. result : ndarray (dtype=np.bool_)
  193. """
  194. cdef:
  195. Py_ssize_t i, n
  196. object val
  197. ndarray[uint8_t] result
  198. assert arr.ndim == 1, "'arr' must be 1-D."
  199. n = len(arr)
  200. result = np.zeros(n, dtype=np.uint8)
  201. for i in range(n):
  202. val = arr[i]
  203. result[i] = (
  204. checknull(val)
  205. or util.is_float_object(val) and (val == INF or val == NEGINF)
  206. )
  207. return result.view(np.bool_)
  208. @cython.wraparound(False)
  209. @cython.boundscheck(False)
  210. def isnaobj2d(arr: ndarray) -> ndarray:
  211. """
  212. Return boolean mask denoting which elements of a 2-D array are na-like,
  213. according to the criteria defined in `checknull`:
  214. - None
  215. - nan
  216. - NaT
  217. - np.datetime64 representation of NaT
  218. - np.timedelta64 representation of NaT
  219. - NA
  220. - Decimal("NaN")
  221. Parameters
  222. ----------
  223. arr : ndarray
  224. Returns
  225. -------
  226. result : ndarray (dtype=np.bool_)
  227. Notes
  228. -----
  229. The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d`
  230. does *not* consider INF or NEGINF to be NA.
  231. """
  232. cdef:
  233. Py_ssize_t i, j, n, m
  234. object val
  235. ndarray[uint8_t, ndim=2] result
  236. assert arr.ndim == 2, "'arr' must be 2-D."
  237. n, m = (<object>arr).shape
  238. result = np.zeros((n, m), dtype=np.uint8)
  239. for i in range(n):
  240. for j in range(m):
  241. val = arr[i, j]
  242. if checknull(val):
  243. result[i, j] = 1
  244. return result.view(np.bool_)
  245. @cython.wraparound(False)
  246. @cython.boundscheck(False)
  247. def isnaobj2d_old(arr: ndarray) -> ndarray:
  248. """
  249. Return boolean mask denoting which elements of a 2-D array are na-like,
  250. according to the criteria defined in `checknull_old`:
  251. - None
  252. - nan
  253. - INF
  254. - NEGINF
  255. - NaT
  256. - np.datetime64 representation of NaT
  257. - np.timedelta64 representation of NaT
  258. - NA
  259. - Decimal("NaN")
  260. Parameters
  261. ----------
  262. arr : ndarray
  263. Returns
  264. -------
  265. ndarray (dtype=np.bool_)
  266. Notes
  267. -----
  268. The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d`
  269. does *not* consider INF or NEGINF to be NA.
  270. """
  271. cdef:
  272. Py_ssize_t i, j, n, m
  273. object val
  274. ndarray[uint8_t, ndim=2] result
  275. assert arr.ndim == 2, "'arr' must be 2-D."
  276. n, m = (<object>arr).shape
  277. result = np.zeros((n, m), dtype=np.uint8)
  278. for i in range(n):
  279. for j in range(m):
  280. val = arr[i, j]
  281. if checknull_old(val):
  282. result[i, j] = 1
  283. return result.view(np.bool_)
  284. def isposinf_scalar(val: object) -> bool:
  285. return util.is_float_object(val) and val == INF
  286. def isneginf_scalar(val: object) -> bool:
  287. return util.is_float_object(val) and val == NEGINF
  288. cdef inline bint is_null_datetime64(v):
  289. # determine if we have a null for a datetime (or integer versions),
  290. # excluding np.timedelta64('nat')
  291. if checknull_with_nat(v):
  292. return True
  293. elif util.is_datetime64_object(v):
  294. return get_datetime64_value(v) == NPY_NAT
  295. return False
  296. cdef inline bint is_null_timedelta64(v):
  297. # determine if we have a null for a timedelta (or integer versions),
  298. # excluding np.datetime64('nat')
  299. if checknull_with_nat(v):
  300. return True
  301. elif util.is_timedelta64_object(v):
  302. return get_timedelta64_value(v) == NPY_NAT
  303. return False
  304. cdef bint checknull_with_nat_and_na(object obj):
  305. # See GH#32214
  306. return checknull_with_nat(obj) or obj is C_NA
  307. # -----------------------------------------------------------------------------
  308. # Implementation of NA singleton
  309. def _create_binary_propagating_op(name, is_divmod=False):
  310. def method(self, other):
  311. if (other is C_NA or isinstance(other, str)
  312. or isinstance(other, (numbers.Number, np.bool_))
  313. or isinstance(other, np.ndarray) and not other.shape):
  314. # Need the other.shape clause to handle NumPy scalars,
  315. # since we do a setitem on `out` below, which
  316. # won't work for NumPy scalars.
  317. if is_divmod:
  318. return NA, NA
  319. else:
  320. return NA
  321. elif isinstance(other, np.ndarray):
  322. out = np.empty(other.shape, dtype=object)
  323. out[:] = NA
  324. if is_divmod:
  325. return out, out.copy()
  326. else:
  327. return out
  328. return NotImplemented
  329. method.__name__ = name
  330. return method
def _create_unary_propagating_op(name: str):
    """
    Build a unary dunder method for NA that always returns NA.

    Parameters
    ----------
    name : str
        Dunder name assigned to the generated method's ``__name__``.

    Returns
    -------
    function
        Method taking only ``self`` and returning NA.
    """
    def method(self):
        return NA

    # Label the closure with the dunder it stands in for.
    method.__name__ = name
    return method
cdef class C_NAType:
    # Empty extension type that serves as the base class of NAType below;
    # all behaviour is defined on the Python-level subclass.
    pass
  338. class NAType(C_NAType):
  339. """
  340. NA ("not available") missing value indicator.
  341. .. warning::
  342. Experimental: the behaviour of NA can still change without warning.
  343. .. versionadded:: 1.0.0
  344. The NA singleton is a missing value indicator defined by pandas. It is
  345. used in certain new extension dtypes (currently the "string" dtype).
  346. """
  347. _instance = None
  348. def __new__(cls, *args, **kwargs):
  349. if NAType._instance is None:
  350. NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
  351. return NAType._instance
  352. def __repr__(self) -> str:
  353. return "<NA>"
  354. def __format__(self, format_spec) -> str:
  355. try:
  356. return self.__repr__().__format__(format_spec)
  357. except ValueError:
  358. return self.__repr__()
  359. def __bool__(self):
  360. raise TypeError("boolean value of NA is ambiguous")
  361. def __hash__(self):
  362. # GH 30013: Ensure hash is large enough to avoid hash collisions with integers
  363. exponent = 31 if is_32bit else 61
  364. return 2 ** exponent - 1
  365. def __reduce__(self):
  366. return "NA"
  367. # Binary arithmetic and comparison ops -> propagate
  368. __add__ = _create_binary_propagating_op("__add__")
  369. __radd__ = _create_binary_propagating_op("__radd__")
  370. __sub__ = _create_binary_propagating_op("__sub__")
  371. __rsub__ = _create_binary_propagating_op("__rsub__")
  372. __mul__ = _create_binary_propagating_op("__mul__")
  373. __rmul__ = _create_binary_propagating_op("__rmul__")
  374. __matmul__ = _create_binary_propagating_op("__matmul__")
  375. __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
  376. __truediv__ = _create_binary_propagating_op("__truediv__")
  377. __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
  378. __floordiv__ = _create_binary_propagating_op("__floordiv__")
  379. __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
  380. __mod__ = _create_binary_propagating_op("__mod__")
  381. __rmod__ = _create_binary_propagating_op("__rmod__")
  382. __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True)
  383. __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True)
  384. # __lshift__ and __rshift__ are not implemented
  385. __eq__ = _create_binary_propagating_op("__eq__")
  386. __ne__ = _create_binary_propagating_op("__ne__")
  387. __le__ = _create_binary_propagating_op("__le__")
  388. __lt__ = _create_binary_propagating_op("__lt__")
  389. __gt__ = _create_binary_propagating_op("__gt__")
  390. __ge__ = _create_binary_propagating_op("__ge__")
  391. # Unary ops
  392. __neg__ = _create_unary_propagating_op("__neg__")
  393. __pos__ = _create_unary_propagating_op("__pos__")
  394. __abs__ = _create_unary_propagating_op("__abs__")
  395. __invert__ = _create_unary_propagating_op("__invert__")
  396. # pow has special
  397. def __pow__(self, other):
  398. if other is C_NA:
  399. return NA
  400. elif isinstance(other, (numbers.Number, np.bool_)):
  401. if other == 0:
  402. # returning positive is correct for +/- 0.
  403. return type(other)(1)
  404. else:
  405. return NA
  406. elif isinstance(other, np.ndarray):
  407. return np.where(other == 0, other.dtype.type(1), NA)
  408. return NotImplemented
  409. def __rpow__(self, other):
  410. if other is C_NA:
  411. return NA
  412. elif isinstance(other, (numbers.Number, np.bool_)):
  413. if other == 1:
  414. return other
  415. else:
  416. return NA
  417. elif isinstance(other, np.ndarray):
  418. return np.where(other == 1, other, NA)
  419. return NotImplemented
  420. # Logical ops using Kleene logic
  421. def __and__(self, other):
  422. if other is False:
  423. return False
  424. elif other is True or other is C_NA:
  425. return NA
  426. return NotImplemented
  427. __rand__ = __and__
  428. def __or__(self, other):
  429. if other is True:
  430. return True
  431. elif other is False or other is C_NA:
  432. return NA
  433. return NotImplemented
  434. __ror__ = __or__
  435. def __xor__(self, other):
  436. if other is False or other is True or other is C_NA:
  437. return NA
  438. return NotImplemented
  439. __rxor__ = __xor__
  440. __array_priority__ = 1000
  441. _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_)
  442. def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
  443. types = self._HANDLED_TYPES + (NAType,)
  444. for x in inputs:
  445. if not isinstance(x, types):
  446. return NotImplemented
  447. if method != "__call__":
  448. raise ValueError(f"ufunc method '{method}' not supported for NA")
  449. result = maybe_dispatch_ufunc_to_dunder_op(
  450. self, ufunc, method, *inputs, **kwargs
  451. )
  452. if result is NotImplemented:
  453. # For a NumPy ufunc that's not a binop, like np.logaddexp
  454. index = [i for i, x in enumerate(inputs) if x is NA][0]
  455. result = np.broadcast_arrays(*inputs)[index]
  456. if result.ndim == 0:
  457. result = result.item()
  458. if ufunc.nout > 1:
  459. result = (NA,) * ufunc.nout
  460. return result
# Create the NA singleton and bind it to two module-level names that refer
# to the same object.
C_NA = NAType()  # C-visible
NA = C_NA        # Python-visible