test_nanops.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089
  1. from functools import partial
  2. import operator
  3. import warnings
  4. import numpy as np
  5. import pytest
  6. import pandas.util._test_decorators as td
  7. from pandas.core.dtypes.common import is_integer_dtype
  8. import pandas as pd
  9. from pandas import (
  10. Series,
  11. isna,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.arrays import DatetimeArray
  15. import pandas.core.nanops as nanops
  16. use_bn = nanops._USE_BOTTLENECK
  17. has_c16 = hasattr(np, "complex128")
  18. @pytest.fixture(params=[True, False])
  19. def skipna(request):
  20. """
  21. Fixture to pass skipna to nanops functions.
  22. """
  23. return request.param
  24. class TestnanopsDataFrame:
  25. def setup_method(self, method):
  26. np.random.seed(11235)
  27. nanops._USE_BOTTLENECK = False
  28. arr_shape = (11, 7)
  29. self.arr_float = np.random.randn(*arr_shape)
  30. self.arr_float1 = np.random.randn(*arr_shape)
  31. self.arr_complex = self.arr_float + self.arr_float1 * 1j
  32. self.arr_int = np.random.randint(-10, 10, arr_shape)
  33. self.arr_bool = np.random.randint(0, 2, arr_shape) == 0
  34. self.arr_str = np.abs(self.arr_float).astype("S")
  35. self.arr_utf = np.abs(self.arr_float).astype("U")
  36. self.arr_date = np.random.randint(0, 20000, arr_shape).astype("M8[ns]")
  37. self.arr_tdelta = np.random.randint(0, 20000, arr_shape).astype("m8[ns]")
  38. self.arr_nan = np.tile(np.nan, arr_shape)
  39. self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan])
  40. self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan])
  41. self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1])
  42. self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan])
  43. self.arr_inf = self.arr_float * np.inf
  44. self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf])
  45. self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf])
  46. self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf])
  47. self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf])
  48. self.arr_obj = np.vstack(
  49. [
  50. self.arr_float.astype("O"),
  51. self.arr_int.astype("O"),
  52. self.arr_bool.astype("O"),
  53. self.arr_complex.astype("O"),
  54. self.arr_str.astype("O"),
  55. self.arr_utf.astype("O"),
  56. self.arr_date.astype("O"),
  57. self.arr_tdelta.astype("O"),
  58. ]
  59. )
  60. with np.errstate(invalid="ignore"):
  61. self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j
  62. self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj])
  63. self.arr_nan_infj = self.arr_inf * 1j
  64. self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj])
  65. self.arr_float_2d = self.arr_float
  66. self.arr_float1_2d = self.arr_float1
  67. self.arr_nan_2d = self.arr_nan
  68. self.arr_float_nan_2d = self.arr_float_nan
  69. self.arr_float1_nan_2d = self.arr_float1_nan
  70. self.arr_nan_float1_2d = self.arr_nan_float1
  71. self.arr_float_1d = self.arr_float[:, 0]
  72. self.arr_float1_1d = self.arr_float1[:, 0]
  73. self.arr_nan_1d = self.arr_nan[:, 0]
  74. self.arr_float_nan_1d = self.arr_float_nan[:, 0]
  75. self.arr_float1_nan_1d = self.arr_float1_nan[:, 0]
  76. self.arr_nan_float1_1d = self.arr_nan_float1[:, 0]
  77. def teardown_method(self, method):
  78. nanops._USE_BOTTLENECK = use_bn
  79. def check_results(self, targ, res, axis, check_dtype=True):
  80. res = getattr(res, "asm8", res)
  81. if (
  82. axis != 0
  83. and hasattr(targ, "shape")
  84. and targ.ndim
  85. and targ.shape != res.shape
  86. ):
  87. res = np.split(res, [targ.shape[0]], axis=0)[0]
  88. try:
  89. tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
  90. except AssertionError:
  91. # handle timedelta dtypes
  92. if hasattr(targ, "dtype") and targ.dtype == "m8[ns]":
  93. raise
  94. # There are sometimes rounding errors with
  95. # complex and object dtypes.
  96. # If it isn't one of those, re-raise the error.
  97. if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]:
  98. raise
  99. # convert object dtypes to something that can be split into
  100. # real and imaginary parts
  101. if res.dtype.kind == "O":
  102. if targ.dtype.kind != "O":
  103. res = res.astype(targ.dtype)
  104. else:
  105. cast_dtype = "c16" if has_c16 else "f8"
  106. res = res.astype(cast_dtype)
  107. targ = targ.astype(cast_dtype)
  108. # there should never be a case where numpy returns an object
  109. # but nanops doesn't, so make that an exception
  110. elif targ.dtype.kind == "O":
  111. raise
  112. tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype)
  113. tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype)
  114. def check_fun_data(
  115. self,
  116. testfunc,
  117. targfunc,
  118. testarval,
  119. targarval,
  120. skipna,
  121. check_dtype=True,
  122. empty_targfunc=None,
  123. **kwargs,
  124. ):
  125. for axis in list(range(targarval.ndim)) + [None]:
  126. targartempval = targarval if skipna else testarval
  127. if skipna and empty_targfunc and isna(targartempval).all():
  128. targ = empty_targfunc(targartempval, axis=axis, **kwargs)
  129. else:
  130. targ = targfunc(targartempval, axis=axis, **kwargs)
  131. res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs)
  132. self.check_results(targ, res, axis, check_dtype=check_dtype)
  133. if skipna:
  134. res = testfunc(testarval, axis=axis, **kwargs)
  135. self.check_results(targ, res, axis, check_dtype=check_dtype)
  136. if axis is None:
  137. res = testfunc(testarval, skipna=skipna, **kwargs)
  138. self.check_results(targ, res, axis, check_dtype=check_dtype)
  139. if skipna and axis is None:
  140. res = testfunc(testarval, **kwargs)
  141. self.check_results(targ, res, axis, check_dtype=check_dtype)
  142. if testarval.ndim <= 1:
  143. return
  144. # Recurse on lower-dimension
  145. testarval2 = np.take(testarval, 0, axis=-1)
  146. targarval2 = np.take(targarval, 0, axis=-1)
  147. self.check_fun_data(
  148. testfunc,
  149. targfunc,
  150. testarval2,
  151. targarval2,
  152. skipna=skipna,
  153. check_dtype=check_dtype,
  154. empty_targfunc=empty_targfunc,
  155. **kwargs,
  156. )
  157. def check_fun(
  158. self, testfunc, targfunc, testar, skipna, empty_targfunc=None, **kwargs
  159. ):
  160. targar = testar
  161. if testar.endswith("_nan") and hasattr(self, testar[:-4]):
  162. targar = testar[:-4]
  163. testarval = getattr(self, testar)
  164. targarval = getattr(self, targar)
  165. self.check_fun_data(
  166. testfunc,
  167. targfunc,
  168. testarval,
  169. targarval,
  170. skipna=skipna,
  171. empty_targfunc=empty_targfunc,
  172. **kwargs,
  173. )
  174. def check_funs(
  175. self,
  176. testfunc,
  177. targfunc,
  178. skipna,
  179. allow_complex=True,
  180. allow_all_nan=True,
  181. allow_date=True,
  182. allow_tdelta=True,
  183. allow_obj=True,
  184. **kwargs,
  185. ):
  186. self.check_fun(testfunc, targfunc, "arr_float", skipna, **kwargs)
  187. self.check_fun(testfunc, targfunc, "arr_float_nan", skipna, **kwargs)
  188. self.check_fun(testfunc, targfunc, "arr_int", skipna, **kwargs)
  189. self.check_fun(testfunc, targfunc, "arr_bool", skipna, **kwargs)
  190. objs = [
  191. self.arr_float.astype("O"),
  192. self.arr_int.astype("O"),
  193. self.arr_bool.astype("O"),
  194. ]
  195. if allow_all_nan:
  196. self.check_fun(testfunc, targfunc, "arr_nan", skipna, **kwargs)
  197. if allow_complex:
  198. self.check_fun(testfunc, targfunc, "arr_complex", skipna, **kwargs)
  199. self.check_fun(testfunc, targfunc, "arr_complex_nan", skipna, **kwargs)
  200. if allow_all_nan:
  201. self.check_fun(testfunc, targfunc, "arr_nan_nanj", skipna, **kwargs)
  202. objs += [self.arr_complex.astype("O")]
  203. if allow_date:
  204. targfunc(self.arr_date)
  205. self.check_fun(testfunc, targfunc, "arr_date", skipna, **kwargs)
  206. objs += [self.arr_date.astype("O")]
  207. if allow_tdelta:
  208. try:
  209. targfunc(self.arr_tdelta)
  210. except TypeError:
  211. pass
  212. else:
  213. self.check_fun(testfunc, targfunc, "arr_tdelta", skipna, **kwargs)
  214. objs += [self.arr_tdelta.astype("O")]
  215. if allow_obj:
  216. self.arr_obj = np.vstack(objs)
  217. # some nanops handle object dtypes better than their numpy
  218. # counterparts, so the numpy functions need to be given something
  219. # else
  220. if allow_obj == "convert":
  221. targfunc = partial(
  222. self._badobj_wrap, func=targfunc, allow_complex=allow_complex
  223. )
  224. self.check_fun(testfunc, targfunc, "arr_obj", skipna, **kwargs)
  225. def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
  226. if value.dtype.kind == "O":
  227. if allow_complex:
  228. value = value.astype("c16")
  229. else:
  230. value = value.astype("f8")
  231. return func(value, **kwargs)
  232. @pytest.mark.xfail(reason="GH12863: numpy result won't match for object type")
  233. @pytest.mark.parametrize(
  234. "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)]
  235. )
  236. def test_nan_funcs(self, nan_op, np_op, skipna):
  237. self.check_funs(nan_op, np_op, skipna, allow_all_nan=False, allow_date=False)
  238. def test_nansum(self, skipna):
  239. self.check_funs(
  240. nanops.nansum,
  241. np.sum,
  242. skipna,
  243. allow_date=False,
  244. check_dtype=False,
  245. empty_targfunc=np.nansum,
  246. )
  247. def test_nanmean(self, skipna):
  248. self.check_funs(
  249. nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False
  250. )
  251. def test_nanmean_overflow(self):
  252. # GH 10155
  253. # In the previous implementation mean can overflow for int dtypes, it
  254. # is now consistent with numpy
  255. for a in [2 ** 55, -(2 ** 55), 20150515061816532]:
  256. s = Series(a, index=range(500), dtype=np.int64)
  257. result = s.mean()
  258. np_result = s.values.mean()
  259. assert result == a
  260. assert result == np_result
  261. assert result.dtype == np.float64
  262. @pytest.mark.parametrize(
  263. "dtype",
  264. [
  265. np.int16,
  266. np.int32,
  267. np.int64,
  268. np.float32,
  269. np.float64,
  270. getattr(np, "float128", None),
  271. ],
  272. )
  273. def test_returned_dtype(self, dtype):
  274. if dtype is None:
  275. # no float128 available
  276. return
  277. s = Series(range(10), dtype=dtype)
  278. group_a = ["mean", "std", "var", "skew", "kurt"]
  279. group_b = ["min", "max"]
  280. for method in group_a + group_b:
  281. result = getattr(s, method)()
  282. if is_integer_dtype(dtype) and method in group_a:
  283. assert result.dtype == np.float64
  284. else:
  285. assert result.dtype == dtype
  286. def test_nanmedian(self, skipna):
  287. with warnings.catch_warnings(record=True):
  288. warnings.simplefilter("ignore", RuntimeWarning)
  289. self.check_funs(
  290. nanops.nanmedian,
  291. np.median,
  292. skipna,
  293. allow_complex=False,
  294. allow_date=False,
  295. allow_obj="convert",
  296. )
  297. @pytest.mark.parametrize("ddof", range(3))
  298. def test_nanvar(self, ddof, skipna):
  299. self.check_funs(
  300. nanops.nanvar,
  301. np.var,
  302. skipna,
  303. allow_complex=False,
  304. allow_date=False,
  305. allow_obj="convert",
  306. ddof=ddof,
  307. )
  308. @pytest.mark.parametrize("ddof", range(3))
  309. def test_nanstd(self, ddof, skipna):
  310. self.check_funs(
  311. nanops.nanstd,
  312. np.std,
  313. skipna,
  314. allow_complex=False,
  315. allow_date=False,
  316. allow_obj="convert",
  317. ddof=ddof,
  318. )
  319. @td.skip_if_no_scipy
  320. @pytest.mark.parametrize("ddof", range(3))
  321. def test_nansem(self, ddof, skipna):
  322. from scipy.stats import sem
  323. with np.errstate(invalid="ignore"):
  324. self.check_funs(
  325. nanops.nansem,
  326. sem,
  327. skipna,
  328. allow_complex=False,
  329. allow_date=False,
  330. allow_tdelta=False,
  331. allow_obj="convert",
  332. ddof=ddof,
  333. )
  334. @pytest.mark.parametrize(
  335. "nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)]
  336. )
  337. def test_nanops_with_warnings(self, nan_op, np_op, skipna):
  338. with warnings.catch_warnings(record=True):
  339. warnings.simplefilter("ignore", RuntimeWarning)
  340. self.check_funs(nan_op, np_op, skipna, allow_obj=False)
  341. def _argminmax_wrap(self, value, axis=None, func=None):
  342. res = func(value, axis)
  343. nans = np.min(value, axis)
  344. nullnan = isna(nans)
  345. if res.ndim:
  346. res[nullnan] = -1
  347. elif (
  348. hasattr(nullnan, "all")
  349. and nullnan.all()
  350. or not hasattr(nullnan, "all")
  351. and nullnan
  352. ):
  353. res = -1
  354. return res
  355. def test_nanargmax(self, skipna):
  356. with warnings.catch_warnings(record=True):
  357. warnings.simplefilter("ignore", RuntimeWarning)
  358. func = partial(self._argminmax_wrap, func=np.argmax)
  359. self.check_funs(nanops.nanargmax, func, skipna, allow_obj=False)
  360. def test_nanargmin(self, skipna):
  361. with warnings.catch_warnings(record=True):
  362. warnings.simplefilter("ignore", RuntimeWarning)
  363. func = partial(self._argminmax_wrap, func=np.argmin)
  364. self.check_funs(nanops.nanargmin, func, skipna, allow_obj=False)
  365. def _skew_kurt_wrap(self, values, axis=None, func=None):
  366. if not isinstance(values.dtype.type, np.floating):
  367. values = values.astype("f8")
  368. result = func(values, axis=axis, bias=False)
  369. # fix for handling cases where all elements in an axis are the same
  370. if isinstance(result, np.ndarray):
  371. result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0
  372. return result
  373. elif np.max(values) == np.min(values):
  374. return 0.0
  375. return result
  376. @td.skip_if_no_scipy
  377. def test_nanskew(self, skipna):
  378. from scipy.stats import skew
  379. func = partial(self._skew_kurt_wrap, func=skew)
  380. with np.errstate(invalid="ignore"):
  381. self.check_funs(
  382. nanops.nanskew,
  383. func,
  384. skipna,
  385. allow_complex=False,
  386. allow_date=False,
  387. allow_tdelta=False,
  388. )
  389. @td.skip_if_no_scipy
  390. def test_nankurt(self, skipna):
  391. from scipy.stats import kurtosis
  392. func1 = partial(kurtosis, fisher=True)
  393. func = partial(self._skew_kurt_wrap, func=func1)
  394. with np.errstate(invalid="ignore"):
  395. self.check_funs(
  396. nanops.nankurt,
  397. func,
  398. skipna,
  399. allow_complex=False,
  400. allow_date=False,
  401. allow_tdelta=False,
  402. )
  403. def test_nanprod(self, skipna):
  404. self.check_funs(
  405. nanops.nanprod,
  406. np.prod,
  407. skipna,
  408. allow_date=False,
  409. allow_tdelta=False,
  410. empty_targfunc=np.nanprod,
  411. )
  412. def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
  413. res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
  414. res01 = checkfun(
  415. self.arr_float_2d,
  416. self.arr_float1_2d,
  417. min_periods=len(self.arr_float_2d) - 1,
  418. **kwargs,
  419. )
  420. tm.assert_almost_equal(targ0, res00)
  421. tm.assert_almost_equal(targ0, res01)
  422. res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs)
  423. res11 = checkfun(
  424. self.arr_float_nan_2d,
  425. self.arr_float1_nan_2d,
  426. min_periods=len(self.arr_float_2d) - 1,
  427. **kwargs,
  428. )
  429. tm.assert_almost_equal(targ1, res10)
  430. tm.assert_almost_equal(targ1, res11)
  431. targ2 = np.nan
  432. res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs)
  433. res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs)
  434. res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs)
  435. res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs)
  436. res24 = checkfun(
  437. self.arr_float_nan_2d,
  438. self.arr_nan_float1_2d,
  439. min_periods=len(self.arr_float_2d) - 1,
  440. **kwargs,
  441. )
  442. res25 = checkfun(
  443. self.arr_float_2d,
  444. self.arr_float1_2d,
  445. min_periods=len(self.arr_float_2d) + 1,
  446. **kwargs,
  447. )
  448. tm.assert_almost_equal(targ2, res20)
  449. tm.assert_almost_equal(targ2, res21)
  450. tm.assert_almost_equal(targ2, res22)
  451. tm.assert_almost_equal(targ2, res23)
  452. tm.assert_almost_equal(targ2, res24)
  453. tm.assert_almost_equal(targ2, res25)
  454. def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs):
  455. res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs)
  456. res01 = checkfun(
  457. self.arr_float_1d,
  458. self.arr_float1_1d,
  459. min_periods=len(self.arr_float_1d) - 1,
  460. **kwargs,
  461. )
  462. tm.assert_almost_equal(targ0, res00)
  463. tm.assert_almost_equal(targ0, res01)
  464. res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs)
  465. res11 = checkfun(
  466. self.arr_float_nan_1d,
  467. self.arr_float1_nan_1d,
  468. min_periods=len(self.arr_float_1d) - 1,
  469. **kwargs,
  470. )
  471. tm.assert_almost_equal(targ1, res10)
  472. tm.assert_almost_equal(targ1, res11)
  473. targ2 = np.nan
  474. res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs)
  475. res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs)
  476. res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs)
  477. res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs)
  478. res24 = checkfun(
  479. self.arr_float_nan_1d,
  480. self.arr_nan_float1_1d,
  481. min_periods=len(self.arr_float_1d) - 1,
  482. **kwargs,
  483. )
  484. res25 = checkfun(
  485. self.arr_float_1d,
  486. self.arr_float1_1d,
  487. min_periods=len(self.arr_float_1d) + 1,
  488. **kwargs,
  489. )
  490. tm.assert_almost_equal(targ2, res20)
  491. tm.assert_almost_equal(targ2, res21)
  492. tm.assert_almost_equal(targ2, res22)
  493. tm.assert_almost_equal(targ2, res23)
  494. tm.assert_almost_equal(targ2, res24)
  495. tm.assert_almost_equal(targ2, res25)
  496. def test_nancorr(self):
  497. targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
  498. targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
  499. self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1)
  500. targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
  501. targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
  502. self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
  503. def test_nancorr_pearson(self):
  504. targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
  505. targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
  506. self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson")
  507. targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
  508. targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
  509. self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
  510. @td.skip_if_no_scipy
  511. def test_nancorr_kendall(self):
  512. from scipy.stats import kendalltau
  513. targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
  514. targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
  515. self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall")
  516. targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
  517. targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
  518. self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall")
  519. @td.skip_if_no_scipy
  520. def test_nancorr_spearman(self):
  521. from scipy.stats import spearmanr
  522. targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0]
  523. targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
  524. self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman")
  525. targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0]
  526. targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
  527. self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman")
  528. @td.skip_if_no_scipy
  529. def test_invalid_method(self):
  530. targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
  531. targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
  532. msg = "Unknown method 'foo', expected one of 'kendall', 'spearman'"
  533. with pytest.raises(ValueError, match=msg):
  534. self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo")
  535. def test_nancov(self):
  536. targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1]
  537. targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
  538. self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1)
  539. targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1]
  540. targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
  541. self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1)
  542. def check_nancomp(self, checkfun, targ0):
  543. arr_float = self.arr_float
  544. arr_float1 = self.arr_float1
  545. arr_nan = self.arr_nan
  546. arr_nan_nan = self.arr_nan_nan
  547. arr_float_nan = self.arr_float_nan
  548. arr_float1_nan = self.arr_float1_nan
  549. arr_nan_float1 = self.arr_nan_float1
  550. while targ0.ndim:
  551. res0 = checkfun(arr_float, arr_float1)
  552. tm.assert_almost_equal(targ0, res0)
  553. if targ0.ndim > 1:
  554. targ1 = np.vstack([targ0, arr_nan])
  555. else:
  556. targ1 = np.hstack([targ0, arr_nan])
  557. res1 = checkfun(arr_float_nan, arr_float1_nan)
  558. tm.assert_numpy_array_equal(targ1, res1, check_dtype=False)
  559. targ2 = arr_nan_nan
  560. res2 = checkfun(arr_float_nan, arr_nan_float1)
  561. tm.assert_numpy_array_equal(targ2, res2, check_dtype=False)
  562. # Lower dimension for next step in the loop
  563. arr_float = np.take(arr_float, 0, axis=-1)
  564. arr_float1 = np.take(arr_float1, 0, axis=-1)
  565. arr_nan = np.take(arr_nan, 0, axis=-1)
  566. arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1)
  567. arr_float_nan = np.take(arr_float_nan, 0, axis=-1)
  568. arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1)
  569. arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1)
  570. targ0 = np.take(targ0, 0, axis=-1)
  571. @pytest.mark.parametrize(
  572. "op,nanop",
  573. [
  574. (operator.eq, nanops.naneq),
  575. (operator.ne, nanops.nanne),
  576. (operator.gt, nanops.nangt),
  577. (operator.ge, nanops.nange),
  578. (operator.lt, nanops.nanlt),
  579. (operator.le, nanops.nanle),
  580. ],
  581. )
  582. def test_nan_comparison(self, op, nanop):
  583. targ0 = op(self.arr_float, self.arr_float1)
  584. self.check_nancomp(nanop, targ0)
  585. def check_bool(self, func, value, correct):
  586. while getattr(value, "ndim", True):
  587. res0 = func(value)
  588. if correct:
  589. assert res0
  590. else:
  591. assert not res0
  592. if not hasattr(value, "ndim"):
  593. break
  594. # Reduce dimension for next step in the loop
  595. value = np.take(value, 0, axis=-1)
  596. def test__has_infs(self):
  597. pairs = [
  598. ("arr_complex", False),
  599. ("arr_int", False),
  600. ("arr_bool", False),
  601. ("arr_str", False),
  602. ("arr_utf", False),
  603. ("arr_complex", False),
  604. ("arr_complex_nan", False),
  605. ("arr_nan_nanj", False),
  606. ("arr_nan_infj", True),
  607. ("arr_complex_nan_infj", True),
  608. ]
  609. pairs_float = [
  610. ("arr_float", False),
  611. ("arr_nan", False),
  612. ("arr_float_nan", False),
  613. ("arr_nan_nan", False),
  614. ("arr_float_inf", True),
  615. ("arr_inf", True),
  616. ("arr_nan_inf", True),
  617. ("arr_float_nan_inf", True),
  618. ("arr_nan_nan_inf", True),
  619. ]
  620. for arr, correct in pairs:
  621. val = getattr(self, arr)
  622. self.check_bool(nanops._has_infs, val, correct)
  623. for arr, correct in pairs_float:
  624. val = getattr(self, arr)
  625. self.check_bool(nanops._has_infs, val, correct)
  626. self.check_bool(nanops._has_infs, val.astype("f4"), correct)
  627. self.check_bool(nanops._has_infs, val.astype("f2"), correct)
  628. def test__bn_ok_dtype(self):
  629. assert nanops._bn_ok_dtype(self.arr_float.dtype, "test")
  630. assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test")
  631. assert nanops._bn_ok_dtype(self.arr_int.dtype, "test")
  632. assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test")
  633. assert nanops._bn_ok_dtype(self.arr_str.dtype, "test")
  634. assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test")
  635. assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test")
  636. assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test")
  637. assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test")
  638. class TestEnsureNumeric:
  639. def test_numeric_values(self):
  640. # Test integer
  641. assert nanops._ensure_numeric(1) == 1
  642. # Test float
  643. assert nanops._ensure_numeric(1.1) == 1.1
  644. # Test complex
  645. assert nanops._ensure_numeric(1 + 2j) == 1 + 2j
  646. def test_ndarray(self):
  647. # Test numeric ndarray
  648. values = np.array([1, 2, 3])
  649. assert np.allclose(nanops._ensure_numeric(values), values)
  650. # Test object ndarray
  651. o_values = values.astype(object)
  652. assert np.allclose(nanops._ensure_numeric(o_values), values)
  653. # Test convertible string ndarray
  654. s_values = np.array(["1", "2", "3"], dtype=object)
  655. assert np.allclose(nanops._ensure_numeric(s_values), values)
  656. # Test non-convertible string ndarray
  657. s_values = np.array(["foo", "bar", "baz"], dtype=object)
  658. msg = r"Could not convert .* to numeric"
  659. with pytest.raises(TypeError, match=msg):
  660. nanops._ensure_numeric(s_values)
  661. def test_convertable_values(self):
  662. assert np.allclose(nanops._ensure_numeric("1"), 1.0)
  663. assert np.allclose(nanops._ensure_numeric("1.1"), 1.1)
  664. assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j)
  665. def test_non_convertable_values(self):
  666. msg = "Could not convert foo to numeric"
  667. with pytest.raises(TypeError, match=msg):
  668. nanops._ensure_numeric("foo")
  669. # with the wrong type, python raises TypeError for us
  670. msg = "argument must be a string or a number"
  671. with pytest.raises(TypeError, match=msg):
  672. nanops._ensure_numeric({})
  673. with pytest.raises(TypeError, match=msg):
  674. nanops._ensure_numeric([])
  675. class TestNanvarFixedValues:
  676. # xref GH10242
  677. def setup_method(self, method):
  678. # Samples from a normal distribution.
  679. self.variance = variance = 3.0
  680. self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
  681. def test_nanvar_all_finite(self):
  682. samples = self.samples
  683. actual_variance = nanops.nanvar(samples)
  684. tm.assert_almost_equal(actual_variance, self.variance, rtol=1e-2)
  685. def test_nanvar_nans(self):
  686. samples = np.nan * np.ones(2 * self.samples.shape[0])
  687. samples[::2] = self.samples
  688. actual_variance = nanops.nanvar(samples, skipna=True)
  689. tm.assert_almost_equal(actual_variance, self.variance, rtol=1e-2)
  690. actual_variance = nanops.nanvar(samples, skipna=False)
  691. tm.assert_almost_equal(actual_variance, np.nan, rtol=1e-2)
  692. def test_nanstd_nans(self):
  693. samples = np.nan * np.ones(2 * self.samples.shape[0])
  694. samples[::2] = self.samples
  695. actual_std = nanops.nanstd(samples, skipna=True)
  696. tm.assert_almost_equal(actual_std, self.variance ** 0.5, rtol=1e-2)
  697. actual_std = nanops.nanvar(samples, skipna=False)
  698. tm.assert_almost_equal(actual_std, np.nan, rtol=1e-2)
  699. def test_nanvar_axis(self):
  700. # Generate some sample data.
  701. samples_norm = self.samples
  702. samples_unif = self.prng.uniform(size=samples_norm.shape[0])
  703. samples = np.vstack([samples_norm, samples_unif])
  704. actual_variance = nanops.nanvar(samples, axis=1)
  705. tm.assert_almost_equal(
  706. actual_variance, np.array([self.variance, 1.0 / 12]), rtol=1e-2
  707. )
  708. def test_nanvar_ddof(self):
  709. n = 5
  710. samples = self.prng.uniform(size=(10000, n + 1))
  711. samples[:, -1] = np.nan # Force use of our own algorithm.
  712. variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
  713. variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
  714. variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
  715. # The unbiased estimate.
  716. var = 1.0 / 12
  717. tm.assert_almost_equal(variance_1, var, rtol=1e-2)
  718. # The underestimated variance.
  719. tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, rtol=1e-2)
  720. # The overestimated variance.
  721. tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, rtol=1e-2)
  722. def test_ground_truth(self):
  723. # Test against values that were precomputed with Numpy.
  724. samples = np.empty((4, 4))
  725. samples[:3, :3] = np.array(
  726. [
  727. [0.97303362, 0.21869576, 0.55560287],
  728. [0.72980153, 0.03109364, 0.99155171],
  729. [0.09317602, 0.60078248, 0.15871292],
  730. ]
  731. )
  732. samples[3] = samples[:, 3] = np.nan
  733. # Actual variances along axis=0, 1 for ddof=0, 1, 2
  734. variance = np.array(
  735. [
  736. [
  737. [0.13762259, 0.05619224, 0.11568816],
  738. [0.20643388, 0.08428837, 0.17353224],
  739. [0.41286776, 0.16857673, 0.34706449],
  740. ],
  741. [
  742. [0.09519783, 0.16435395, 0.05082054],
  743. [0.14279674, 0.24653093, 0.07623082],
  744. [0.28559348, 0.49306186, 0.15246163],
  745. ],
  746. ]
  747. )
  748. # Test nanvar.
  749. for axis in range(2):
  750. for ddof in range(3):
  751. var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
  752. tm.assert_almost_equal(var[:3], variance[axis, ddof])
  753. assert np.isnan(var[3])
  754. # Test nanstd.
  755. for axis in range(2):
  756. for ddof in range(3):
  757. std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
  758. tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5)
  759. assert np.isnan(std[3])
  760. def test_nanstd_roundoff(self):
  761. # Regression test for GH 10242 (test data taken from GH 10489). Ensure
  762. # that variance is stable.
  763. data = Series(766897346 * np.ones(10))
  764. for ddof in range(3):
  765. result = data.std(ddof=ddof)
  766. assert result == 0.0
  @property
  def prng(self):
      """Return a new ``RandomState`` seeded with 1234 on every access."""
      seed = 1234
      return np.random.RandomState(seed)
  class TestNanskewFixedValues:
      """Checks of ``nanops.nanskew`` against fixed reference values."""

      # xref GH 11974

      def setup_method(self, method):
          """Store the sample data and its reference skewness."""
          # Test data + skewness value (computed with scipy.stats.skew)
          grid = np.linspace(0, 1, 200)
          self.samples = np.sin(grid)
          self.actual_skew = -0.1875895205961754

      def test_constant_series(self):
          """Constant data must have a skewness of exactly zero."""
          # xref GH 11974
          for constant in (3075.2, 3075.3, 3075.5):
              flat = constant * np.ones(300)
              assert nanops.nanskew(flat) == 0.0

      def test_all_finite(self):
          """Check the sign of the skewness for two beta-distributed samples."""
          left_tailed = self.prng.beta(0.3, 0.1, size=100)
          assert nanops.nanskew(left_tailed) < 0

          right_tailed = self.prng.beta(0.1, 0.3, size=100)
          assert nanops.nanskew(right_tailed) > 0

      def test_ground_truth(self):
          """The skewness of the sample data matches the reference value."""
          observed = nanops.nanskew(self.samples)
          tm.assert_almost_equal(observed, self.actual_skew)

      def test_axis(self):
          """Row-wise skewness: reference value for data, NaN for all-NaN."""
          nan_row = np.full(len(self.samples), np.nan)
          stacked = np.vstack([self.samples, nan_row])
          observed = nanops.nanskew(stacked, axis=1)
          expected = np.array([self.actual_skew, np.nan])
          tm.assert_almost_equal(observed, expected)

      def test_nans(self):
          """With ``skipna=False`` a single NaN makes the result NaN."""
          with_nan = np.append(self.samples, np.nan)
          assert np.isnan(nanops.nanskew(with_nan, skipna=False))

      def test_nans_skipna(self):
          """With ``skipna=True`` an appended NaN does not change the result."""
          with_nan = np.append(self.samples, np.nan)
          observed = nanops.nanskew(with_nan, skipna=True)
          tm.assert_almost_equal(observed, self.actual_skew)

      @property
      def prng(self):
          """Return a new ``RandomState`` seeded with 1234 on every access."""
          seed = 1234
          return np.random.RandomState(seed)
  class TestNankurtFixedValues:
      """Checks of ``nanops.nankurt`` against fixed reference values."""

      # xref GH 11974

      def setup_method(self, method):
          """Store the sample data and its reference kurtosis."""
          # Test data + kurtosis value (computed with scipy.stats.kurtosis)
          grid = np.linspace(0, 1, 200)
          self.samples = np.sin(grid)
          self.actual_kurt = -1.2058303433799713

      def test_constant_series(self):
          """Constant data must have a kurtosis of exactly zero."""
          # xref GH 11974
          for constant in (3075.2, 3075.3, 3075.5):
              flat = constant * np.ones(300)
              assert nanops.nankurt(flat) == 0.0

      def test_all_finite(self):
          """Check the sign of the kurtosis for two beta-distributed samples."""
          left_tailed = self.prng.beta(0.3, 0.1, size=100)
          assert nanops.nankurt(left_tailed) < 0

          right_tailed = self.prng.beta(0.1, 0.3, size=100)
          assert nanops.nankurt(right_tailed) > 0

      def test_ground_truth(self):
          """The kurtosis of the sample data matches the reference value."""
          observed = nanops.nankurt(self.samples)
          tm.assert_almost_equal(observed, self.actual_kurt)

      def test_axis(self):
          """Row-wise kurtosis: reference value for data, NaN for all-NaN."""
          nan_row = np.full(len(self.samples), np.nan)
          stacked = np.vstack([self.samples, nan_row])
          observed = nanops.nankurt(stacked, axis=1)
          expected = np.array([self.actual_kurt, np.nan])
          tm.assert_almost_equal(observed, expected)

      def test_nans(self):
          """With ``skipna=False`` a single NaN makes the result NaN."""
          with_nan = np.append(self.samples, np.nan)
          assert np.isnan(nanops.nankurt(with_nan, skipna=False))

      def test_nans_skipna(self):
          """With ``skipna=True`` an appended NaN does not change the result."""
          with_nan = np.append(self.samples, np.nan)
          observed = nanops.nankurt(with_nan, skipna=True)
          tm.assert_almost_equal(observed, self.actual_kurt)

      @property
      def prng(self):
          """Return a new ``RandomState`` seeded with 1234 on every access."""
          seed = 1234
          return np.random.RandomState(seed)
  class TestDatetime64NaNOps:
      """Checks of ``nanops.nanmean`` on datetime64/timedelta64 data."""

      # Enabling mean changes the behavior of DataFrame.mean
      # See https://github.com/pandas-dev/pandas/issues/24752
      def test_nanmean(self):
          """The mean of three consecutive dates is the middle one.

          Checked for an index, a ``DatetimeArray`` and a ``Series``, both
          without and with an inserted ``NaT``.
          """
          dti = pd.date_range("2016-01-01", periods=3)
          middle = dti[1]
          with_nat = dti.insert(1, pd.NaT)

          for index in (dti, with_nat):
              for container in (index, DatetimeArray(index), Series(index)):
                  assert nanops.nanmean(container) == middle

      @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
      def test_nanmean_skipna_false(self, dtype):
          """With ``skipna=False`` a NaT propagates into every reduction over it.

          A 4x3 array of consecutive values has its last element set to NaT;
          the overall, per-column and per-row means are then checked.
          """
          values = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3)
          values[-1, -1] = "NaT"

          overall = nanops.nanmean(values, skipna=False)
          assert overall is pd.NaT

          per_column = nanops.nanmean(values, axis=0, skipna=False)
          expected_columns = np.array([4, 5, "NaT"], dtype=values.dtype)
          tm.assert_numpy_array_equal(per_column, expected_columns)

          per_row = nanops.nanmean(values, axis=1, skipna=False)
          # Middle element of each complete row, then the NaT itself.
          expected_rows = np.array(
              [values[row, 1] for row in range(3)] + [values[-1, -1]]
          )
          tm.assert_numpy_array_equal(per_row, expected_rows)
  def test_use_bottleneck():
      """Check that the ``use_bottleneck`` option can be switched on and off.

      Nothing is checked unless ``nanops._BOTTLENECK_INSTALLED`` is truthy.
      The option is always reset to ``use_bn`` (a name defined elsewhere in
      this module) afterwards.
      """
      if not nanops._BOTTLENECK_INSTALLED:
          return

      try:
          pd.set_option("use_bottleneck", True)
          assert pd.get_option("use_bottleneck")

          pd.set_option("use_bottleneck", False)
          assert not pd.get_option("use_bottleneck")
      finally:
          # Reset even when an assertion above fails, so a failure here
          # does not leak a toggled global option into later tests.
          pd.set_option("use_bottleneck", use_bn)
  @pytest.mark.parametrize(
      "numpy_op, expected",
      [
          (np.sum, 10),
          (np.nansum, 10),
          (np.mean, 2.5),
          (np.nanmean, 2.5),
          (np.median, 2.5),
          (np.nanmedian, 2.5),
          (np.min, 1),
          (np.max, 4),
          (np.nanmin, 1),
          (np.nanmax, 4),
      ],
  )
  def test_numpy_ops(numpy_op, expected):
      """Apply a NumPy reduction directly to ``Series([1, 2, 3, 4])``."""
      # GH8383
      data = Series([1, 2, 3, 4])
      assert numpy_op(data) == expected
  @pytest.mark.parametrize(
      "operation",
      [
          nanops.nanany,
          nanops.nanall,
          nanops.nansum,
          nanops.nanmean,
          nanops.nanmedian,
          nanops.nanstd,
          nanops.nanvar,
          nanops.nansem,
          nanops.nanargmax,
          nanops.nanargmin,
          nanops.nanmax,
          nanops.nanmin,
          nanops.nanskew,
          nanops.nankurt,
          nanops.nanprod,
      ],
  )
  def test_nanops_independent_of_mask_param(operation):
      """Passing an explicit ``mask`` must not change a nanops result.

      Each operation is run on the same Series once without a mask and once
      with ``mask=s.isna()``; the two results must compare equal.
      """
      # GH22764
      data = Series([1, 2, np.nan, 3, np.nan, 4])
      na_mask = data.isna()

      without_mask = operation(data)
      with_mask = operation(data, mask=na_mask)
      assert without_mask == with_mask