import cython
from cython import Py_ssize_t

from libc.math cimport (
    fabs,
    sqrt,
)
from libc.stdlib cimport (
    free,
    malloc,
)
from libc.string cimport memmove

import numpy as np

cimport numpy as cnp
from numpy cimport (
    NPY_FLOAT32,
    NPY_FLOAT64,
    NPY_INT8,
    NPY_INT16,
    NPY_INT32,
    NPY_INT64,
    NPY_OBJECT,
    NPY_UINT8,
    NPY_UINT16,
    NPY_UINT32,
    NPY_UINT64,
    float32_t,
    float64_t,
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)

cnp.import_array()

cimport pandas._libs.util as util
from pandas._libs.khash cimport (
    kh_destroy_int64,
    kh_get_int64,
    kh_init_int64,
    kh_int64_t,
    kh_put_int64,
    kh_resize_int64,
    khiter_t,
)
from pandas._libs.util cimport (
    get_nat,
    numeric,
)

import pandas._libs.missing as missing

cdef:
    float64_t FP_ERR = 1e-13

    float64_t NaN = <float64_t>np.NaN

    int64_t NPY_NAT = get_nat()


cdef enum TiebreakEnumType:
    TIEBREAK_AVERAGE
    TIEBREAK_MIN
    TIEBREAK_MAX
    TIEBREAK_FIRST
    TIEBREAK_FIRST_DESCENDING
    TIEBREAK_DENSE


tiebreakers = {
    "average": TIEBREAK_AVERAGE,
    "min": TIEBREAK_MIN,
    "max": TIEBREAK_MAX,
    "first": TIEBREAK_FIRST,
    "dense": TIEBREAK_DENSE,
}

cdef inline bint are_diff(object left, object right):
    try:
        return fabs(left - right) > FP_ERR
    except TypeError:
        return left != right


class Infinity:
    """
    Provide a positive Infinity comparison method for ranking.
    """
    __lt__ = lambda self, other: False
    __le__ = lambda self, other: isinstance(other, Infinity)
    __eq__ = lambda self, other: isinstance(other, Infinity)
    __ne__ = lambda self, other: not isinstance(other, Infinity)
    __gt__ = lambda self, other: (not isinstance(other, Infinity) and
                                  not missing.checknull(other))
    __ge__ = lambda self, other: not missing.checknull(other)


class NegInfinity:
    """
    Provide a negative Infinity comparison method for ranking.
    """
    __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
                                  not missing.checknull(other))
    __le__ = lambda self, other: not missing.checknull(other)
    __eq__ = lambda self, other: isinstance(other, NegInfinity)
    __ne__ = lambda self, other: not isinstance(other, NegInfinity)
    __gt__ = lambda self, other: False
    __ge__ = lambda self, other: isinstance(other, NegInfinity)

@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
    """
    Efficiently find the unique first-differences of the given array.

    Parameters
    ----------
    arr : ndarray[int64_t]

    Returns
    -------
    ndarray[int64_t]
        An ordered ndarray[int64_t]
    """
    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []
        ndarray[int64_t, ndim=1] result

    table = kh_init_int64()
    kh_resize_int64(table, 10)
    for i in range(n - 1):
        val = arr[i + 1] - arr[i]
        k = kh_get_int64(table, val)
        if k == table.n_buckets:
            kh_put_int64(table, val, &ret)
            uniques.append(val)
    kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result

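
# Usage sketch (illustrative; assumes the compiled module is importable as
# `pandas._libs.algos`). Deltas of [1, 3, 5, 10] are [2, 2, 5], so the
# unique, sorted result is [2, 5]:
#
#     >>> import numpy as np
#     >>> from pandas._libs import algos
#     >>> algos.unique_deltas(np.array([1, 3, 5, 10], dtype=np.int64))
#     array([2, 5])
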
@cython.wraparound(False)
@cython.boundscheck(False)
def is_lexsorted(list_of_arrays: list) -> bint:
    cdef:
        Py_ssize_t i
        Py_ssize_t n, nlevels
        int64_t k, cur, pre
        ndarray arr
        bint result = True

    nlevels = len(list_of_arrays)
    n = len(list_of_arrays[0])

    cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
    for i in range(nlevels):
        arr = list_of_arrays[i]
        assert arr.dtype.name == 'int64'
        vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)

    # Assume uniqueness??
    with nogil:
        for i in range(1, n):
            for k in range(nlevels):
                cur = vecs[k][i]
                pre = vecs[k][i - 1]
                if cur == pre:
                    continue
                elif cur > pre:
                    break
                else:
                    result = False
                    break
            if not result:
                # already proven not lexsorted; no need to scan further
                break
    free(vecs)
    return result

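
# Usage sketch (illustrative; np/algos imported as above). The levels below
# are lexsorted: ties in the first level are broken in non-decreasing order
# by the second level:
#
#     >>> algos.is_lexsorted([np.array([1, 1, 2], dtype=np.int64),
#     ...                     np.array([0, 1, 0], dtype=np.int64)])
#     True
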
@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
    """
    Compute a 1-d indexer.

    The indexer is an ordering of the passed index,
    ordered by the groups.

    Parameters
    ----------
    index: np.ndarray[np.intp]
        Mappings from group -> position.
    ngroups: int64
        Number of groups.

    Returns
    -------
    ndarray[intp_t, ndim=1]
        Indexer
    ndarray[intp_t, ndim=1]
        Group Counts

    Notes
    -----
    This is a reverse of the label factorization process.
    """
    cdef:
        Py_ssize_t i, loc, label, n
        ndarray[intp_t] indexer, where, counts

    counts = np.zeros(ngroups + 1, dtype=np.intp)
    n = len(index)
    indexer = np.zeros(n, dtype=np.intp)
    where = np.zeros(ngroups + 1, dtype=np.intp)

    with nogil:
        # count group sizes, location 0 for NA
        for i in range(n):
            counts[index[i] + 1] += 1

        # mark the start of each contiguous group of like-indexed data
        for i in range(1, ngroups + 1):
            where[i] = where[i - 1] + counts[i - 1]

        # this is our indexer
        for i in range(n):
            label = index[i] + 1
            indexer[where[label]] = i
            where[label] += 1

    return indexer, counts

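
# Usage sketch (illustrative): for labels [1, 0, 1, -1] (-1 marks NA) and
# ngroups=2, positions come back grouped NA-first, then group 0, then
# group 1, with counts[0] holding the NA count:
#
#     >>> algos.groupsort_indexer(np.array([1, 0, 1, -1], dtype=np.intp), 2)
#     (array([3, 1, 0, 2]), array([1, 1, 2]))
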
cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
    cdef:
        numeric t

    # cython doesn't allow pointer dereference so use array syntax
    t = a[0]
    a[0] = b[0]
    b[0] = t
    return 0


cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil:
    """
    See kth_smallest.__doc__. The additional parameter n specifies the maximum
    number of elements considered in arr, needed for compatibility with usage
    in groupby.pyx
    """
    cdef:
        Py_ssize_t i, j, l, m
        numeric x

    l = 0
    m = n - 1

    while l < m:
        x = arr[k]
        i = l
        j = m

        while 1:
            while arr[i] < x: i += 1
            while x < arr[j]: j -= 1
            if i <= j:
                swap(&arr[i], &arr[j])
                i += 1; j -= 1

            if i > j: break

        if j < k: l = i
        if k < i: m = j
    return arr[k]

@cython.boundscheck(False)
@cython.wraparound(False)
def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric:
    """
    Compute the kth smallest value in arr. Note that the input
    array will be modified.

    Parameters
    ----------
    arr : numeric[::1]
        Array to compute the kth smallest value for, must be
        contiguous
    k : Py_ssize_t

    Returns
    -------
    numeric
        The kth smallest value in arr
    """
    cdef:
        numeric result

    with nogil:
        result = kth_smallest_c(&arr[0], k, arr.shape[0])

    return result

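
# Usage sketch (illustrative; note the input buffer is reordered in place).
# k is zero-based, so k=1 selects the second smallest value:
#
#     >>> algos.kth_smallest(np.array([3.0, 1.0, 2.0]), 1)
#     2.0
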
# ----------------------------------------------------------------------
# Pairwise correlation/covariance


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
    cdef:
        Py_ssize_t i, j, xi, yi, N, K
        int64_t minpv
        ndarray[float64_t, ndim=2] result
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx
        float64_t ssqdmy, covxy

    N, K = (<object>mat).shape

    if minp is None:
        minpv = 1
    else:
        minpv = <int64_t>minp

    result = np.empty((K, K), dtype=np.float64)
    mask = np.isfinite(mat).view(np.uint8)

    with nogil:
        for xi in range(K):
            for yi in range(xi + 1):
                # Welford's method for the variance-calculation
                # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
                nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
                for i in range(N):
                    if mask[i, xi] and mask[i, yi]:
                        vx = mat[i, xi]
                        vy = mat[i, yi]
                        nobs += 1
                        prev_meanx = meanx
                        prev_meany = meany
                        meanx = meanx + 1 / nobs * (vx - meanx)
                        meany = meany + 1 / nobs * (vy - meany)
                        ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
                        ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
                        covxy = covxy + (vx - meanx) * (vy - prev_meany)

                if nobs < minpv:
                    result[xi, yi] = result[yi, xi] = NaN
                else:
                    divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)

                    if divisor != 0:
                        result[xi, yi] = result[yi, xi] = covxy / divisor
                    else:
                        result[xi, yi] = result[yi, xi] = NaN

    return result

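
# Cross-check sketch (illustrative): on fully-finite input, nancorr should
# agree with np.corrcoef applied column-wise:
#
#     >>> rng = np.random.default_rng(0)
#     >>> mat = rng.normal(size=(100, 3))
#     >>> np.allclose(algos.nancorr(mat), np.corrcoef(mat.T))
#     True
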
# ----------------------------------------------------------------------
# Pairwise Spearman correlation


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
    cdef:
        Py_ssize_t i, j, xi, yi, N, K
        ndarray[float64_t, ndim=2] result
        ndarray[float64_t, ndim=2] ranked_mat
        ndarray[float64_t, ndim=1] rankedx, rankedy
        float64_t[::1] maskedx, maskedy
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        bint no_nans
        float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
        const intp_t[:] labels_n, labels_nobs

    N, K = (<object>mat).shape

    # For compatibility when calling rank_1d
    labels_n = np.zeros(N, dtype=np.intp)

    # Handle the edge case where we know all results will be nan
    # to keep conditional logic inside loop simpler
    if N < minp:
        result = np.full((K, K), np.nan, dtype=np.float64)
        return result

    result = np.empty((K, K), dtype=np.float64)
    mask = np.isfinite(mat).view(np.uint8)
    no_nans = mask.all()

    ranked_mat = np.empty((N, K), dtype=np.float64)

    # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe
    # here since N >= nobs and values are stored contiguously
    maskedx = np.empty(N, dtype=np.float64)
    maskedy = np.empty(N, dtype=np.float64)
    for i in range(K):
        ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)

    with nogil:
        for xi in range(K):
            for yi in range(xi + 1):
                sumx = sumxx = sumyy = 0

                # Fastpath for data with no nans/infs, allows avoiding mask checks
                # and array reassignments
                if no_nans:
                    mean = (N + 1) / 2.

                    # now the cov numerator
                    for i in range(N):
                        vx = ranked_mat[i, xi] - mean
                        vy = ranked_mat[i, yi] - mean

                        sumx += vx * vy
                        sumxx += vx * vx
                        sumyy += vy * vy
                else:
                    nobs = 0
                    # Keep track of whether we need to recompute ranks
                    all_ranks = True
                    for i in range(N):
                        all_ranks &= not (mask[i, xi] ^ mask[i, yi])
                        if mask[i, xi] and mask[i, yi]:
                            maskedx[nobs] = ranked_mat[i, xi]
                            maskedy[nobs] = ranked_mat[i, yi]
                            nobs += 1

                    if nobs < minp:
                        result[xi, yi] = result[yi, xi] = NaN
                        continue
                    else:
                        if not all_ranks:
                            with gil:
                                # We need to slice back to nobs because rank_1d will
                                # require arrays of nobs length
                                labels_nobs = np.zeros(nobs, dtype=np.intp)
                                rankedx = rank_1d(np.array(maskedx)[:nobs],
                                                  labels=labels_nobs)
                                rankedy = rank_1d(np.array(maskedy)[:nobs],
                                                  labels=labels_nobs)
                                for i in range(nobs):
                                    maskedx[i] = rankedx[i]
                                    maskedy[i] = rankedy[i]

                        mean = (nobs + 1) / 2.

                        # now the cov numerator
                        for i in range(nobs):
                            vx = maskedx[i] - mean
                            vy = maskedy[i] - mean

                            sumx += vx * vy
                            sumxx += vx * vx
                            sumyy += vy * vy

                divisor = sqrt(sumxx * sumyy)

                if divisor != 0:
                    result[xi, yi] = result[yi, xi] = sumx / divisor
                else:
                    result[xi, yi] = result[yi, xi] = NaN

    return result

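
# Cross-check sketch (illustrative): Spearman correlation is invariant under
# monotonic transforms, so a column and its exponential correlate at 1:
#
#     >>> x = np.arange(10, dtype=np.float64)
#     >>> algos.nancorr_spearman(np.column_stack([x, np.exp(x)]))
#     array([[1., 1.],
#            [1., 1.]])
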
# ----------------------------------------------------------------------
# Kendall correlation
# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
    """
    Perform kendall correlation on a 2d array

    Parameters
    ----------
    mat : np.ndarray[float64_t, ndim=2]
        Array to compute kendall correlation on
    minp : int, default 1
        Minimum number of observations required per pair of columns
        to have a valid result.

    Returns
    -------
    numpy.ndarray[float64_t, ndim=2]
        Correlation matrix
    """
    cdef:
        Py_ssize_t i, j, k, xi, yi, N, K
        ndarray[float64_t, ndim=2] result
        ndarray[float64_t, ndim=2] ranked_mat
        ndarray[uint8_t, ndim=2] mask
        float64_t currj
        ndarray[uint8_t, ndim=1] valid
        ndarray[int64_t] sorted_idxs
        ndarray[float64_t, ndim=1] col
        int64_t n_concordant
        int64_t total_concordant = 0
        int64_t total_discordant = 0
        float64_t kendall_tau
        int64_t n_obs
        const intp_t[:] labels_n

    N, K = (<object>mat).shape

    result = np.empty((K, K), dtype=np.float64)
    mask = np.isfinite(mat).view(np.uint8)

    ranked_mat = np.empty((N, K), dtype=np.float64)
    # For compatibility when calling rank_1d
    labels_n = np.zeros(N, dtype=np.intp)

    for i in range(K):
        ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)

    for xi in range(K):
        sorted_idxs = ranked_mat[:, xi].argsort()
        ranked_mat = ranked_mat[sorted_idxs]
        mask = mask[sorted_idxs]
        for yi in range(xi + 1, K):
            valid = mask[:, xi] & mask[:, yi]
            if valid.sum() < minp:
                result[xi, yi] = NaN
                result[yi, xi] = NaN
            else:
                # Get columns and order second column using 1st column ranks
                if not valid.all():
                    col = ranked_mat[valid.nonzero()][:, yi]
                else:
                    col = ranked_mat[:, yi]
                n_obs = col.shape[0]
                total_concordant = 0
                total_discordant = 0
                for j in range(n_obs - 1):
                    currj = col[j]
                    # Count num concordant and discordant pairs
                    n_concordant = 0
                    for k in range(j, n_obs):
                        if col[k] > currj:
                            n_concordant += 1
                    total_concordant += n_concordant
                    total_discordant += (n_obs - 1 - j - n_concordant)
                # Note: we do total_concordant+total_discordant here which is
                # equivalent to the C(n, 2), the total # of pairs,
                # listed on wikipedia
                kendall_tau = (total_concordant - total_discordant) / (
                    total_concordant + total_discordant
                )
                result[xi, yi] = kendall_tau
                result[yi, xi] = kendall_tau

        if mask[:, xi].sum() > minp:
            result[xi, xi] = 1
        else:
            result[xi, xi] = NaN

    return result

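
# Sketch of the statistic (illustrative): with C concordant and D discordant
# pairs and no ties, tau = (C - D) / (C + D), where C + D = n * (n - 1) / 2.
# Perfectly reversed columns give tau = -1:
#
#     >>> x = np.arange(10, dtype=np.float64)
#     >>> algos.nancorr_kendall(np.column_stack([x, -x]))[0, 1]
#     -1.0
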
# ----------------------------------------------------------------------

ctypedef fused algos_t:
    float64_t
    float32_t
    object
    int64_t
    int32_t
    int16_t
    int8_t
    uint64_t
    uint32_t
    uint16_t
    uint8_t


def validate_limit(nobs: int | None, limit=None) -> int:
    """
    Check that the `limit` argument is a positive integer.

    Parameters
    ----------
    nobs : int
    limit : object

    Returns
    -------
    int
        The limit.
    """
    if limit is None:
        lim = nobs
    else:
        if not util.is_integer_object(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')
        lim = limit

    return lim

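
# Usage sketch (illustrative): with no limit, the whole array may be filled;
# an explicit limit is validated and passed through:
#
#     >>> algos.validate_limit(nobs=10)
#     10
#     >>> algos.validate_limit(nobs=10, limit=3)
#     3
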
@cython.boundscheck(False)
@cython.wraparound(False)
def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray:
    # -> ndarray[intp_t, ndim=1]
    cdef:
        Py_ssize_t i, j, nleft, nright
        ndarray[intp_t, ndim=1] indexer
        algos_t cur, next_val
        int lim, fill_count = 0

    nleft = len(old)
    nright = len(new)
    indexer = np.empty(nright, dtype=np.intp)
    indexer[:] = -1

    lim = validate_limit(nright, limit)

    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
        return indexer

    i = j = 0

    cur = old[0]

    while j <= nright - 1 and new[j] < cur:
        j += 1

    while True:
        if j == nright:
            break

        if i == nleft - 1:
            while j < nright:
                if new[j] == cur:
                    indexer[j] = i
                elif new[j] > cur and fill_count < lim:
                    indexer[j] = i
                    fill_count += 1
                j += 1
            break

        next_val = old[i + 1]

        while j < nright and cur <= new[j] < next_val:
            if new[j] == cur:
                indexer[j] = i
            elif fill_count < lim:
                indexer[j] = i
                fill_count += 1
            j += 1

        fill_count = 0
        i += 1
        cur = next_val

    return indexer

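
# Usage sketch (illustrative): forward-fill positions of `new` from `old`;
# -1 marks "no earlier value available":
#
#     >>> old = np.array([1, 5, 10], dtype=np.int64)
#     >>> new = np.array([0, 1, 2, 5, 6, 11], dtype=np.int64)
#     >>> algos.pad(old, new)
#     array([-1,  0,  0,  1,  1,  2])
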
@cython.boundscheck(False)
@cython.wraparound(False)
def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
    cdef:
        Py_ssize_t i, N
        algos_t val
        uint8_t prev_mask
        int lim, fill_count = 0

    N = len(values)

    # GH#2778
    if N == 0:
        return

    lim = validate_limit(N, limit)

    val = values[0]
    prev_mask = mask[0]
    for i in range(N):
        if mask[i]:
            if fill_count >= lim:
                continue
            fill_count += 1
            values[i] = val
            mask[i] = prev_mask
        else:
            fill_count = 0
            val = values[i]
            prev_mask = mask[i]

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None):
    cdef:
        Py_ssize_t i, j, N, K
        algos_t val
        int lim, fill_count = 0

    K, N = (<object>values).shape

    # GH#2778
    if N == 0:
        return

    lim = validate_limit(N, limit)

    for j in range(K):
        fill_count = 0
        val = values[j, 0]
        for i in range(N):
            if mask[j, i]:
                if fill_count >= lim:
                    continue
                fill_count += 1
                values[j, i] = val
            else:
                fill_count = 0
                val = values[j, i]

  608. """
  609. Backfilling logic for generating fill vector
  610. Diagram of what's going on
  611. Old New Fill vector Mask
  612. . 0 1
  613. . 0 1
  614. . 0 1
  615. A A 0 1
  616. . 1 1
  617. . 1 1
  618. . 1 1
  619. . 1 1
  620. . 1 1
  621. B B 1 1
  622. . 2 1
  623. . 2 1
  624. . 2 1
  625. C C 2 1
  626. . 0
  627. . 0
  628. D
  629. """
@cython.boundscheck(False)
@cython.wraparound(False)
def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray:
    # -> ndarray[intp_t, ndim=1]
    cdef:
        Py_ssize_t i, j, nleft, nright
        ndarray[intp_t, ndim=1] indexer
        algos_t cur, prev
        int lim, fill_count = 0

    nleft = len(old)
    nright = len(new)
    indexer = np.empty(nright, dtype=np.intp)
    indexer[:] = -1

    lim = validate_limit(nright, limit)

    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
        return indexer

    i = nleft - 1
    j = nright - 1

    cur = old[nleft - 1]

    while j >= 0 and new[j] > cur:
        j -= 1

    while True:
        if j < 0:
            break

        if i == 0:
            while j >= 0:
                if new[j] == cur:
                    indexer[j] = i
                elif new[j] < cur and fill_count < lim:
                    indexer[j] = i
                    fill_count += 1
                j -= 1
            break

        prev = old[i - 1]

        while j >= 0 and prev < new[j] <= cur:
            if new[j] == cur:
                indexer[j] = i
            elif new[j] < cur and fill_count < lim:
                indexer[j] = i
                fill_count += 1
            j -= 1

        fill_count = 0
        i -= 1
        cur = prev

    return indexer

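
# Usage sketch (illustrative): the mirror image of `pad` above; each entry
# of `new` maps to the position of the next value in `old` at or above it,
# with -1 marking "no later value available":
#
#     >>> old = np.array([1, 5, 10], dtype=np.int64)
#     >>> new = np.array([0, 1, 2, 5, 6, 11], dtype=np.int64)
#     >>> algos.backfill(old, new)
#     array([ 0,  0,  1,  1,  2, -1])
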
def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
    pad_inplace(values[::-1], mask[::-1], limit=limit)


def backfill_2d_inplace(algos_t[:, :] values,
                        const uint8_t[:, :] mask,
                        limit=None):
    pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)

@cython.boundscheck(False)
@cython.wraparound(False)
def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
    """
    Returns
    -------
    tuple
        is_monotonic_inc : bool
        is_monotonic_dec : bool
        is_strict_monotonic : bool
    """
    cdef:
        Py_ssize_t i, n
        algos_t prev, cur
        bint is_monotonic_inc = 1
        bint is_monotonic_dec = 1
        bint is_unique = 1
        bint is_strict_monotonic = 1

    n = len(arr)

    if n == 1:
        if arr[0] != arr[0] or (timelike and <int64_t>arr[0] == NPY_NAT):
            # single value is NaN
            return False, False, True
        else:
            return True, True, True
    elif n < 2:
        return True, True, True

    if timelike and <int64_t>arr[0] == NPY_NAT:
        return False, False, True

    if algos_t is not object:
        with nogil:
            prev = arr[0]
            for i in range(1, n):
                cur = arr[i]
                if timelike and <int64_t>cur == NPY_NAT:
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                if cur < prev:
                    is_monotonic_inc = 0
                elif cur > prev:
                    is_monotonic_dec = 0
                elif cur == prev:
                    is_unique = 0
                else:
                    # cur or prev is NaN
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                if not is_monotonic_inc and not is_monotonic_dec:
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                prev = cur
    else:
        # object-dtype, identical to above except we cannot use `with nogil`
        prev = arr[0]
        for i in range(1, n):
            cur = arr[i]
            if timelike and <int64_t>cur == NPY_NAT:
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            if cur < prev:
                is_monotonic_inc = 0
            elif cur > prev:
                is_monotonic_dec = 0
            elif cur == prev:
                is_unique = 0
            else:
                # cur or prev is NaN
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            if not is_monotonic_inc and not is_monotonic_dec:
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            prev = cur

    is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec)
    return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic

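
# Usage sketch (illustrative): the repeated 2.0 below keeps the array
# monotonic increasing but not strictly monotonic:
#
#     >>> algos.is_monotonic(np.array([1.0, 2.0, 2.0, 3.0]), timelike=False)
#     (True, False, False)
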
# ----------------------------------------------------------------------
# rank_1d, rank_2d
# ----------------------------------------------------------------------

ctypedef fused rank_t:
    object
    float64_t
    uint64_t
    int64_t


@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d(
    ndarray[rank_t, ndim=1] values,
    const intp_t[:] labels,
    bint is_datetimelike=False,
    ties_method="average",
    bint ascending=True,
    bint pct=False,
    na_option="keep",
):
    """
    Fast NaN-friendly version of ``scipy.stats.rankdata``.

    Parameters
    ----------
    values : array of rank_t values to be ranked
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`. If not called
        from a groupby operation, will be an array of 0's
    is_datetimelike : bool, default False
        True if `values` contains datetime-like entries.
    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    ascending : bool, default True
        False for ranks by high (1) to low (N)
    pct : bool, default False
        Compute percentage rank of data within each group
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        * keep: leave NA values where they are
        * top: smallest rank if ascending
        * bottom: smallest rank if descending
    """
    cdef:
        TiebreakEnumType tiebreak
        Py_ssize_t N
        int64_t[::1] grp_sizes
        intp_t[:] lexsort_indexer
        float64_t[::1] out
        ndarray[rank_t, ndim=1] masked_vals
        rank_t[:] masked_vals_memview
        uint8_t[:] mask
        bint keep_na, check_labels, check_mask
        rank_t nan_fill_val

    tiebreak = tiebreakers[ties_method]
    if tiebreak == TIEBREAK_FIRST:
        if not ascending:
            tiebreak = TIEBREAK_FIRST_DESCENDING

    keep_na = na_option == 'keep'

    N = len(values)
    # TODO Cython 3.0: cast won't be necessary (#2992)
    assert <Py_ssize_t>len(labels) == N
    out = np.empty(N)
    grp_sizes = np.ones(N, dtype=np.int64)

    # If all 0 labels, can short-circuit later label
    # comparisons
    check_labels = np.any(labels)

    # For cases where a mask is not possible, we can avoid mask checks
    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))

    # Copy values into new array in order to fill missing data
    # with mask, without obfuscating location of missing data
    # in values array
    if rank_t is object and values.dtype != np.object_:
        masked_vals = values.astype('O')
    else:
        masked_vals = values.copy()

    if rank_t is object:
        mask = missing.isnaobj(masked_vals)
    elif rank_t is int64_t and is_datetimelike:
        mask = (masked_vals == NPY_NAT).astype(np.uint8)
    elif rank_t is float64_t:
        mask = np.isnan(masked_vals).astype(np.uint8)
    else:
        mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)

    # If `na_option == 'top'`, we want to assign the lowest rank
    # to NaN regardless of ascending/descending. So if ascending,
    # fill with lowest value of type to end up with lowest rank.
    # If descending, fill with highest value since descending
    # will flip the ordering to still end up with lowest rank.
    # Symmetric logic applies to `na_option == 'bottom'`
    if ascending ^ (na_option == 'top'):
        if rank_t is object:
            nan_fill_val = Infinity()
        elif rank_t is int64_t:
            nan_fill_val = util.INT64_MAX
        elif rank_t is uint64_t:
            nan_fill_val = util.UINT64_MAX
        else:
            nan_fill_val = np.inf
        order = (masked_vals, mask, labels)
    else:
        if rank_t is object:
            nan_fill_val = NegInfinity()
        elif rank_t is int64_t:
            nan_fill_val = NPY_NAT
        elif rank_t is uint64_t:
            nan_fill_val = 0
        else:
            nan_fill_val = -np.inf

        order = (masked_vals, ~(np.array(mask, copy=False)), labels)

    np.putmask(masked_vals, mask, nan_fill_val)
    # putmask doesn't accept a memoryview, so we assign as a separate step
    masked_vals_memview = masked_vals

    # lexsort using labels, then mask, then actual values
    # each label corresponds to a different group value,
    # the mask helps you differentiate missing values before
    # performing sort on the actual values
    lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)

    if not ascending:
        lexsort_indexer = lexsort_indexer[::-1]

    with nogil:
        rank_sorted_1d(
            out,
            grp_sizes,
            labels,
            lexsort_indexer,
            masked_vals_memview,
            mask,
            tiebreak,
            check_mask,
            check_labels,
            keep_na,
            N,
        )
    if pct:
        for i in range(N):
            if grp_sizes[i] != 0:
                out[i] = out[i] / grp_sizes[i]

    return np.array(out)

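
# Usage sketch (illustrative): with all-zero labels (i.e. no grouping) and
# the default 'average' tie-breaking, the two tied 1.0s share rank 1.5:
#
#     >>> algos.rank_1d(np.array([1.0, 1.0, 2.0]),
#     ...               labels=np.zeros(3, dtype=np.intp))
#     array([1.5, 1.5, 3. ])
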
@cython.wraparound(False)
@cython.boundscheck(False)
cdef void rank_sorted_1d(
    float64_t[::1] out,
    int64_t[::1] grp_sizes,
    const intp_t[:] labels,
    const intp_t[:] sort_indexer,
    # Can make const with cython3 (https://github.com/cython/cython/issues/3222)
    rank_t[:] masked_vals,
    const uint8_t[:] mask,
    TiebreakEnumType tiebreak,
    bint check_mask,
    bint check_labels,
    bint keep_na,
    Py_ssize_t N,
) nogil:
    """
    See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
    be handled in the caller. Note that `out` and `grp_sizes` are modified inplace.

    Parameters
    ----------
    out : float64_t[::1]
        Array to store computed ranks
    grp_sizes : int64_t[::1]
        Array to store group counts.
    labels : See rank_1d.__doc__
    sort_indexer : intp_t[:]
        Array of indices which sorts masked_vals
    masked_vals : rank_t[:]
        The values input to rank_1d, with missing values replaced by fill values
    mask : uint8_t[:]
        Array where entries are True if the value is missing, False otherwise
    tiebreak : TiebreakEnumType
        See rank_1d.__doc__ for the different modes
    check_mask : bint
        If False, assumes the mask is all False to skip mask indexing
    check_labels : bint
        If False, assumes all labels are the same to skip group handling logic
    keep_na : bint
        Whether or not to keep nulls
    N : Py_ssize_t
        The number of elements to rank. Note: it is not always true that
        N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
    """
    cdef:
        Py_ssize_t i, j, dups=0, sum_ranks=0
        Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
        bint at_end, next_val_diff, group_changed
        int64_t grp_size

    # Loop over the length of the value array
    # each incremental i value can be looked up in the lexsort_indexer
    # array that we sorted previously, which gives us the location of
    # that sorted value for retrieval back from the original
    # values / masked_vals arrays
    # TODO: de-duplicate once cython supports conditional nogil
    if rank_t is object:
        with gil:
            for i in range(N):
                at_end = i == N - 1

                # dups and sum_ranks will be incremented each loop where
                # the value / group remains the same, and should be reset
                # when either of those change. Used to calculate tiebreakers
                dups += 1
                sum_ranks += i - grp_start + 1

                next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
                                                   masked_vals[sort_indexer[i+1]])

                # We'll need this check later anyway to determine group size, so just
                # compute it here since shortcircuiting won't help
                group_changed = at_end or (check_labels and
                                           (labels[sort_indexer[i]]
                                            != labels[sort_indexer[i+1]]))

                # Update out only when there is a transition of values or labels.
                # When a new value or group is encountered, go back #dups steps(
                # the number of occurrence of current value) and assign the ranks
                # based on the starting index of the current group (grp_start)
                # and the current index
                if (next_val_diff or group_changed or (check_mask and
                                                       (mask[sort_indexer[i]]
                                                        ^ mask[sort_indexer[i+1]]))):
                    # If keep_na, check for missing values and assign back
                    # to the result where appropriate
                    if keep_na and check_mask and mask[sort_indexer[i]]:
                        grp_na_count = dups
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = NaN
                    elif tiebreak == TIEBREAK_AVERAGE:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = sum_ranks / <float64_t>dups
                    elif tiebreak == TIEBREAK_MIN:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = i - grp_start - dups + 2
                    elif tiebreak == TIEBREAK_MAX:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = i - grp_start + 1

                    # With n as the previous rank in the group and m as the number
                    # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
                    # then rankings should be n + 1, n + 2 ... n + m
                    elif tiebreak == TIEBREAK_FIRST:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = j + 1 - grp_start

                    # If TIEBREAK_FIRST and descending, the ranking should be
                    # n + m, n + (m - 1) ... n + 1. This is equivalent to
                    # (i - dups + 1) + (i - j + 1) - grp_start
                    elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
                    elif tiebreak == TIEBREAK_DENSE:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = grp_vals_seen

                    # Look forward to the next value (using the sorting in
                    # lexsort_indexer). If the value does not equal the current
                    # value then we need to reset the dups and sum_ranks, knowing
                    # that a new value is coming up. The conditional also needs
                    # to handle nan equality and the end of iteration. If group
                    # changes we do not record seeing a new value in the group
                    if not group_changed and (next_val_diff or (check_mask and
                                              (mask[sort_indexer[i]]
                                               ^ mask[sort_indexer[i+1]]))):
                        dups = sum_ranks = 0
                        grp_vals_seen += 1

                    # Similar to the previous conditional, check now if we are
                    # moving to a new group. If so, keep track of the index where
                    # the new group occurs, so the tiebreaker calculations can
                    # decrement that from their position. Fill in the size of each
                    # group encountered (used by pct calculations later). Also be
                    # sure to reset any of the items helping to calculate dups
                    if group_changed:
                        # If not dense tiebreak, group size used to compute
                        # percentile will be # of non-null elements in group
                        if tiebreak != TIEBREAK_DENSE:
                            grp_size = i - grp_start + 1 - grp_na_count

                        # Otherwise, it will be the number of distinct values
                        # in the group, subtracting 1 if NaNs are present
                        # since that is a distinct value we shouldn't count
                        else:
                            grp_size = grp_vals_seen - (grp_na_count > 0)

                        for j in range(grp_start, i + 1):
                            grp_sizes[sort_indexer[j]] = grp_size

                        dups = sum_ranks = 0
                        grp_na_count = 0
                        grp_start = i + 1
                        grp_vals_seen = 1
    else:
        for i in range(N):
            at_end = i == N - 1

            # dups and sum_ranks will be incremented each loop where
            # the value / group remains the same, and should be reset
            # when either of those change. Used to calculate tiebreakers
            dups += 1
            sum_ranks += i - grp_start + 1

            next_val_diff = at_end or (masked_vals[sort_indexer[i]]
                                       != masked_vals[sort_indexer[i+1]])

            # We'll need this check later anyway to determine group size, so just
            # compute it here since shortcircuiting won't help
            group_changed = at_end or (check_labels and
                                       (labels[sort_indexer[i]]
                                        != labels[sort_indexer[i+1]]))

            # Update out only when there is a transition of values or labels.
            # When a new value or group is encountered, go back #dups steps(
            # the number of occurrence of current value) and assign the ranks
            # based on the starting index of the current group (grp_start)
            # and the current index
            if (next_val_diff or group_changed
                    or (check_mask and
                        (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):
                # If keep_na, check for missing values and assign back
                # to the result where appropriate
                if keep_na and check_mask and mask[sort_indexer[i]]:
                    grp_na_count = dups
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = NaN
                elif tiebreak == TIEBREAK_AVERAGE:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = sum_ranks / <float64_t>dups
                elif tiebreak == TIEBREAK_MIN:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = i - grp_start - dups + 2
                elif tiebreak == TIEBREAK_MAX:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = i - grp_start + 1

                # With n as the previous rank in the group and m as the number
                # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
                # then rankings should be n + 1, n + 2 ... n + m
                elif tiebreak == TIEBREAK_FIRST:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = j + 1 - grp_start

                # If TIEBREAK_FIRST and descending, the ranking should be
                # n + m, n + (m - 1) ... n + 1. This is equivalent to
                # (i - dups + 1) + (i - j + 1) - grp_start
                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
                elif tiebreak == TIEBREAK_DENSE:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = grp_vals_seen

                # Look forward to the next value (using the sorting in
                # lexsort_indexer). If the value does not equal the current
                # value then we need to reset the dups and sum_ranks, knowing
                # that a new value is coming up. The conditional also needs
                # to handle nan equality and the end of iteration. If group
                # changes we do not record seeing a new value in the group
                if not group_changed and (next_val_diff
                                          or (check_mask and
                                              (mask[sort_indexer[i]]
                                               ^ mask[sort_indexer[i+1]]))):
                    dups = sum_ranks = 0
                    grp_vals_seen += 1

                # Similar to the previous conditional, check now if we are
                # moving to a new group. If so, keep track of the index where
                # the new group occurs, so the tiebreaker calculations can
                # decrement that from their position. Fill in the size of each
                # group encountered (used by pct calculations later). Also be
                # sure to reset any of the items helping to calculate dups
                if group_changed:
                    # If not dense tiebreak, group size used to compute
                    # percentile will be # of non-null elements in group
                    if tiebreak != TIEBREAK_DENSE:
                        grp_size = i - grp_start + 1 - grp_na_count

                    # Otherwise, it will be the number of distinct values
                    # in the group, subtracting 1 if NaNs are present
                    # since that is a distinct value we shouldn't count
                    else:
                        grp_size = grp_vals_seen - (grp_na_count > 0)

                    for j in range(grp_start, i + 1):
                        grp_sizes[sort_indexer[j]] = grp_size

                    dups = sum_ranks = 0
                    grp_na_count = 0
                    grp_start = i + 1
                    grp_vals_seen = 1

def rank_2d(
    ndarray[rank_t, ndim=2] in_arr,
    int axis=0,
    bint is_datetimelike=False,
    ties_method="average",
    bint ascending=True,
    na_option="keep",
    bint pct=False,
):
    """
    Fast NaN-friendly version of ``scipy.stats.rankdata``.
    """
    cdef:
        Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
        Py_ssize_t infs
        ndarray[float64_t, ndim=2] ranks
        ndarray[rank_t, ndim=2] values
        ndarray[intp_t, ndim=2] argsort_indexer
        ndarray[uint8_t, ndim=2] mask
        rank_t val, nan_value
        float64_t count, sum_ranks = 0.0
        int tiebreak = 0
        int64_t idx
        bint check_mask, condition, keep_na

    tiebreak = tiebreakers[ties_method]

    keep_na = na_option == 'keep'

    # For cases where a mask is not possible, we can avoid mask checks
    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))

    if axis == 0:
        values = np.asarray(in_arr).T.copy()
    else:
        values = np.asarray(in_arr).copy()

    if rank_t is object:
        if values.dtype != np.object_:
            values = values.astype('O')

    if check_mask:
        if ascending ^ (na_option == 'top'):
            if rank_t is object:
                nan_value = Infinity()
            elif rank_t is float64_t:
                nan_value = np.inf

            # int64 and datetimelike
            else:
                nan_value = util.INT64_MAX

        else:
            if rank_t is object:
                nan_value = NegInfinity()
            elif rank_t is float64_t:
                nan_value = -np.inf

            # int64 and datetimelike
            else:
                nan_value = NPY_NAT

        if rank_t is object:
            mask = missing.isnaobj2d(values)
        elif rank_t is float64_t:
            mask = np.isnan(values).view(np.uint8)

        # int64 and datetimelike
        else:
            mask = (values == NPY_NAT).view(np.uint8)

        np.putmask(values, mask, nan_value)
    else:
        mask = np.zeros_like(values, dtype=np.uint8)

    n, k = (<object>values).shape
    ranks = np.empty((n, k), dtype='f8')

    if tiebreak == TIEBREAK_FIRST:
        # need to use a stable sort here
        argsort_indexer = values.argsort(axis=1, kind='mergesort')
        if not ascending:
            tiebreak = TIEBREAK_FIRST_DESCENDING
    else:
        argsort_indexer = values.argsort(1)

    if not ascending:
        argsort_indexer = argsort_indexer[:, ::-1]

    values = _take_2d(values, argsort_indexer)

    for i in range(n):
        dups = sum_ranks = infs = 0

        total_tie_count = 0
        count = 0.0
        for j in range(k):
            val = values[i, j]
            idx = argsort_indexer[i, j]
            if keep_na and check_mask and mask[i, idx]:
                ranks[i, idx] = NaN
                infs += 1
                continue

            count += 1.0

            sum_ranks += (j - infs) + 1
            dups += 1

            if rank_t is object:
                condition = (
                    j == k - 1 or
                    are_diff(values[i, j + 1], val) or
                    (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
                )
            else:
                condition = (
                    j == k - 1 or
                    values[i, j + 1] != val or
                    (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
                )

            if condition:
                if tiebreak == TIEBREAK_AVERAGE:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
                elif tiebreak == TIEBREAK_MIN:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsort_indexer[i, z]] = j - dups + 2
                elif tiebreak == TIEBREAK_MAX:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsort_indexer[i, z]] = j + 1
                elif tiebreak == TIEBREAK_FIRST:
                    if rank_t is object:
                        raise ValueError('first not supported for non-numeric data')
                    else:
                        for z in range(j - dups + 1, j + 1):
                            ranks[i, argsort_indexer[i, z]] = z + 1
                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
                elif tiebreak == TIEBREAK_DENSE:
                    total_tie_count += 1
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsort_indexer[i, z]] = total_tie_count
                sum_ranks = dups = 0
        if pct:
            if tiebreak == TIEBREAK_DENSE:
                ranks[i, :] /= total_tie_count
            else:
                ranks[i, :] /= count
    if axis == 0:
        return ranks.T
    else:
        return ranks

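
# Usage sketch (illustrative): axis=1 ranks within each row:
#
#     >>> algos.rank_2d(np.array([[3.0, 1.0, 2.0]]), axis=1)
#     array([[3., 1., 2.]])
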
ctypedef fused diff_t:
    float64_t
    float32_t
    int8_t
    int16_t
    int32_t
    int64_t

ctypedef fused out_t:
    float32_t
    float64_t
    int64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def diff_2d(
    ndarray[diff_t, ndim=2] arr,  # TODO(cython 3) update to "const diff_t[:, :] arr"
    ndarray[out_t, ndim=2] out,
    Py_ssize_t periods,
    int axis,
    bint datetimelike=False,
):
    cdef:
        Py_ssize_t i, j, sx, sy, start, stop
        bint f_contig = arr.flags.f_contiguous
        # bint f_contig = arr.is_f_contig()  # TODO(cython 3)
        diff_t left, right

    # Disable for unsupported dtype combinations,
    # see https://github.com/cython/cython/issues/2646
    if (out_t is float32_t
            and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
        raise NotImplementedError
    elif (out_t is float64_t
            and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
        raise NotImplementedError
    elif out_t is int64_t and diff_t is not int64_t:
        # We only have out_t of int64_t if we have datetimelike
        raise NotImplementedError
    else:
        # We put this inside an indented else block to avoid cython build
        # warnings about unreachable code
        sx, sy = (<object>arr).shape
        with nogil:
            if f_contig:
                if axis == 0:
                    if periods >= 0:
                        start, stop = periods, sx
                    else:
                        start, stop = 0, sx + periods
                    for j in range(sy):
                        for i in range(start, stop):
                            left = arr[i, j]
                            right = arr[i - periods, j]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
                else:
                    if periods >= 0:
                        start, stop = periods, sy
                    else:
                        start, stop = 0, sy + periods
                    for j in range(start, stop):
                        for i in range(sx):
                            left = arr[i, j]
                            right = arr[i, j - periods]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
            else:
                if axis == 0:
                    if periods >= 0:
                        start, stop = periods, sx
                    else:
                        start, stop = 0, sx + periods
                    for i in range(start, stop):
                        for j in range(sy):
                            left = arr[i, j]
                            right = arr[i - periods, j]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
                else:
                    if periods >= 0:
                        start, stop = periods, sy
                    else:
                        start, stop = 0, sy + periods
                    for i in range(sx):
                        for j in range(start, stop):
                            left = arr[i, j]
                            right = arr[i, j - periods]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right

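
# Usage sketch (illustrative): differences along axis 0 with periods=1 are
# written into `out`; rows without a predecessor are left untouched, so
# pre-filling with NaN marks them missing:
#
#     >>> arr = np.array([[1.0, 4.0], [2.0, 6.0]])
#     >>> out = np.full_like(arr, np.nan)
#     >>> algos.diff_2d(arr, out, periods=1, axis=0)
#     >>> out
#     array([[nan, nan],
#            [ 1.,  2.]])
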
# generated from template
include "algos_common_helper.pxi"
include "algos_take_helper.pxi"