# groupby.pyx — grouped aggregation kernels (Cython)
  1. import cython
  2. from cython import Py_ssize_t
  3. from cython cimport floating
  4. from libc.stdlib cimport (
  5. free,
  6. malloc,
  7. )
  8. import numpy as np
  9. cimport numpy as cnp
  10. from numpy cimport (
  11. complex64_t,
  12. complex128_t,
  13. float32_t,
  14. float64_t,
  15. int8_t,
  16. int16_t,
  17. int32_t,
  18. int64_t,
  19. intp_t,
  20. ndarray,
  21. uint8_t,
  22. uint16_t,
  23. uint32_t,
  24. uint64_t,
  25. )
  26. from numpy.math cimport NAN
  27. cnp.import_array()
  28. from pandas._libs.algos cimport kth_smallest_c
  29. from pandas._libs.util cimport (
  30. get_nat,
  31. numeric,
  32. )
  33. from pandas._libs.algos import (
  34. ensure_platform_int,
  35. groupsort_indexer,
  36. rank_1d,
  37. take_2d_axis1_float64_float64,
  38. )
  39. from pandas._libs.missing cimport checknull
# Sentinel used by datetime-like int64 data to encode NaT (missing).
cdef int64_t NPY_NAT = get_nat()

# Largest representable int64, kept at module level as a Python int.
_int64_max = np.iinfo(np.int64).max

# Module-level float64 NaN constant, usable inside nogil blocks.
cdef float64_t NaN = <float64_t>np.NaN
# Interpolation strategies accepted by group_quantile (keys of its
# `inter_methods` mapping).
cdef enum InterpolationEnumType:
    INTERPOLATION_LINEAR,
    INTERPOLATION_LOWER,
    INTERPOLATION_HIGHER,
    INTERPOLATION_NEAREST,
    INTERPOLATION_MIDPOINT
cdef inline float64_t median_linear(float64_t* a, int n) nogil:
    # Median of the first `n` elements of `a`, skipping NaNs.
    # Returns NaN for an empty or all-NaN input.
    cdef:
        int i, j, na_count = 0
        float64_t result
        float64_t* tmp

    if n == 0:
        return NaN

    # count NAs (NaN is the only float for which x != x)
    for i in range(n):
        if a[i] != a[i]:
            na_count += 1

    if na_count:
        if na_count == n:
            return NaN

        # Compact the non-NaN values into a fresh scratch buffer so the
        # selection below sees a dense array.
        # NOTE(review): malloc result is not checked for NULL.
        tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))

        j = 0
        for i in range(n):
            if a[i] == a[i]:
                tmp[j] = a[i]
                j += 1

        a = tmp
        n -= na_count

    if n % 2:
        # Odd length: the middle order statistic.
        result = kth_smallest_c(a, n // 2, n)
    else:
        # Even length: average of the two middle order statistics.
        result = (kth_smallest_c(a, n // 2, n) +
                  kth_smallest_c(a, n // 2 - 1, n)) / 2

    if na_count:
        # In this branch `a` points at the malloc'd scratch buffer.
        free(a)

    return result
@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(ndarray[float64_t, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[float64_t, ndim=2] values,
                         ndarray[intp_t] labels,
                         Py_ssize_t min_count=-1) -> None:
    """
    Only aggregates on axis=0.

    Writes the NaN-skipping median of every (group, column) pair into
    `out` (shape (ngroups, K)).  `counts` receives the per-group sizes,
    excluding the NA group (label -1).
    """
    cdef:
        Py_ssize_t i, j, N, K, ngroups, size
        ndarray[intp_t] _counts
        ndarray[float64_t, ndim=2] data
        ndarray[intp_t] indexer
        float64_t* ptr

    assert min_count == -1, "'min_count' only used in add and prod"

    ngroups = len(counts)
    N, K = (<object>values).shape

    # Sort rows by group; `_counts[0]` holds the size of the NA (-1) group.
    indexer, _counts = groupsort_indexer(labels, ngroups)
    counts[:] = _counts[1:]

    # Gather the group-sorted values (transposed) into a C-contiguous
    # buffer so each group's data for a column is a contiguous run that
    # `ptr` can walk.
    data = np.empty((K, N), dtype=np.float64)
    ptr = <float64_t*>cnp.PyArray_DATA(data)

    take_2d_axis1_float64_float64(values.T, indexer, out=data)

    with nogil:
        for i in range(K):
            # exclude NA group
            ptr += _counts[0]
            for j in range(ngroups):
                size = _counts[j + 1]
                out[j, i] = median_linear(ptr, size)
                ptr += size
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumprod_float64(float64_t[:, ::1] out,
                          const float64_t[:, :] values,
                          const intp_t[:] labels,
                          int ngroups,
                          bint is_datetimelike,
                          bint skipna=True) -> None:
    """
    Cumulative product of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[np.float64, ndim=2]
        Array to store cumprod in.
    values : np.ndarray[np.float64, ndim=2]
        Values to take cumprod of.
    labels : np.ndarray[np.intp]
        Labels to group by.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    is_datetimelike : bool
        Always false, `values` is never datetime-like.
    skipna : bool
        If true, ignore nans in `values`.

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        float64_t val
        float64_t[:, ::1] accum
        intp_t lab

    N, K = (<object>values).shape
    # Per-(group, column) running product, seeded with the multiplicative
    # identity.
    accum = np.ones((ngroups, K), dtype=np.float64)

    with nogil:
        for i in range(N):
            lab = labels[i]

            # Negative labels denote the NA group; its rows are skipped.
            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:
                    accum[lab, j] *= val
                    out[i, j] = accum[lab, j]
                else:
                    out[i, j] = NaN
                    if not skipna:
                        # Poison the running product so all later entries
                        # of this (group, column) are NaN too.
                        # NOTE(review): the break also skips the remaining
                        # columns of this row — confirm intended.
                        accum[lab, j] = NaN
                        break
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumsum(numeric[:, ::1] out,
                 ndarray[numeric, ndim=2] values,
                 const intp_t[:] labels,
                 int ngroups,
                 is_datetimelike,
                 bint skipna=True) -> None:
    """
    Cumulative sum of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[ndim=2]
        Array to store cumsum in.
    values : np.ndarray[ndim=2]
        Values to take cumsum of.
    labels : np.ndarray[np.intp]
        Labels to group by.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    is_datetimelike : bool
        True if `values` contains datetime-like entries.
    skipna : bool
        If true, ignore nans in `values`.

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        numeric val, y, t
        numeric[:, ::1] accum, compensation
        intp_t lab

    N, K = (<object>values).shape
    # Running sum and Kahan compensation term per (group, column).
    accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
    compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

    with nogil:
        for i in range(N):
            lab = labels[i]

            # Negative labels denote the NA group; its rows are skipped.
            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]

                # For floats, use Kahan summation to reduce floating-point
                # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
                if numeric == float32_t or numeric == float64_t:
                    if val == val:
                        y = val - compensation[lab, j]
                        t = accum[lab, j] + y
                        compensation[lab, j] = t - accum[lab, j] - y
                        accum[lab, j] = t
                        out[i, j] = t
                    else:
                        out[i, j] = NaN
                        if not skipna:
                            # Poison the running sum so all later entries of
                            # this (group, column) are NaN too.
                            # NOTE(review): the break also skips the remaining
                            # columns of this row — confirm intended.
                            accum[lab, j] = NaN
                            break
                else:
                    # Integer path: plain accumulation, no compensation.
                    t = val + accum[lab, j]
                    accum[lab, j] = t
                    out[i, j] = t
@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels,
                        int ngroups, int periods) -> None:
    """
    For each row, compute the index of the row `periods` group-members
    earlier (for positive `periods`) or later (for negative `periods`)
    within the same group, writing -1 where no such row exists.

    Parameters
    ----------
    out : np.ndarray[np.int64]
        Receives, per row, the source index to take from, or -1.
    labels : np.ndarray[np.intp]
        Group label per row; -1 marks null keys.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    periods : int
        Shift magnitude and direction; 0 yields the identity indexer.

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object.
    """
    cdef:
        Py_ssize_t N, i, j, ii, lab
        int offset = 0, sign
        int64_t idxer, idxer_slot
        int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
        int64_t[:, ::1] label_indexer

    N, = (<object>labels).shape

    if periods < 0:
        # Shifting backwards: walk the rows in reverse order.
        periods = -periods
        offset = N - 1
        sign = -1
    elif periods > 0:
        offset = 0
        sign = 1

    if periods == 0:
        with nogil:
            for i in range(N):
                out[i] = i
    else:
        # array of each previous indexer seen, a ring buffer of the last
        # `periods` row indices per group
        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
        with nogil:
            for i in range(N):
                # reverse iterator if shifting backwards
                ii = offset + sign * i
                lab = labels[ii]

                # Skip null keys
                if lab == -1:
                    out[ii] = -1
                    continue

                label_seen[lab] += 1

                idxer_slot = label_seen[lab] % periods
                idxer = label_indexer[lab, idxer_slot]

                # Only emit a source index once the group has seen more
                # than `periods` rows; earlier rows have nothing to shift
                # from.
                if label_seen[lab] > periods:
                    out[ii] = idxer
                else:
                    out[ii] = -1
                label_indexer[lab, idxer_slot] = ii
@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
                         ndarray[uint8_t] mask, str direction,
                         int64_t limit, bint dropna) -> None:
    """
    Indexes how to fill values forwards or backwards within a group.

    Parameters
    ----------
    out : np.ndarray[np.int64]
        Values into which this method will write its results.
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`.
    mask : np.ndarray[np.uint8]
        Indicating whether a value is na or not.
    direction : {'ffill', 'bfill'}
        Direction for fill to be applied (forwards or backwards, respectively)
    limit : Consecutive values to fill before stopping, or -1 for no limit
    dropna : Flag to indicate if NaN groups should return all NaN values

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object
    """
    cdef:
        Py_ssize_t i, N, idx
        intp_t[:] sorted_labels
        intp_t curr_fill_idx=-1
        int64_t filled_vals = 0

    N = len(out)

    # Make sure all arrays are the same size
    assert N == len(labels) == len(mask)

    # Stable sort keeps the original row order within each group.
    sorted_labels = np.argsort(labels, kind='mergesort').astype(
        np.intp, copy=False)
    if direction == 'bfill':
        # Backward fill is a forward fill over the reversed order.
        sorted_labels = sorted_labels[::-1]

    with nogil:
        for i in range(N):
            idx = sorted_labels[i]
            if dropna and labels[idx] == -1:  # nan-group gets nan-values
                curr_fill_idx = -1
            elif mask[idx] == 1:  # is missing
                # Stop filling once we've hit the limit
                if filled_vals >= limit and limit != -1:
                    curr_fill_idx = -1
                filled_vals += 1
            else:  # reset items when not missing
                filled_vals = 0
                curr_fill_idx = idx

            out[idx] = curr_fill_idx

            # If we move to the next group, reset
            # the fill_idx and counter
            if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
                curr_fill_idx = -1
                filled_vals = 0
@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(int8_t[::1] out,
                  const int8_t[::1] values,
                  const intp_t[:] labels,
                  const uint8_t[::1] mask,
                  str val_test,
                  bint skipna,
                  bint nullable) -> None:
    """
    Aggregated boolean values to show truthfulness of group elements. If the
    input is a nullable type (nullable=True), the result will be computed
    using Kleene logic.

    Parameters
    ----------
    out : np.ndarray[np.int8]
        Values into which this method will write its results.
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its
        ordering matching up to the corresponding record in `values`
    values : np.ndarray[np.int8]
        Containing the truth value of each element.
    mask : np.ndarray[np.uint8]
        Indicating whether a value is na or not.
    val_test : {'any', 'all'}
        String object dictating whether to use any or all truth testing
    skipna : bool
        Flag to ignore nan values during truth testing
    nullable : bool
        Whether or not the input is a nullable type. If True, the
        result will be computed using Kleene logic

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object.
    The returned values will either be 0, 1 (False or True, respectively), or
    -1 to signify a masked position in the case of a nullable input.
    """
    cdef:
        Py_ssize_t i, N = len(labels)
        intp_t lab
        int8_t flag_val

    if val_test == 'all':
        # Because the 'all' value of an empty iterable in Python is True we can
        # start with an array full of ones and set to zero when a False value
        # is encountered
        flag_val = 0
    elif val_test == 'any':
        # Because the 'any' value of an empty iterable in Python is False we
        # can start with an array full of zeros and set to one only if any
        # value encountered is True
        flag_val = 1
    else:
        raise ValueError("'bool_func' must be either 'any' or 'all'!")

    # Seed every group with the identity element of the chosen test.
    out[:] = 1 - flag_val

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0 or (skipna and mask[i]):
                continue

            if nullable and mask[i]:
                # Set the position as masked if `out[lab] != flag_val`, which
                # would indicate True/False has not yet been seen for any/all,
                # so by Kleene logic the result is currently unknown
                if out[lab] != flag_val:
                    out[lab] = -1
                continue

            # If True and 'any' or False and 'all', the result is
            # already determined
            if values[i] == flag_val:
                out[lab] = flag_val
  391. # ----------------------------------------------------------------------
  392. # group_add, group_prod, group_var, group_mean, group_ohlc
  393. # ----------------------------------------------------------------------
# Fused type of the value dtypes accepted by group_add.
ctypedef fused add_t:
    float64_t
    float32_t
    complex64_t
    complex128_t
    object
@cython.wraparound(False)
@cython.boundscheck(False)
def group_add(add_t[:, ::1] out,
              int64_t[::1] counts,
              ndarray[add_t, ndim=2] values,
              const intp_t[:] labels,
              Py_ssize_t min_count=0) -> None:
    """
    Only aggregates on axis=0 using Kahan summation.

    Sums the non-NA values of each (group, column) into `out`; groups
    with fewer than `min_count` observations get NAN.  `counts` receives
    per-group row counts.  An `object`-dtype path runs with the GIL and
    without Kahan compensation.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        add_t val, t, y
        add_t[:, ::1] sumx, compensation
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    # the below is equivalent to `np.zeros_like(out)` but faster
    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    if add_t is object:
        # NB: this does not use 'compensation' like the non-object track does.
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if not checknull(val):
                    nobs[lab, j] += 1

                    if nobs[lab, j] == 1:
                        # i.e. we havent added anything yet; avoid TypeError
                        # if e.g. val is a str and sumx[lab, j] is 0
                        t = val
                    else:
                        t = sumx[lab, j] + val
                    sumx[lab, j] = t

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j]
    else:
        with nogil:
            for i in range(N):
                lab = labels[i]
                if lab < 0:
                    continue

                counts[lab] += 1
                for j in range(K):
                    val = values[i, j]

                    # not nan
                    if val == val:
                        nobs[lab, j] += 1
                        # Kahan update: `compensation` carries the rounding
                        # error of the previous addition.
                        y = val - compensation[lab, j]
                        t = sumx[lab, j] + y
                        compensation[lab, j] = t - sumx[lab, j] - y
                        sumx[lab, j] = t

            for i in range(ncounts):
                for j in range(K):
                    if nobs[i, j] < min_count:
                        out[i, j] = NAN
                    else:
                        out[i, j] = sumx[i, j]
@cython.wraparound(False)
@cython.boundscheck(False)
def group_prod(floating[:, ::1] out,
               int64_t[::1] counts,
               ndarray[floating, ndim=2] values,
               const intp_t[:] labels,
               Py_ssize_t min_count=0) -> None:
    """
    Only aggregates on axis=0.

    Writes the product of each (group, column)'s non-NA values into
    `out`; groups with fewer than `min_count` observations get NAN.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count
        floating[:, ::1] prodx
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    # Seed with the multiplicative identity.
    prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    prodx[lab, j] *= val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = prodx[i, j]
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def group_var(floating[:, ::1] out,
              int64_t[::1] counts,
              ndarray[floating, ndim=2] values,
              const intp_t[:] labels,
              Py_ssize_t min_count=-1,
              int64_t ddof=1) -> None:
    """
    Compute the variance per (group, column), aggregating on axis=0.

    Uses a Welford-style single pass: each observation updates the
    running mean and the accumulated sum of squared deviations (kept in
    `out` until the final division).  Groups with `ddof` or fewer non-NA
    observations receive NAN.

    This method modifies the `out` parameter rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, ct, oldmean
        floating[:, ::1] mean
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)

    assert min_count == -1, "'min_count' only used in add and prod"

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    oldmean = mean[lab, j]
                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
                    # Accumulate M2 = sum of squared deviations.
                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)

        for i in range(ncounts):
            for j in range(K):
                ct = nobs[i, j]
                if ct <= ddof:
                    out[i, j] = NAN
                else:
                    out[i, j] /= (ct - ddof)
@cython.wraparound(False)
@cython.boundscheck(False)
def group_mean(floating[:, ::1] out,
               int64_t[::1] counts,
               ndarray[floating, ndim=2] values,
               const intp_t[::1] labels,
               Py_ssize_t min_count=-1) -> None:
    """
    Compute the mean per (group, column), aggregating on axis=0.

    The numerator is accumulated with Kahan summation; groups with no
    non-NA observations receive NAN.

    This method modifies the `out` parameter rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count, y, t
        floating[:, ::1] sumx, compensation
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)

    assert min_count == -1, "'min_count' only used in add and prod"

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    # the below is equivalent to `np.zeros_like(out)` but faster
    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    # Kahan update: `compensation` carries the rounding
                    # error of the previous addition.
                    y = val - compensation[lab, j]
                    t = sumx[lab, j] + y
                    compensation[lab, j] = t - sumx[lab, j] - y
                    sumx[lab, j] = t

        for i in range(ncounts):
            for j in range(K):
                count = nobs[i, j]
                if nobs[i, j] == 0:
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j] / count
@cython.wraparound(False)
@cython.boundscheck(False)
def group_ohlc(floating[:, ::1] out,
               int64_t[::1] counts,
               ndarray[floating, ndim=2] values,
               const intp_t[:] labels,
               Py_ssize_t min_count=-1) -> None:
    """
    Only aggregates on axis=0.

    For the single column of `values`, writes per group the
    open/high/low/close — i.e. first, max, min, and last non-NA value —
    into the four columns of `out`.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab
        floating val

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(labels) == 0:
        return

    N, K = (<object>values).shape

    if out.shape[1] != 4:
        raise ValueError('Output array must have 4 columns')

    if K > 1:
        raise NotImplementedError("Argument 'values' must have only one dimension")
    out[:] = np.nan

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab == -1:
                continue

            counts[lab] += 1
            val = values[i, 0]
            if val != val:
                continue

            if out[lab, 0] != out[lab, 0]:
                # First non-NA value of this group seeds all four columns.
                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
            else:
                out[lab, 1] = max(out[lab, 1], val)
                out[lab, 2] = min(out[lab, 2], val)
                out[lab, 3] = val
@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
                   ndarray[numeric, ndim=1] values,
                   ndarray[intp_t] labels,
                   ndarray[uint8_t] mask,
                   float64_t q,
                   str interpolation) -> None:
    """
    Calculate the quantile per group.

    Parameters
    ----------
    out : np.ndarray[np.float64]
        Array of aggregated values that will be written to.
    values : np.ndarray
        Array containing the values to apply the function against.
    labels : ndarray[np.intp]
        Array containing the unique group labels.
    mask : ndarray[np.uint8]
        Flag per entry of `values` marking it as NA.
    q : float
        The quantile value to search for.
    interpolation : {'linear', 'lower', 'higher', 'nearest', 'midpoint'}

    Notes
    -----
    Rather than explicitly returning a value, this function modifies the
    provided `out` parameter.
    """
    cdef:
        Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
        Py_ssize_t grp_start=0, idx=0
        intp_t lab
        uint8_t interp
        float64_t q_idx, frac, val, next_val
        ndarray[int64_t] counts, non_na_counts, sort_arr

    assert values.shape[0] == N

    if not (0 <= q <= 1):
        raise ValueError(f"'q' must be between 0 and 1. Got '{q}' instead")

    inter_methods = {
        'linear': INTERPOLATION_LINEAR,
        'lower': INTERPOLATION_LOWER,
        'higher': INTERPOLATION_HIGHER,
        'nearest': INTERPOLATION_NEAREST,
        'midpoint': INTERPOLATION_MIDPOINT,
    }
    interp = inter_methods[interpolation]

    counts = np.zeros_like(out, dtype=np.int64)
    non_na_counts = np.zeros_like(out, dtype=np.int64)
    ngroups = len(counts)

    # First figure out the size of every group
    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab == -1:  # NA group label
                continue

            counts[lab] += 1
            if not mask[i]:
                non_na_counts[lab] += 1

    # Get an index of values sorted by labels and then values
    if labels.any():
        # Put '-1' (NaN) labels as the last group so it does not interfere
        # with the calculations.
        labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels)
    else:
        labels_for_lexsort = labels
    order = (values, labels_for_lexsort)
    sort_arr = np.lexsort(order).astype(np.int64, copy=False)

    with nogil:
        for i in range(ngroups):
            # Figure out how many group elements there are
            grp_sz = counts[i]
            non_na_sz = non_na_counts[i]

            if non_na_sz == 0:
                out[i] = NaN
            else:
                # Calculate where to retrieve the desired value
                # Casting to int will intentionally truncate result
                idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))

                val = values[sort_arr[idx]]
                # If requested quantile falls evenly on a particular index
                # then write that index's value out. Otherwise interpolate
                q_idx = q * (non_na_sz - 1)
                frac = q_idx % 1

                if frac == 0.0 or interp == INTERPOLATION_LOWER:
                    out[i] = val
                else:
                    next_val = values[sort_arr[idx + 1]]
                    if interp == INTERPOLATION_LINEAR:
                        out[i] = val + (next_val - val) * frac
                    elif interp == INTERPOLATION_HIGHER:
                        out[i] = next_val
                    elif interp == INTERPOLATION_MIDPOINT:
                        out[i] = (val + next_val) / 2.0
                    elif interp == INTERPOLATION_NEAREST:
                        if frac > .5 or (frac == .5 and q > .5):  # Always OK?
                            out[i] = next_val
                        else:
                            out[i] = val

            # Increment the index reference in sorted_arr for the next group
            grp_start += grp_sz
  730. # ----------------------------------------------------------------------
  731. # group_nth, group_last, group_rank
  732. # ----------------------------------------------------------------------
# Fused type of the value dtypes accepted by the group_nth/group_last
# (and, per the section header above, group_rank) kernels.
ctypedef fused rank_t:
    float64_t
    float32_t
    int64_t
    uint64_t
    object
cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
    # Return True if `val` should be treated as missing for dtype `rank_t`.
    if rank_t is object:
        # Should never be used, but we need to avoid the `val != val` below
        # or else cython will raise about gil acquisition.
        raise NotImplementedError

    elif rank_t is int64_t:
        # int64 is only NA when it encodes a datetime-like NaT (NPY_NAT).
        return is_datetimelike and val == NPY_NAT
    elif rank_t is uint64_t:
        # There is no NA value for uint64
        return False
    else:
        # Float path: NaN is the only value for which val != val.
        return val != val
  751. # GH#31710 use memorviews once cython 0.30 is released so we can
  752. # use `const rank_t[:, :] values`
@cython.wraparound(False)
@cython.boundscheck(False)
def group_last(rank_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[rank_t, ndim=2] values,
               const intp_t[:] labels,
               Py_ssize_t min_count=-1) -> None:
    """
    Only aggregates on axis=0.

    Writes the last non-NA value of each (group, column) into `out`.
    Groups with fewer than `max(min_count, 1)` non-NA observations get
    the dtype's NA marker (None / NPY_NAT / NAN); for uint64, which has
    no NA marker, a RuntimeError is raised instead.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        rank_t val
        ndarray[rank_t, ndim=2] resx
        ndarray[int64_t, ndim=2] nobs
        bint runtime_error = False

    # TODO(cython 3.0):
    # Instead of `labels.shape[0]` use `len(labels)`
    if not len(values) == labels.shape[0]:
        raise AssertionError("len(index) != len(labels)")

    min_count = max(min_count, 1)
    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    if rank_t is object:
        resx = np.empty((<object>out).shape, dtype=object)
    else:
        resx = np.empty_like(out)

    N, K = (<object>values).shape

    if rank_t is object:
        # Object path runs with the GIL held.
        # TODO: De-duplicate once conditional-nogil is available
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                if not checknull(val):
                    # NB: use _treat_as_na here once
                    # conditional-nogil is available.
                    nobs[lab, j] += 1
                    # Later rows overwrite earlier ones -> last wins.
                    resx[lab, j] = val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = None
                else:
                    out[i, j] = resx[i, j]
    else:
        with nogil:
            for i in range(N):
                lab = labels[i]
                if lab < 0:
                    continue

                counts[lab] += 1
                for j in range(K):
                    val = values[i, j]

                    if not _treat_as_na(val, True):
                        # TODO: Sure we always want is_datetimelike=True?
                        nobs[lab, j] += 1
                        # Later rows overwrite earlier ones -> last wins.
                        resx[lab, j] = val

            for i in range(ncounts):
                for j in range(K):
                    if nobs[i, j] < min_count:
                        if rank_t is int64_t:
                            out[i, j] = NPY_NAT
                        elif rank_t is uint64_t:
                            runtime_error = True
                            break
                        else:
                            out[i, j] = NAN
                    else:
                        out[i, j] = resx[i, j]

    if runtime_error:
        # We cannot raise directly above because that is within a nogil
        # block.
        raise RuntimeError("empty group with uint64_t")
# GH#31710 use memoryviews once Cython 3.0 is released so we can
# use `const rank_t[:, :] values`
@cython.wraparound(False)
@cython.boundscheck(False)
def group_nth(rank_t[:, ::1] out,
              int64_t[::1] counts,
              ndarray[rank_t, ndim=2] values,
              const intp_t[:] labels,
              int64_t min_count=-1,
              int64_t rank=1,
              ) -> None:
    """
    Only aggregates on axis=0

    Writes the `rank`-th non-NA value of each group/column into ``out``.

    Parameters
    ----------
    out : np.ndarray[rank_t, ndim=2]
        Array to store the result in; one row per group.
    counts : np.ndarray[int64]
        Input as a zeroed array, populated with group sizes during the
        algorithm.
    values : np.ndarray[rank_t, ndim=2]
        Values to aggregate.
    labels : np.ndarray[np.intp]
        Group label for each row of `values`; negative labels are skipped.
    min_count : int64_t, default -1
        Minimum number of non-NA observations per group/column required to
        produce a value; clamped below to 1, so the default behaves as 1.
    rank : int64_t, default 1
        1-based position of the non-NA value to take within each group.

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        rank_t val
        ndarray[rank_t, ndim=2] resx
        ndarray[int64_t, ndim=2] nobs
        # Set inside the nogil block (where we cannot raise) and checked at
        # the end: uint64 has no NA sentinel to write for an empty group.
        bint runtime_error = False

    # TODO(cython 3.0):
    # Instead of `labels.shape[0]` use `len(labels)`
    if not len(values) == labels.shape[0]:
        raise AssertionError("len(index) != len(labels)")

    min_count = max(min_count, 1)
    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    if rank_t is object:
        resx = np.empty((<object>out).shape, dtype=object)
    else:
        resx = np.empty_like(out)

    N, K = (<object>values).shape

    if rank_t is object:
        # Object dtype requires the GIL (checknull is a Python-level call).
        # TODO: De-duplicate once conditional-nogil is available
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                if not checknull(val):
                    # NB: use _treat_as_na here once
                    # conditional-nogil is available.
                    nobs[lab, j] += 1
                    # nobs doubles as the 1-based count of non-NA values
                    # seen so far, so this picks the `rank`-th one.
                    if nobs[lab, j] == rank:
                        resx[lab, j] = val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = None
                else:
                    out[i, j] = resx[i, j]
    else:
        with nogil:
            for i in range(N):
                lab = labels[i]
                if lab < 0:
                    continue

                counts[lab] += 1
                for j in range(K):
                    val = values[i, j]

                    if not _treat_as_na(val, True):
                        # TODO: Sure we always want is_datetimelike=True?
                        nobs[lab, j] += 1
                        # Pick the `rank`-th non-NA value seen in this group.
                        if nobs[lab, j] == rank:
                            resx[lab, j] = val

            for i in range(ncounts):
                for j in range(K):
                    if nobs[i, j] < min_count:
                        if rank_t is int64_t:
                            out[i, j] = NPY_NAT
                        elif rank_t is uint64_t:
                            # No NA representation for uint64; defer raising
                            # until after the nogil block.
                            runtime_error = True
                            break
                        else:
                            out[i, j] = NAN
                    else:
                        out[i, j] = resx[i, j]

    if runtime_error:
        # We cannot raise directly above because that is within a nogil
        # block.
        raise RuntimeError("empty group with uint64_t")
@cython.boundscheck(False)
@cython.wraparound(False)
def group_rank(float64_t[:, ::1] out,
               ndarray[rank_t, ndim=2] values,
               const intp_t[:] labels,
               int ngroups,
               bint is_datetimelike, str ties_method="average",
               bint ascending=True, bint pct=False, str na_option="keep") -> None:
    """
    Provides the rank of values within each group.

    Parameters
    ----------
    out : np.ndarray[np.float64, ndim=2]
        Values to which this method will write its results.
    values : np.ndarray of rank_t values to be ranked
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`
    ngroups : int
        This parameter is not used, is needed to match signatures of other
        groupby functions.
    is_datetimelike : bool
        True if `values` contains datetime-like entries.
    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    ascending : bool, default True
        False for ranks by high (1) to low (N)
    pct : bool, default False
        Compute percentage rank of data within each group
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        * keep: leave NA values where they are
        * top: smallest rank if ascending
        * bottom: smallest rank if descending

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object
    """
    cdef:
        Py_ssize_t i, k, N
        ndarray[float64_t, ndim=1] result

    # Rank each column independently; rank_1d handles the grouping itself
    # via `labels`.
    N = values.shape[1]

    for k in range(N):
        result = rank_1d(
            values=values[:, k],
            labels=labels,
            is_datetimelike=is_datetimelike,
            ties_method=ties_method,
            ascending=ascending,
            pct=pct,
            na_option=na_option
        )
        for i in range(len(result)):
            # TODO: why cant we do out[:, k] = result?
            out[i, k] = result[i]
# ----------------------------------------------------------------------
# group_min, group_max
# ----------------------------------------------------------------------

# TODO: consider implementing for more dtypes
# Fused type enumerating the dtypes accepted by the min/max and
# cummin/cummax kernels below.
ctypedef fused groupby_t:
    float64_t
    float32_t
    int64_t
    uint64_t
@cython.wraparound(False)
@cython.boundscheck(False)
cdef group_min_max(groupby_t[:, ::1] out,
                   int64_t[::1] counts,
                   ndarray[groupby_t, ndim=2] values,
                   const intp_t[:] labels,
                   Py_ssize_t min_count=-1,
                   bint is_datetimelike=False,
                   bint compute_max=True):
    """
    Compute minimum/maximum of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[groupby_t, ndim=2]
        Array to store result in.
    counts : np.ndarray[int64]
        Input as a zeroed array, populated by group sizes during algorithm
    values : array
        Values to find column-wise min/max of.
    labels : np.ndarray[np.intp]
        Labels to group by.
    min_count : Py_ssize_t, default -1
        The minimum number of non-NA group elements, NA result if threshold
        is not met
    is_datetimelike : bool
        True if `values` contains datetime-like entries.
    compute_max : bint, default True
        True to compute group-wise max, False to compute min

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    `counts` is modified to hold group sizes
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
        groupby_t val, nan_val
        ndarray[groupby_t, ndim=2] group_min_or_max
        # Set inside the nogil block (where we cannot raise) and checked at
        # the end: uint64 has no NA sentinel to write for an empty group.
        bint runtime_error = False
        int64_t[:, ::1] nobs

    # TODO(cython 3.0):
    # Instead of `labels.shape[0]` use `len(labels)`
    if not len(values) == labels.shape[0]:
        raise AssertionError("len(index) != len(labels)")

    min_count = max(min_count, 1)
    nobs = np.zeros((<object>out).shape, dtype=np.int64)

    # Seed the running extreme with the identity element for min/max so the
    # first observed value always wins the comparison.
    group_min_or_max = np.empty_like(out)
    if groupby_t is int64_t:
        group_min_or_max[:] = -_int64_max if compute_max else _int64_max
        nan_val = NPY_NAT
    elif groupby_t is uint64_t:
        # NB: We do not define nan_val because there is no such thing
        # for uint64_t. We carefully avoid having to reference it in this
        # case.
        group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
    else:
        group_min_or_max[:] = -np.inf if compute_max else np.inf
        nan_val = NAN

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                if not _treat_as_na(val, is_datetimelike):
                    nobs[lab, j] += 1
                    # `compute_max` is a compile-time-constant-like flag, so
                    # only one of these branches is taken per call.
                    if compute_max:
                        if val > group_min_or_max[lab, j]:
                            group_min_or_max[lab, j] = val
                    else:
                        if val < group_min_or_max[lab, j]:
                            group_min_or_max[lab, j] = val

        for i in range(ngroups):
            for j in range(K):
                if nobs[i, j] < min_count:
                    if groupby_t is uint64_t:
                        # Cannot raise here (nogil); remaining outputs are
                        # irrelevant since we raise below anyway.
                        runtime_error = True
                        break
                    else:
                        out[i, j] = nan_val
                else:
                    out[i, j] = group_min_or_max[i, j]

    if runtime_error:
        # We cannot raise directly above because that is within a nogil
        # block.
        raise RuntimeError("empty group with uint64_t")
  1067. @cython.wraparound(False)
  1068. @cython.boundscheck(False)
  1069. def group_max(groupby_t[:, ::1] out,
  1070. int64_t[::1] counts,
  1071. ndarray[groupby_t, ndim=2] values,
  1072. const intp_t[:] labels,
  1073. Py_ssize_t min_count=-1,
  1074. bint is_datetimelike=False) -> None:
  1075. """See group_min_max.__doc__"""
  1076. group_min_max(
  1077. out,
  1078. counts,
  1079. values,
  1080. labels,
  1081. min_count=min_count,
  1082. is_datetimelike=is_datetimelike,
  1083. compute_max=True,
  1084. )
  1085. @cython.wraparound(False)
  1086. @cython.boundscheck(False)
  1087. def group_min(groupby_t[:, ::1] out,
  1088. int64_t[::1] counts,
  1089. ndarray[groupby_t, ndim=2] values,
  1090. const intp_t[:] labels,
  1091. Py_ssize_t min_count=-1,
  1092. bint is_datetimelike=False) -> None:
  1093. """See group_min_max.__doc__"""
  1094. group_min_max(
  1095. out,
  1096. counts,
  1097. values,
  1098. labels,
  1099. min_count=min_count,
  1100. is_datetimelike=is_datetimelike,
  1101. compute_max=False,
  1102. )
@cython.boundscheck(False)
@cython.wraparound(False)
cdef group_cummin_max(groupby_t[:, ::1] out,
                      ndarray[groupby_t, ndim=2] values,
                      uint8_t[:, ::1] mask,
                      const intp_t[:] labels,
                      int ngroups,
                      bint is_datetimelike,
                      bint compute_max):
    """
    Cumulative minimum/maximum of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[groupby_t, ndim=2]
        Array to store cummin/max in.
    values : np.ndarray[groupby_t, ndim=2]
        Values to take cummin/max of.
    mask : np.ndarray[bool] or None
        If not None, indices represent missing values,
        otherwise the mask will not be used
    labels : np.ndarray[np.intp]
        Labels to group by.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    is_datetimelike : bool
        True if `values` contains datetime-like entries.
    compute_max : bool
        True if cumulative maximum should be computed, False
        if cumulative minimum should be computed

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    """
    cdef:
        # Running extreme per (group, column); seeded with the identity
        # element so the first observed value always wins.
        groupby_t[:, ::1] accum

    accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
    if groupby_t is int64_t:
        accum[:] = -_int64_max if compute_max else _int64_max
    elif groupby_t is uint64_t:
        accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
    else:
        accum[:] = -np.inf if compute_max else np.inf

    # Dispatch: the masked variant ignores is_datetimelike because NA-ness
    # is given explicitly by `mask`.
    if mask is not None:
        masked_cummin_max(out, values, mask, labels, accum, compute_max)
    else:
        cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef cummin_max(groupby_t[:, ::1] out,
                ndarray[groupby_t, ndim=2] values,
                const intp_t[:] labels,
                groupby_t[:, ::1] accum,
                bint is_datetimelike,
                bint compute_max):
    """
    Compute the cumulative minimum/maximum of columns of `values`, in row groups
    `labels`.

    `accum` holds the running extreme per (group, column) and must be
    pre-seeded by the caller (see group_cummin_max). Rows with negative
    labels are skipped entirely (their `out` entries are left untouched).
    """
    cdef:
        Py_ssize_t i, j, N, K
        groupby_t val, mval
        intp_t lab

    N, K = (<object>values).shape
    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if not _treat_as_na(val, is_datetimelike):
                    mval = accum[lab, j]
                    if compute_max:
                        if val > mval:
                            accum[lab, j] = mval = val
                    else:
                        if val < mval:
                            accum[lab, j] = mval = val
                    out[i, j] = mval
                else:
                    # NA values propagate themselves to the output without
                    # updating the running extreme.
                    out[i, j] = val
@cython.boundscheck(False)
@cython.wraparound(False)
cdef masked_cummin_max(groupby_t[:, ::1] out,
                       ndarray[groupby_t, ndim=2] values,
                       uint8_t[:, ::1] mask,
                       const intp_t[:] labels,
                       groupby_t[:, ::1] accum,
                       bint compute_max):
    """
    Compute the cumulative minimum/maximum of columns of `values`, in row groups
    `labels` with a masked algorithm.

    `accum` holds the running extreme per (group, column) and must be
    pre-seeded by the caller (see group_cummin_max). Entries where
    `mask[i, j]` is set are skipped and `out[i, j]` is left untouched —
    presumably the caller reapplies `mask` to the result; verify against
    the calling code.
    """
    cdef:
        Py_ssize_t i, j, N, K
        groupby_t val, mval
        intp_t lab

    N, K = (<object>values).shape
    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue
            for j in range(K):
                if not mask[i, j]:
                    val = values[i, j]
                    mval = accum[lab, j]
                    if compute_max:
                        if val > mval:
                            accum[lab, j] = mval = val
                    else:
                        if val < mval:
                            accum[lab, j] = mval = val
                    out[i, j] = mval
  1217. @cython.boundscheck(False)
  1218. @cython.wraparound(False)
  1219. def group_cummin(groupby_t[:, ::1] out,
  1220. ndarray[groupby_t, ndim=2] values,
  1221. const intp_t[:] labels,
  1222. int ngroups,
  1223. bint is_datetimelike,
  1224. uint8_t[:, ::1] mask=None) -> None:
  1225. """See group_cummin_max.__doc__"""
  1226. group_cummin_max(
  1227. out,
  1228. values,
  1229. mask,
  1230. labels,
  1231. ngroups,
  1232. is_datetimelike,
  1233. compute_max=False
  1234. )
  1235. @cython.boundscheck(False)
  1236. @cython.wraparound(False)
  1237. def group_cummax(groupby_t[:, ::1] out,
  1238. ndarray[groupby_t, ndim=2] values,
  1239. const intp_t[:] labels,
  1240. int ngroups,
  1241. bint is_datetimelike,
  1242. uint8_t[:, ::1] mask=None) -> None:
  1243. """See group_cummin_max.__doc__"""
  1244. group_cummin_max(
  1245. out,
  1246. values,
  1247. mask,
  1248. labels,
  1249. ngroups,
  1250. is_datetimelike,
  1251. compute_max=True
  1252. )