internals.pyx

from collections import defaultdict

import cython
from cython import Py_ssize_t

from cpython.slice cimport PySlice_GetIndicesEx

cdef extern from "Python.h":
    Py_ssize_t PY_SSIZE_T_MAX

import numpy as np

cimport numpy as cnp
from numpy cimport (
    NPY_INTP,
    int64_t,
    intp_t,
    ndarray,
)

cnp.import_array()

from pandas._libs.algos import ensure_int64

from pandas._libs.arrays cimport NDArrayBacked
from pandas._libs.util cimport is_integer_object
@cython.final
@cython.freelist(32)
cdef class BlockPlacement:
    # __slots__ = '_as_slice', '_as_array', '_len'
    cdef:
        slice _as_slice
        ndarray _as_array  # Note: this still allows `None`; will be intp_t
        bint _has_slice, _has_array, _is_known_slice_like

    def __cinit__(self, val):
        cdef:
            slice slc

        self._as_slice = None
        self._as_array = None
        self._has_slice = False
        self._has_array = False

        if is_integer_object(val):
            slc = slice(val, val + 1, 1)
            self._as_slice = slc
            self._has_slice = True
        elif isinstance(val, slice):
            slc = slice_canonize(val)

            if slc.start != slc.stop:
                self._as_slice = slc
                self._has_slice = True
            else:
                arr = np.empty(0, dtype=np.intp)
                self._as_array = arr
                self._has_array = True
        else:
            # Cython memoryview interface requires ndarray to be writeable.
            arr = np.require(val, dtype=np.intp, requirements='W')
            assert arr.ndim == 1, arr.shape
            self._as_array = arr
            self._has_array = True
    def __str__(self) -> str:
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            v = self._as_slice
        else:
            v = self._as_array

        return f"{type(self).__name__}({v})"

    def __repr__(self) -> str:
        return str(self)

    def __len__(self) -> int:
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return slice_len(s)
        else:
            return len(self._as_array)

    def __iter__(self):
        cdef:
            slice s = self._ensure_has_slice()
            Py_ssize_t start, stop, step, _

        if s is not None:
            start, stop, step, _ = slice_get_indices_ex(s)
            return iter(range(start, stop, step))
        else:
            return iter(self._as_array)
    @property
    def as_slice(self) -> slice:
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return s
        else:
            raise TypeError("Not slice-like")

    @property
    def indexer(self):
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return s
        else:
            return self._as_array
    @property
    def as_array(self) -> np.ndarray:
        cdef:
            Py_ssize_t start, stop, step, _

        if not self._has_array:
            start, stop, step, _ = slice_get_indices_ex(self._as_slice)
            # NOTE: this is the C-optimized equivalent of
            #  `np.arange(start, stop, step, dtype=np.intp)`
            self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
            self._has_array = True

        return self._as_array
    @property
    def is_slice_like(self) -> bool:
        cdef:
            slice s = self._ensure_has_slice()

        return s is not None
    def __getitem__(self, loc):
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            val = slice_getitem(s, loc)
        else:
            val = self._as_array[loc]

        if not isinstance(val, slice) and val.ndim == 0:
            return val

        return BlockPlacement(val)

    def delete(self, loc) -> BlockPlacement:
        return BlockPlacement(np.delete(self.as_array, loc, axis=0))

    def append(self, others) -> BlockPlacement:
        if not len(others):
            return self

        return BlockPlacement(
            np.concatenate([self.as_array] + [o.as_array for o in others])
        )
    cdef BlockPlacement iadd(self, other):
        cdef:
            slice s = self._ensure_has_slice()
            Py_ssize_t other_int, start, stop, step, l

        if is_integer_object(other) and s is not None:
            other_int = <Py_ssize_t>other

            if other_int == 0:
                # BlockPlacement is treated as immutable
                return self

            start, stop, step, l = slice_get_indices_ex(s)
            start += other_int
            stop += other_int

            if (step > 0 and start < 0) or (step < 0 and stop < step):
                raise ValueError("iadd causes length change")

            if stop < 0:
                val = slice(start, None, step)
            else:
                val = slice(start, stop, step)

            return BlockPlacement(val)
        else:
            newarr = self.as_array + other
            if (newarr < 0).any():
                raise ValueError("iadd causes length change")

            val = newarr
            return BlockPlacement(val)

    def add(self, other) -> BlockPlacement:
        # We can get here with int or ndarray
        return self.iadd(other)

    cdef slice _ensure_has_slice(self):
        if not self._has_slice:
            self._as_slice = indexer_as_slice(self._as_array)
            self._has_slice = True

        return self._as_slice
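
# A brief usage sketch (illustrative, not part of the original module): how a
# BlockPlacement round-trips between its slice and array representations.
#
#   >>> from pandas._libs.internals import BlockPlacement
#   >>> bp = BlockPlacement(slice(0, 6, 2))
#   >>> bp.as_array
#   array([0, 2, 4])
#   >>> len(bp), bp.is_slice_like
#   (3, True)
#   >>> bp.add(1).as_slice
#   slice(1, 7, 2)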
cdef slice slice_canonize(slice s):
    """
    Convert slice to canonical bounded form.
    """
    cdef:
        Py_ssize_t start = 0, stop = 0, step = 1

    if s.step is None:
        step = 1
    else:
        step = <Py_ssize_t>s.step
        if step == 0:
            raise ValueError("slice step cannot be zero")

    if step > 0:
        if s.stop is None:
            raise ValueError("unbounded slice")

        stop = <Py_ssize_t>s.stop
        if s.start is None:
            start = 0
        else:
            start = <Py_ssize_t>s.start
            if start > stop:
                start = stop
    elif step < 0:
        if s.start is None:
            raise ValueError("unbounded slice")

        start = <Py_ssize_t>s.start
        if s.stop is None:
            stop = -1
        else:
            stop = <Py_ssize_t>s.stop
            if stop > start:
                stop = start

    if start < 0 or (stop < 0 and s.stop is not None and step > 0):
        raise ValueError("unbounded slice")

    if stop < 0:
        return slice(start, None, step)
    else:
        return slice(start, stop, step)
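
# Illustrative behavior of slice_canonize (a sketch; the helper is cdef-only,
# so from Python it is only observable through BlockPlacement construction):
#
#   >>> BlockPlacement(slice(None, 4))    # stored in canonical bounded form
#   BlockPlacement(slice(0, 4, 1))
#   >>> BlockPlacement(slice(3, None))    # positive step with no stop
#   ValueError: unbounded slice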
cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
    """
    Get length of a bounded slice.

    The slice must not have any "open" bounds that would create dependency on
    container size, i.e.:
    - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
    - if ``s.step < 0``, ``s.start`` is not ``None``

    Otherwise, the result is unreliable.
    """
    cdef:
        Py_ssize_t start, stop, step, length

    if slc is None:
        raise TypeError("slc must be slice")

    PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)

    return length
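
# Usage sketch (illustrative): ``slice_len`` is cpdef, so it is also reachable
# from Python; for bounded slices it matches ``len(range(...))``.
#
#   >>> from pandas._libs.internals import slice_len
#   >>> slice_len(slice(0, 10, 2))
#   5
#   >>> slice_len(slice(0, 100), 10)   # stop clipped to objlen
#   10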
cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX):
    """
    Get (start, stop, step, length) tuple for a slice.

    If `objlen` is not specified, slice must be bounded, otherwise the result
    will be wrong.
    """
    cdef:
        Py_ssize_t start, stop, step, length

    if slc is None:
        raise TypeError("slc should be a slice")

    PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)

    return start, stop, step, length
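
# For intuition (a sketch, not from the original source): on bounded slices
# this matches the pure-Python ``slice.indices`` plus the resulting length.
#
#   >>> slc = slice(2, 8, 3)
#   >>> (*slc.indices(100), len(range(*slc.indices(100))))
#   (2, 8, 3, 2)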
cdef slice_getitem(slice slc, ind):
    cdef:
        Py_ssize_t s_start, s_stop, s_step, s_len
        Py_ssize_t ind_start, ind_stop, ind_step, ind_len

    s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)

    if isinstance(ind, slice):
        ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len)

        if ind_step > 0 and ind_len == s_len:
            # short-cut for no-op slice
            if ind_len == s_len:
                return slc

        if ind_step < 0:
            s_start = s_stop - s_step
            ind_step = -ind_step

        s_step *= ind_step
        s_stop = s_start + ind_stop * s_step
        s_start = s_start + ind_start * s_step

        if s_step < 0 and s_stop < 0:
            return slice(s_start, None, s_step)
        else:
            return slice(s_start, s_stop, s_step)
    else:
        # NOTE:
        # this is the C-optimized equivalent of
        #  `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
        return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]
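
# Illustrative example of the non-slice path (a sketch): any non-slice indexer
# materializes the positions via PyArray_Arange and defers to ndarray indexing,
# which is what BlockPlacement.__getitem__ relies on.
#
#   >>> bp = BlockPlacement(slice(0, 6, 2))   # positions 0, 2, 4
#   >>> bp[[0, 2]].as_array
#   array([0, 4])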
@cython.boundscheck(False)
@cython.wraparound(False)
cdef slice indexer_as_slice(intp_t[:] vals):
    cdef:
        Py_ssize_t i, n, start, stop
        int64_t d

    if vals is None:
        raise TypeError("vals must be ndarray")

    n = vals.shape[0]

    if n == 0 or vals[0] < 0:
        return None

    if n == 1:
        return slice(vals[0], vals[0] + 1, 1)

    if vals[1] < 0:
        return None

    # n > 2
    d = vals[1] - vals[0]

    if d == 0:
        return None

    for i in range(2, n):
        if vals[i] < 0 or vals[i] - vals[i - 1] != d:
            return None

    start = vals[0]
    stop = start + n * d
    if stop < 0 and d < 0:
        return slice(start, None, d)
    else:
        return slice(start, stop, d)
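
# Illustrative behavior (a sketch; the helper is cdef-only, but it is reachable
# from Python through BlockPlacement, which calls it lazily):
#
#   >>> import numpy as np
#   >>> BlockPlacement(np.array([3, 5, 7], dtype=np.intp)).as_slice
#   slice(3, 9, 2)
#   >>> BlockPlacement(np.array([3, 5, 8], dtype=np.intp)).is_slice_like
#   False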
@cython.boundscheck(False)
@cython.wraparound(False)
def get_blkno_indexers(
    int64_t[:] blknos, bint group=True
) -> list[tuple[int, slice | np.ndarray]]:
    """
    Enumerate contiguous runs of integers in ndarray.

    Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))``
    pairs for each contiguous run found.

    If `group` is True and there is more than one run for a certain blkno,
    a single ``(blkno, array)`` pair is returned instead, with the array
    holding the positions of all elements equal to that blkno.

    Returns
    -------
    list[tuple[int, slice | np.ndarray]]
    """
    # There's blkno in this function's name because it's used in block &
    # blockno handling.
    cdef:
        int64_t cur_blkno
        Py_ssize_t i, start, stop, n, diff, tot_len
        object blkno
        object group_dict = defaultdict(list)

    n = blknos.shape[0]
    result = list()
    start = 0

    if n == 0:
        # nothing to enumerate; return early rather than read ``blknos[0]``
        # from an empty buffer (bounds checking is disabled here)
        return result

    cur_blkno = blknos[start]

    if group is False:
        for i in range(1, n):
            if blknos[i] != cur_blkno:
                result.append((cur_blkno, slice(start, i)))

                start = i
                cur_blkno = blknos[i]

        result.append((cur_blkno, slice(start, n)))
    else:
        for i in range(1, n):
            if blknos[i] != cur_blkno:
                group_dict[cur_blkno].append((start, i))

                start = i
                cur_blkno = blknos[i]

        group_dict[cur_blkno].append((start, n))

        for blkno, slices in group_dict.items():
            if len(slices) == 1:
                result.append((blkno, slice(slices[0][0], slices[0][1])))
            else:
                tot_len = sum(stop - start for start, stop in slices)
                arr = np.empty(tot_len, dtype=np.int64)

                i = 0
                for start, stop in slices:
                    for diff in range(start, stop):
                        arr[i] = diff
                        i += 1

                result.append((blkno, arr))

    return result
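
# A worked example (illustrative, assuming a built pandas):
#
#   >>> import numpy as np
#   >>> from pandas._libs.internals import get_blkno_indexers
#   >>> blknos = np.array([0, 0, 1, 1, 0], dtype=np.int64)
#   >>> get_blkno_indexers(blknos, group=False)
#   [(0, slice(0, 2, None)), (1, slice(2, 4, None)), (0, slice(4, 5, None))]
#   >>> get_blkno_indexers(blknos, group=True)
#   [(0, array([0, 1, 4])), (1, slice(2, 4, None))]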
def get_blkno_placements(blknos, group: bool = True):
    """
    Parameters
    ----------
    blknos : np.ndarray[int64]
    group : bool, default True

    Returns
    -------
    iterator
        yield (blkno, BlockPlacement)
    """
    blknos = ensure_int64(blknos)

    for blkno, indexer in get_blkno_indexers(blknos, group):
        yield blkno, BlockPlacement(indexer)
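
# Continuing the example above (illustrative): each indexer is wrapped in a
# BlockPlacement, giving callers a uniform placement object per blkno.
#
#   >>> from pandas._libs.internals import get_blkno_placements
#   >>> [(blkno, bp.as_array.tolist())
#   ...  for blkno, bp in get_blkno_placements(blknos, group=True)]
#   [(0, [0, 1, 4]), (1, [2, 3])]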
@cython.freelist(64)
cdef class SharedBlock:
    """
    Defining __init__ in a cython class significantly improves performance.
    """
    cdef:
        public BlockPlacement _mgr_locs
        readonly int ndim

    def __cinit__(self, values, placement: BlockPlacement, ndim: int):
        """
        Parameters
        ----------
        values : np.ndarray or ExtensionArray
            We assume maybe_coerce_values has already been called.
        placement : BlockPlacement
        ndim : int
            1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
        """
        self._mgr_locs = placement
        self.ndim = ndim

    cpdef __reduce__(self):
        # We have to do some gymnastics b/c "ndim" is keyword-only
        from functools import partial

        from pandas.core.internals.blocks import new_block

        args = (self.values, self.mgr_locs.indexer)
        func = partial(new_block, ndim=self.ndim)
        return func, args

    cpdef __setstate__(self, state):
        from pandas.core.construction import extract_array

        self.mgr_locs = BlockPlacement(state[0])
        self.values = extract_array(state[1], extract_numpy=True)
        if len(state) > 2:
            # we stored ndim
            self.ndim = state[2]
        else:
            # older pickle
            from pandas.core.internals.api import maybe_infer_ndim

            ndim = maybe_infer_ndim(self.values, self.mgr_locs)
            self.ndim = ndim
cdef class NumpyBlock(SharedBlock):
    cdef:
        public ndarray values

    def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
        # set values here; the (implicit) call to SharedBlock.__cinit__ will
        #  set placement and ndim
        self.values = values

    # @final  # not useful in cython, but we _would_ annotate with @final
    cpdef NumpyBlock getitem_block_index(self, slice slicer):
        """
        Perform __getitem__-like specialized to slicing along index.

        Assumes self.ndim == 2
        """
        new_values = self.values[..., slicer]
        return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
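
# A hedged usage sketch (assumes a built pandas where these cdef classes are
# importable; the values/placement below are toy inputs, not how pandas
# normally builds blocks): slicing along the index trims the last axis of
# ``values`` while the block's column placement is unchanged.
#
#   >>> import numpy as np
#   >>> from pandas._libs.internals import BlockPlacement, NumpyBlock
#   >>> blk = NumpyBlock(np.arange(12).reshape(3, 4), BlockPlacement(slice(0, 3)), 2)
#   >>> blk.getitem_block_index(slice(0, 2)).values.shape
#   (3, 2)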
cdef class NDArrayBackedBlock(SharedBlock):
    """
    Block backed by NDArrayBackedExtensionArray
    """
    cdef public:
        NDArrayBacked values

    def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
        # set values here; the (implicit) call to SharedBlock.__cinit__ will
        #  set placement and ndim
        self.values = values

    # @final  # not useful in cython, but we _would_ annotate with @final
    cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
        """
        Perform __getitem__-like specialized to slicing along index.

        Assumes self.ndim == 2
        """
        new_values = self.values[..., slicer]
        return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
cdef class Block(SharedBlock):
    cdef:
        public object values

    def __cinit__(self, object values, BlockPlacement placement, int ndim):
        # set values here; the (implicit) call to SharedBlock.__cinit__ will
        #  set placement and ndim
        self.values = values
@cython.freelist(64)
cdef class BlockManager:
    cdef:
        public tuple blocks
        public list axes
        public bint _known_consolidated, _is_consolidated
        public ndarray _blknos, _blklocs

    def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
        # None as defaults for unpickling GH#42345
        if blocks is None:
            # This adds 1-2 microseconds to DataFrame(np.array([]))
            return

        if isinstance(blocks, list):
            # Backward compat for e.g. pyarrow
            blocks = tuple(blocks)

        self.blocks = blocks
        self.axes = axes.copy()  # copy to make sure we are not remotely-mutable

        # Populate known_consolidated, blknos, and blklocs lazily
        self._known_consolidated = False
        self._is_consolidated = False
        # error: Incompatible types in assignment (expression has type "None",
        #  variable has type "ndarray")
        self._blknos = None  # type: ignore[assignment]
        # error: Incompatible types in assignment (expression has type "None",
        #  variable has type "ndarray")
        self._blklocs = None  # type: ignore[assignment]
    # -------------------------------------------------------------------
    # Pickle

    cpdef __reduce__(self):
        if len(self.axes) == 1:
            # SingleBlockManager, __init__ expects Block, axis
            args = (self.blocks[0], self.axes[0])
        else:
            args = (self.blocks, self.axes)
        return type(self), args

    cpdef __setstate__(self, state):
        from pandas.core.construction import extract_array
        from pandas.core.internals.blocks import (
            ensure_block_shape,
            new_block,
        )
        from pandas.core.internals.managers import ensure_index

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(axes)

            for blk in state["blocks"]:
                vals = blk["values"]
                # older versions may hold e.g. DatetimeIndex instead of DTA
                vals = extract_array(vals, extract_numpy=True)
                blk["values"] = ensure_block_shape(vals, ndim=ndim)

            nbs = [
                new_block(blk["values"], blk["mgr_locs"], ndim=ndim)
                for blk in state["blocks"]
            ]
            blocks = tuple(nbs)
            self.blocks = blocks
            self.axes = axes
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()

    def _post_setstate(self) -> None:
        self._is_consolidated = False
        self._known_consolidated = False
        self._rebuild_blknos_and_blklocs()
    # -------------------------------------------------------------------
    # Indexing

    cdef BlockManager _get_index_slice(self, slobj):
        cdef:
            SharedBlock blk, nb

        nbs = []
        for blk in self.blocks:
            nb = blk.getitem_block_index(slobj)
            nbs.append(nb)

        new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
        return type(self)(tuple(nbs), new_axes, verify_integrity=False)

    def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(slobj)
        elif axis == 1:
            return self._get_index_slice(slobj)
        else:
            raise IndexError("Requested axis not found in manager")

        new_axes = list(self.axes)
        new_axes[axis] = new_axes[axis]._getitem_slice(slobj)

        return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
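
    # Usage note (a sketch, not from the original source): for a 2-D manager,
    # axes[0] is the column axis and axes[1] is the index axis, so slicing a
    # DataFrame's rows goes through ``axis=1`` here (roughly
    # ``df._mgr.get_slice(slice(0, 2), axis=1)`` for ``df.iloc[:2]``), while
    # ``_slice_take_blocks_ax0`` used for ``axis=0`` is provided by the
    # Python-level subclass in pandas.core.internals.managers.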