hashtable.pyx 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. cimport cython
  2. from cpython.mem cimport (
  3. PyMem_Free,
  4. PyMem_Malloc,
  5. )
  6. from cpython.ref cimport (
  7. Py_INCREF,
  8. PyObject,
  9. )
  10. from libc.stdlib cimport (
  11. free,
  12. malloc,
  13. )
  14. import numpy as np
  15. cimport numpy as cnp
  16. from numpy cimport (
  17. float64_t,
  18. ndarray,
  19. uint8_t,
  20. uint32_t,
  21. )
  22. from numpy.math cimport NAN
  23. cnp.import_array()
  24. from pandas._libs cimport util
  25. from pandas._libs.khash cimport (
  26. KHASH_TRACE_DOMAIN,
  27. are_equivalent_float32_t,
  28. are_equivalent_float64_t,
  29. are_equivalent_khcomplex64_t,
  30. are_equivalent_khcomplex128_t,
  31. kh_needed_n_buckets,
  32. kh_python_hash_equal,
  33. kh_python_hash_func,
  34. kh_str_t,
  35. khcomplex64_t,
  36. khcomplex128_t,
  37. khiter_t,
  38. )
  39. from pandas._libs.missing cimport checknull
  40. def get_hashtable_trace_domain():
  41. return KHASH_TRACE_DOMAIN
  42. def object_hash(obj):
  43. return kh_python_hash_func(obj)
  44. def objects_are_equal(a, b):
  45. return kh_python_hash_equal(a, b)
  46. cdef int64_t NPY_NAT = util.get_nat()
  47. SIZE_HINT_LIMIT = (1 << 20) + 7
  48. cdef Py_ssize_t _INIT_VEC_CAP = 128
  49. include "hashtable_class_helper.pxi"
  50. include "hashtable_func_helper.pxi"
  51. cdef class Factorizer:
  52. cdef readonly:
  53. Py_ssize_t count
  54. def __cinit__(self, size_hint: int):
  55. self.count = 0
  56. def get_count(self) -> int:
  57. return self.count
  58. cdef class ObjectFactorizer(Factorizer):
  59. cdef public:
  60. PyObjectHashTable table
  61. ObjectVector uniques
  62. def __cinit__(self, size_hint: int):
  63. self.table = PyObjectHashTable(size_hint)
  64. self.uniques = ObjectVector()
  65. def factorize(
  66. self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
  67. ) -> np.ndarray:
  68. """
  69. Returns
  70. -------
  71. np.ndarray[np.intp]
  72. Examples
  73. --------
  74. Factorize values with nans replaced by na_sentinel
  75. >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
  76. array([ 0, 1, 20])
  77. """
  78. cdef:
  79. ndarray[intp_t] labels
  80. if self.uniques.external_view_exists:
  81. uniques = ObjectVector()
  82. uniques.extend(self.uniques.to_array())
  83. self.uniques = uniques
  84. labels = self.table.get_labels(values, self.uniques,
  85. self.count, na_sentinel, na_value)
  86. mask = (labels == na_sentinel)
  87. # sort on
  88. if sort:
  89. sorter = self.uniques.to_array().argsort()
  90. reverse_indexer = np.empty(len(sorter), dtype=np.intp)
  91. reverse_indexer.put(sorter, np.arange(len(sorter)))
  92. labels = reverse_indexer.take(labels, mode='clip')
  93. labels[mask] = na_sentinel
  94. self.count = len(self.uniques)
  95. return labels
  96. cdef class Int64Factorizer(Factorizer):
  97. cdef public:
  98. Int64HashTable table
  99. Int64Vector uniques
  100. def __cinit__(self, size_hint: int):
  101. self.table = Int64HashTable(size_hint)
  102. self.uniques = Int64Vector()
  103. def factorize(self, const int64_t[:] values, sort=False,
  104. na_sentinel=-1, na_value=None) -> np.ndarray:
  105. """
  106. Returns
  107. -------
  108. ndarray[intp_t]
  109. Examples
  110. --------
  111. Factorize values with nans replaced by na_sentinel
  112. >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
  113. array([ 0, 1, 20])
  114. """
  115. cdef:
  116. ndarray[intp_t] labels
  117. if self.uniques.external_view_exists:
  118. uniques = Int64Vector()
  119. uniques.extend(self.uniques.to_array())
  120. self.uniques = uniques
  121. labels = self.table.get_labels(values, self.uniques,
  122. self.count, na_sentinel,
  123. na_value=na_value)
  124. # sort on
  125. if sort:
  126. sorter = self.uniques.to_array().argsort()
  127. reverse_indexer = np.empty(len(sorter), dtype=np.intp)
  128. reverse_indexer.put(sorter, np.arange(len(sorter)))
  129. labels = reverse_indexer.take(labels)
  130. self.count = len(self.uniques)
  131. return labels
  132. @cython.wraparound(False)
  133. @cython.boundscheck(False)
  134. def unique_label_indices(const int64_t[:] labels) -> ndarray:
  135. """
  136. Indices of the first occurrences of the unique labels
  137. *excluding* -1. equivalent to:
  138. np.unique(labels, return_index=True)[1]
  139. """
  140. cdef:
  141. int ret = 0
  142. Py_ssize_t i, n = len(labels)
  143. kh_int64_t *table = kh_init_int64()
  144. Int64Vector idx = Int64Vector()
  145. ndarray[int64_t, ndim=1] arr
  146. Int64VectorData *ud = idx.data
  147. kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
  148. with nogil:
  149. for i in range(n):
  150. kh_put_int64(table, labels[i], &ret)
  151. if ret != 0:
  152. if needs_resize(ud):
  153. with gil:
  154. idx.resize()
  155. append_data_int64(ud, i)
  156. kh_destroy_int64(table)
  157. arr = idx.to_array()
  158. arr = arr[np.asarray(labels)[arr].argsort()]
  159. return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr