hashing.pyx 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. # Translated from the reference implementation
  2. # at https://github.com/veorq/SipHash
  3. import cython
  4. from libc.stdlib cimport (
  5. free,
  6. malloc,
  7. )
  8. import numpy as np
  9. from numpy cimport (
  10. import_array,
  11. ndarray,
  12. uint8_t,
  13. uint32_t,
  14. uint64_t,
  15. )
  16. import_array()
  17. from pandas._libs.util cimport is_nan
  18. DEF cROUNDS = 2
  19. DEF dROUNDS = 4
  20. @cython.boundscheck(False)
  21. def hash_object_array(
  22. ndarray[object] arr, str key, str encoding="utf8"
  23. ) -> np.ndarray[np.uint64]:
  24. """
  25. Parameters
  26. ----------
  27. arr : 1-d object ndarray of objects
  28. key : hash key, must be 16 byte len encoded
  29. encoding : encoding for key & arr, default to 'utf8'
  30. Returns
  31. -------
  32. 1-d uint64 ndarray of hashes.
  33. Raises
  34. ------
  35. TypeError
  36. If the array contains mixed types.
  37. Notes
  38. -----
  39. Allowed values must be strings, or nulls
  40. mixed array types will raise TypeError.
  41. """
  42. cdef:
  43. Py_ssize_t i, l, n
  44. uint64_t[:] result
  45. bytes data, k
  46. uint8_t *kb
  47. uint64_t *lens
  48. char **vecs
  49. char *cdata
  50. object val
  51. list datas = []
  52. k = <bytes>key.encode(encoding)
  53. kb = <uint8_t *>k
  54. if len(k) != 16:
  55. raise ValueError(
  56. f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
  57. )
  58. n = len(arr)
  59. # create an array of bytes
  60. vecs = <char **>malloc(n * sizeof(char *))
  61. lens = <uint64_t*>malloc(n * sizeof(uint64_t))
  62. for i in range(n):
  63. val = arr[i]
  64. if isinstance(val, bytes):
  65. data = <bytes>val
  66. elif isinstance(val, str):
  67. data = <bytes>val.encode(encoding)
  68. elif val is None or is_nan(val):
  69. # null, stringify and encode
  70. data = <bytes>str(val).encode(encoding)
  71. elif isinstance(val, tuple):
  72. # GH#28969 we could have a tuple, but need to ensure that
  73. # the tuple entries are themselves hashable before converting
  74. # to str
  75. hash(val)
  76. data = <bytes>str(val).encode(encoding)
  77. else:
  78. raise TypeError(
  79. f"{val} of type {type(val)} is not a valid type for hashing, "
  80. "must be string or null"
  81. )
  82. l = len(data)
  83. lens[i] = l
  84. cdata = data
  85. # keep the references alive through the end of the
  86. # function
  87. datas.append(data)
  88. vecs[i] = cdata
  89. result = np.empty(n, dtype=np.uint64)
  90. with nogil:
  91. for i in range(n):
  92. result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
  93. free(vecs)
  94. free(lens)
  95. return result.base # .base to retrieve underlying np.ndarray
  96. cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
  97. return (x << b) | (x >> (64 - b))
  98. cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
  99. p[0] = <uint8_t>(v)
  100. p[1] = <uint8_t>(v >> 8)
  101. p[2] = <uint8_t>(v >> 16)
  102. p[3] = <uint8_t>(v >> 24)
  103. cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
  104. return (<uint64_t>p[0] |
  105. <uint64_t>p[1] << 8 |
  106. <uint64_t>p[2] << 16 |
  107. <uint64_t>p[3] << 24 |
  108. <uint64_t>p[4] << 32 |
  109. <uint64_t>p[5] << 40 |
  110. <uint64_t>p[6] << 48 |
  111. <uint64_t>p[7] << 56)
  112. cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
  113. uint64_t* v2, uint64_t* v3) nogil:
  114. v0[0] += v1[0]
  115. v1[0] = _rotl(v1[0], 13)
  116. v1[0] ^= v0[0]
  117. v0[0] = _rotl(v0[0], 32)
  118. v2[0] += v3[0]
  119. v3[0] = _rotl(v3[0], 16)
  120. v3[0] ^= v2[0]
  121. v0[0] += v3[0]
  122. v3[0] = _rotl(v3[0], 21)
  123. v3[0] ^= v0[0]
  124. v2[0] += v1[0]
  125. v1[0] = _rotl(v1[0], 17)
  126. v1[0] ^= v2[0]
  127. v2[0] = _rotl(v2[0], 32)
  128. @cython.cdivision(True)
  129. cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
  130. uint8_t* key) nogil:
  131. cdef uint64_t v0 = 0x736f6d6570736575ULL
  132. cdef uint64_t v1 = 0x646f72616e646f6dULL
  133. cdef uint64_t v2 = 0x6c7967656e657261ULL
  134. cdef uint64_t v3 = 0x7465646279746573ULL
  135. cdef uint64_t b
  136. cdef uint64_t k0 = u8to64_le(key)
  137. cdef uint64_t k1 = u8to64_le(key + 8)
  138. cdef uint64_t m
  139. cdef int i
  140. cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
  141. cdef int left = datalen & 7
  142. cdef int left_byte
  143. b = (<uint64_t>datalen) << 56
  144. v3 ^= k1
  145. v2 ^= k0
  146. v1 ^= k1
  147. v0 ^= k0
  148. while (data != end):
  149. m = u8to64_le(data)
  150. v3 ^= m
  151. for i in range(cROUNDS):
  152. _sipround(&v0, &v1, &v2, &v3)
  153. v0 ^= m
  154. data += sizeof(uint64_t)
  155. for i in range(left-1, -1, -1):
  156. b |= (<uint64_t>data[i]) << (i * 8)
  157. v3 ^= b
  158. for i in range(cROUNDS):
  159. _sipround(&v0, &v1, &v2, &v3)
  160. v0 ^= b
  161. v2 ^= 0xff
  162. for i in range(dROUNDS):
  163. _sipround(&v0, &v1, &v2, &v3)
  164. b = v0 ^ v1 ^ v2 ^ v3
  165. return b