writers.pyx 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. import cython
  2. import numpy as np
  3. from cpython cimport (
  4. PyBytes_GET_SIZE,
  5. PyUnicode_GET_LENGTH,
  6. )
  7. from numpy cimport (
  8. ndarray,
  9. uint8_t,
  10. )
# Fused type: a function declared with ``pandas_string`` is specialized by
# Cython once for ``str`` elements and once for ``bytes`` elements.
ctypedef fused pandas_string:
    str
    bytes
  14. @cython.boundscheck(False)
  15. @cython.wraparound(False)
  16. def write_csv_rows(
  17. list data,
  18. ndarray data_index,
  19. Py_ssize_t nlevels,
  20. ndarray cols,
  21. object writer
  22. ) -> None:
  23. """
  24. Write the given data to the writer object, pre-allocating where possible
  25. for performance improvements.
  26. Parameters
  27. ----------
  28. data : list
  29. data_index : ndarray
  30. nlevels : int
  31. cols : ndarray
  32. writer : _csv.writer
  33. """
  34. # In crude testing, N>100 yields little marginal improvement
  35. cdef:
  36. Py_ssize_t i, j = 0, k = len(data_index), N = 100, ncols = len(cols)
  37. list rows
  38. # pre-allocate rows
  39. rows = [[None] * (nlevels + ncols) for _ in range(N)]
  40. if nlevels == 1:
  41. for j in range(k):
  42. row = rows[j % N]
  43. row[0] = data_index[j]
  44. for i in range(ncols):
  45. row[1 + i] = data[i][j]
  46. if j >= N - 1 and j % N == N - 1:
  47. writer.writerows(rows)
  48. elif nlevels > 1:
  49. for j in range(k):
  50. row = rows[j % N]
  51. row[:nlevels] = list(data_index[j])
  52. for i in range(ncols):
  53. row[nlevels + i] = data[i][j]
  54. if j >= N - 1 and j % N == N - 1:
  55. writer.writerows(rows)
  56. else:
  57. for j in range(k):
  58. row = rows[j % N]
  59. for i in range(ncols):
  60. row[i] = data[i][j]
  61. if j >= N - 1 and j % N == N - 1:
  62. writer.writerows(rows)
  63. if j >= 0 and (j < N - 1 or (j % N) != N - 1):
  64. writer.writerows(rows[:((j + 1) % N)])
  65. @cython.boundscheck(False)
  66. @cython.wraparound(False)
  67. def convert_json_to_lines(arr: str) -> str:
  68. """
  69. replace comma separated json with line feeds, paying special attention
  70. to quotes & brackets
  71. """
  72. cdef:
  73. Py_ssize_t i = 0, num_open_brackets_seen = 0, length
  74. bint in_quotes = False, is_escaping = False
  75. ndarray[uint8_t, ndim=1] narr
  76. unsigned char val, newline, comma, left_bracket, right_bracket, quote
  77. unsigned char backslash
  78. newline = ord('\n')
  79. comma = ord(',')
  80. left_bracket = ord('{')
  81. right_bracket = ord('}')
  82. quote = ord('"')
  83. backslash = ord('\\')
  84. narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
  85. length = narr.shape[0]
  86. for i in range(length):
  87. val = narr[i]
  88. if val == quote and i > 0 and not is_escaping:
  89. in_quotes = ~in_quotes
  90. if val == backslash or is_escaping:
  91. is_escaping = ~is_escaping
  92. if val == comma: # commas that should be \n
  93. if num_open_brackets_seen == 0 and not in_quotes:
  94. narr[i] = newline
  95. elif val == left_bracket:
  96. if not in_quotes:
  97. num_open_brackets_seen += 1
  98. elif val == right_bracket:
  99. if not in_quotes:
  100. num_open_brackets_seen -= 1
  101. return narr.tobytes().decode('utf-8') + '\n' # GH:36888
  102. # stata, pytables
  103. @cython.boundscheck(False)
  104. @cython.wraparound(False)
  105. def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
  106. """
  107. Return the maximum size of elements in a 1-dim string array.
  108. """
  109. cdef:
  110. Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
  111. pandas_string val
  112. for i in range(length):
  113. val = arr[i]
  114. l = word_len(val)
  115. if l > m:
  116. m = l
  117. return m
  118. cpdef inline Py_ssize_t word_len(object val):
  119. """
  120. Return the maximum length of a string or bytes value.
  121. """
  122. cdef:
  123. Py_ssize_t l = 0
  124. if isinstance(val, str):
  125. l = PyUnicode_GET_LENGTH(val)
  126. elif isinstance(val, bytes):
  127. l = PyBytes_GET_SIZE(val)
  128. return l
  129. # ------------------------------------------------------------------
  130. # PyTables Helpers
  131. @cython.boundscheck(False)
  132. @cython.wraparound(False)
  133. def string_array_replace_from_nan_rep(
  134. ndarray[object, ndim=1] arr,
  135. object nan_rep,
  136. object replace=np.nan
  137. ) -> None:
  138. """
  139. Replace the values in the array with 'replacement' if
  140. they are 'nan_rep'. Return the same array.
  141. """
  142. cdef:
  143. Py_ssize_t length = len(arr), i = 0
  144. for i in range(length):
  145. if arr[i] == nan_rep:
  146. arr[i] = replace