lib.pyx 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039
  1. from collections import abc
  2. from decimal import Decimal
  3. from enum import Enum
  4. import warnings
  5. import cython
  6. from cython import Py_ssize_t
  7. from cpython.datetime cimport (
  8. PyDate_Check,
  9. PyDateTime_Check,
  10. PyDateTime_IMPORT,
  11. PyDelta_Check,
  12. PyTime_Check,
  13. )
  14. from cpython.iterator cimport PyIter_Check
  15. from cpython.number cimport PyNumber_Check
  16. from cpython.object cimport (
  17. Py_EQ,
  18. PyObject_RichCompareBool,
  19. )
  20. from cpython.ref cimport Py_INCREF
  21. from cpython.sequence cimport PySequence_Check
  22. from cpython.tuple cimport (
  23. PyTuple_New,
  24. PyTuple_SET_ITEM,
  25. )
  26. PyDateTime_IMPORT
  27. import numpy as np
  28. cimport numpy as cnp
  29. from numpy cimport (
  30. NPY_OBJECT,
  31. PyArray_Check,
  32. PyArray_GETITEM,
  33. PyArray_ITER_DATA,
  34. PyArray_ITER_NEXT,
  35. PyArray_IterNew,
  36. complex128_t,
  37. flatiter,
  38. float32_t,
  39. float64_t,
  40. int64_t,
  41. intp_t,
  42. ndarray,
  43. uint8_t,
  44. uint64_t,
  45. )
  46. cnp.import_array()
  47. cdef extern from "numpy/arrayobject.h":
  48. # cython's numpy.dtype specification is incorrect, which leads to
  49. # errors in issubclass(self.dtype.type, np.bool_), so we directly
  50. # include the correct version
  51. # https://github.com/cython/cython/issues/2022
  52. ctypedef class numpy.dtype [object PyArray_Descr]:
  53. # Use PyDataType_* macros when possible, however there are no macros
  54. # for accessing some of the fields, so some are defined. Please
  55. # ask on cython-dev if you need more.
  56. cdef:
  57. int type_num
  58. int itemsize "elsize"
  59. char byteorder
  60. object fields
  61. tuple names
  62. cdef extern from "numpy/ndarrayobject.h":
  63. bint PyArray_CheckScalar(obj) nogil
  64. cdef extern from "src/parse_helper.h":
  65. int floatify(object, float64_t *result, int *maybe_int) except -1
  66. from pandas._libs cimport util
  67. from pandas._libs.util cimport (
  68. INT64_MAX,
  69. INT64_MIN,
  70. UINT64_MAX,
  71. is_nan,
  72. )
  73. from pandas._libs.tslib import array_to_datetime
  74. from pandas._libs.tslibs import (
  75. OutOfBoundsDatetime,
  76. OutOfBoundsTimedelta,
  77. )
  78. from pandas._libs.tslibs.period import Period
  79. from pandas._libs.missing cimport (
  80. C_NA,
  81. checknull,
  82. is_matching_na,
  83. is_null_datetime64,
  84. is_null_timedelta64,
  85. isnaobj,
  86. )
  87. from pandas._libs.tslibs.conversion cimport convert_to_tsobject
  88. from pandas._libs.tslibs.nattype cimport (
  89. NPY_NAT,
  90. c_NaT as NaT,
  91. checknull_with_nat,
  92. )
  93. from pandas._libs.tslibs.offsets cimport is_offset_object
  94. from pandas._libs.tslibs.period cimport is_period_object
  95. from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
  96. from pandas._libs.tslibs.timezones cimport tz_compare
  # Python-object copies of the 64-bit integer limits.  They are stored as
  # objects because they get compared against Python ints, which may be
  # arbitrarily large.
  cdef object oINT64_MAX = <int64_t>INT64_MAX
  cdef object oINT64_MIN = <int64_t>INT64_MIN
  cdef object oUINT64_MAX = <uint64_t>UINT64_MAX

  cdef float64_t NaN = <float64_t>np.NaN

  # Names below are plain module attributes and therefore visible from Python.
  i8max = <int64_t>INT64_MAX
  u8max = <uint64_t>UINT64_MAX
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def memory_usage_of_objects(arr: object[:]) -> int64_t:
      """
      Sum ``__sizeof__()`` over every element of a 1-d object buffer.

      The total covers the referenced objects only; the bytes occupied by
      the pointers stored in the array itself are not counted.
      """
      cdef:
          Py_ssize_t pos, count = len(arr)
          int64_t total = 0

      for pos in range(count):
          total += arr[pos].__sizeof__()
      return total
  122. # ----------------------------------------------------------------------
  def is_scalar(val: object) -> bool:
      """
      Return True if given object is scalar.

      Parameters
      ----------
      val : object
          This includes:

          - numpy array scalar (e.g. np.int64)
          - Python builtin numerics
          - Python builtin byte arrays and strings
          - None
          - datetime.datetime
          - datetime.timedelta
          - Period
          - decimal.Decimal
          - Interval
          - DateOffset
          - Fraction
          - Number.

      Returns
      -------
      bool
          Return True if given object is scalar.

      Examples
      --------
      >>> dt = datetime.datetime(2018, 10, 3)
      >>> pd.api.types.is_scalar(dt)
      True

      >>> pd.api.types.is_scalar([2, 3])
      False

      >>> pd.api.types.is_scalar({0: 1, 2: 3})
      False

      >>> pd.api.types.is_scalar((0, 2))
      False

      pandas supports PEP 3141 numbers:

      >>> from fractions import Fraction
      >>> pd.api.types.is_scalar(Fraction(3, 5))
      True
      """
      # Cheap C-level checks first.
      # PyArray_IsAnyScalar is always False for bytearrays on Py3.
      if cnp.PyArray_IsAnyScalar(val):
          return True
      if PyDate_Check(val) or PyDelta_Check(val) or PyTime_Check(val):
          return True
      # Unlike np.isscalar, None (and the NA singleton) count as scalars here.
      if val is C_NA or val is None:
          return True

      # Rule out common non-scalars (list, tuple, np.ndarray, Series) before
      # the number check, because PyNumber_Check can return True for some of
      # them.
      if PySequence_Check(val):
          return False

      # PyNumber_Check covers Decimal, Fraction and numbers.Number.
      return (PyNumber_Check(val)
              or is_period_object(val)
              or is_interval(val)
              or is_offset_object(val))
  cdef inline int64_t get_itemsize(object val):
      """
      Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

      Parameters
      ----------
      val : object

      Returns
      -------
      int64_t
          ``itemsize`` of the descriptor obtained from ``val`` when
          ``PyArray_CheckScalar(val)`` is true, otherwise -1.
      """
      if not PyArray_CheckScalar(val):
          return -1
      return cnp.PyArray_DescrFromScalar(val).itemsize
  def is_iterator(obj: object) -> bool:
      """
      Check if the object is an iterator.

      This is intended for generators, not list-like objects.

      Parameters
      ----------
      obj : The object to check

      Returns
      -------
      is_iter : bool
          Whether `obj` is an iterator, as reported by ``PyIter_Check``.

      Examples
      --------
      >>> is_iterator((x for x in []))
      True
      >>> is_iterator([1, 2, 3])
      False
      >>> is_iterator(datetime(2017, 1, 1))
      False
      >>> is_iterator("foo")
      False
      >>> is_iterator(1)
      False
      """
      cdef bint is_iter = PyIter_Check(obj)
      return is_iter
  def item_from_zerodim(val: object) -> object:
      """
      Unwrap a zero-dimensional array into the scalar it holds.

      Anything that is not a zero-dimensional array is handed back untouched.

      Parameters
      ----------
      val : object

      Returns
      -------
      object

      Examples
      --------
      >>> item_from_zerodim(1)
      1
      >>> item_from_zerodim('foobar')
      'foobar'
      >>> item_from_zerodim(np.array(1))
      1
      >>> item_from_zerodim(np.array([1]))
      array([1])
      """
      if not cnp.PyArray_IsZeroDim(val):
          return val
      return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def fast_unique_multiple(list arrays, sort: bool = True):
      """
      Collect the distinct values of several object arrays, in first-seen order.

      Parameters
      ----------
      arrays : list
          List of 1-d object arrays.
      sort : bool
          The result is sorted only when ``sort is None``; any other value
          (including the default ``True``) leaves first-seen order.
          NOTE(review): the annotation and default suggest a plain boolean
          was intended -- confirm against callers before changing.

      Returns
      -------
      list of unique values
      """
      cdef:
          ndarray[object] chunk
          Py_ssize_t n_arrays = len(arrays)
          Py_ssize_t outer, inner, chunk_len
          list uniques = []
          set seen = set()
          object item

      for outer in range(n_arrays):
          chunk = arrays[outer]
          chunk_len = len(chunk)
          for inner in range(chunk_len):
              item = chunk[inner]
              if item in seen:
                  continue
              seen.add(item)
              uniques.append(item)

      if sort is None:
          try:
              uniques.sort()
          except TypeError:
              # Mixed, unorderable values: keep first-seen order and warn.
              warnings.warn(
                  "The values in the array are unorderable. "
                  "Pass `sort=False` to suppress this warning.",
                  RuntimeWarning,
              )

      return uniques
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def fast_unique_multiple_list(lists: list, sort: bool = True) -> list:
      """
      Collect the distinct values of several lists, in first-seen order.

      Parameters
      ----------
      lists : list
          List of lists whose elements are gathered.
      sort : bool, default True
          If truthy, try to sort the result; a ``TypeError`` raised by the
          sort (unorderable values) is silently ignored.

      Returns
      -------
      list of unique values
      """
      cdef:
          list chunk
          Py_ssize_t n_lists = len(lists)
          Py_ssize_t outer, inner, chunk_len
          list uniques = []
          set seen = set()
          object item

      for outer in range(n_lists):
          chunk = lists[outer]
          chunk_len = len(chunk)
          for inner in range(chunk_len):
              item = chunk[inner]
              if item in seen:
                  continue
              seen.add(item)
              uniques.append(item)

      if sort:
          try:
              uniques.sort()
          except TypeError:
              # Unorderable values: deliberately keep first-seen order.
              pass

      return uniques
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
      """
      Collect the distinct values yielded by an iterable of lists.

      Parameters
      ----------
      gen : generator object
          Generator of lists from which the unique list is created.
      sort : bool
          Whether or not to sort the resulting unique list.  A ``TypeError``
          raised by the sort is silently ignored.

      Returns
      -------
      list of unique values
      """
      cdef:
          list chunk
          Py_ssize_t inner, chunk_len
          list uniques = []
          set seen = set()
          object item

      for chunk in gen:
          chunk_len = len(chunk)
          for inner in range(chunk_len):
              item = chunk[inner]
              if item in seen:
                  continue
              seen.add(item)
              uniques.append(item)

      if sort:
          try:
              uniques.sort()
          except TypeError:
              # Unorderable values: deliberately keep first-seen order.
              pass

      return uniques
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def dicts_to_array(dicts: list, columns: list):
      """
      Build a 2-d object array with one row per dict and one column per key.

      Cell ``[r, c]`` holds ``dicts[r][columns[c]]``; keys missing from a
      dict are filled with ``np.nan``.
      """
      cdef:
          Py_ssize_t r, c, n_cols, n_rows
          ndarray[object, ndim=2] result
          dict record
          object key, missing = np.nan

      n_cols = len(columns)
      n_rows = len(dicts)

      result = np.empty((n_rows, n_cols), dtype='O')

      for r in range(n_rows):
          record = dicts[r]
          for c in range(n_cols):
              key = columns[c]
              result[r, c] = record[key] if key in record else missing

      return result
  def fast_zip(list ndarrays) -> ndarray[object]:
      """
      Zip several equal-length ndarrays into one object ndarray of tuples.

      Element ``i`` of the result is the tuple
      ``(ndarrays[0][i], ndarrays[1][i], ...)``.

      Raises
      ------
      ValueError
          If any array after the first has a length different from the first.
      """
      cdef:
          Py_ssize_t row, col, n_cols, n_rows
          ndarray[object] result
          flatiter walker
          object item, record

      n_cols = len(ndarrays)
      n_rows = len(ndarrays[0])

      result = np.empty(n_rows, dtype=object)

      # First pass: allocate every tuple and fill slot 0.
      source = ndarrays[0]
      walker = <flatiter>PyArray_IterNew(source)
      for row in range(n_rows):
          item = PyArray_GETITEM(source, PyArray_ITER_DATA(walker))
          record = PyTuple_New(n_cols)

          # The tuple slot is filled via the raw SET_ITEM macro, which is then
          # paired with an explicit INCREF of the stored value.
          PyTuple_SET_ITEM(record, 0, item)
          Py_INCREF(item)
          result[row] = record
          PyArray_ITER_NEXT(walker)

      # Remaining passes: fill slot `col` of the tuples created above.
      for col in range(1, n_cols):
          source = ndarrays[col]
          walker = <flatiter>PyArray_IterNew(source)
          if len(source) != n_rows:
              raise ValueError("all arrays must be same length")

          for row in range(n_rows):
              item = PyArray_GETITEM(source, PyArray_ITER_DATA(walker))
              PyTuple_SET_ITEM(result[row], col, item)
              Py_INCREF(item)
              PyArray_ITER_NEXT(walker)

      return result
  def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
      """
      Reverse indexing operation.

      Given `indexer`, make `indexer_inv` of it, such that::

          indexer_inv[indexer[x]] = x

      Parameters
      ----------
      indexer : np.ndarray[np.intp]
          Entries equal to -1 are skipped.
      length : int
          Length of the returned array.

      Returns
      -------
      np.ndarray[np.intp]
          Positions never referenced by `indexer` hold -1.

      Notes
      -----
      If indexer is not unique, later entries overwrite earlier ones, so the
      position of the last occurrence is the one recorded.
      """
      cdef:
          Py_ssize_t pos, count = len(indexer)
          ndarray[intp_t] rev_indexer
          intp_t target

      # Start with "not referenced" everywhere.
      rev_indexer = np.full(length, -1, dtype=np.intp)

      for pos in range(count):
          target = indexer[pos]
          if target != -1:
              rev_indexer[target] = pos

      return rev_indexer
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def has_infs_f4(const float32_t[:] arr) -> bool:
      """
      Return True if the float32 buffer contains +inf or -inf, else False.
      """
      cdef:
          Py_ssize_t pos, count = len(arr)
          float32_t pos_inf, neg_inf, cur

      pos_inf = np.inf
      neg_inf = -pos_inf

      for pos in range(count):
          cur = arr[pos]
          if cur == pos_inf or cur == neg_inf:
              return True
      return False
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def has_infs_f8(const float64_t[:] arr) -> bool:
      """
      Return True if the float64 buffer contains +inf or -inf, else False.
      """
      cdef:
          Py_ssize_t pos, count = len(arr)
          float64_t pos_inf, neg_inf, cur

      pos_inf = np.inf
      neg_inf = -pos_inf

      for pos in range(count):
          cur = arr[pos]
          if cur == pos_inf or cur == neg_inf:
              return True
      return False
  def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len):
      """
      Replace an index array by an equivalent ``slice`` when possible.

      A slice is returned when `indices` is empty, has a single in-range
      element, or forms a run with constant non-zero step whose first and
      last elements both lie in ``[0, max_len)``.  In every other case the
      `indices` array itself is returned unchanged.
      """
      cdef:
          Py_ssize_t pos, count = len(indices)
          int step, first, last, cur

      if count == 0:
          return slice(0, 0)

      first = indices[0]
      if first < 0 or max_len <= first:
          return indices

      if count == 1:
          return slice(first, first + 1)

      last = indices[count - 1]
      if last < 0 or max_len <= last:
          return indices

      step = indices[1] - indices[0]
      if step == 0:
          return indices

      # Every consecutive difference has to match the first one.
      for pos in range(2, count):
          cur = indices[pos]
          if cur - indices[pos - 1] != step:
              return indices

      if step > 0:
          return slice(first, last + 1, step)
      if last == 0:
          # A stop of -1 would wrap around, so leave the stop open instead.
          return slice(first, None, step)
      return slice(first, last - 1, step)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def maybe_booleans_to_slice(ndarray[uint8_t] mask):
      """
      Replace a boolean mask by an equivalent ``slice`` when possible.

      If the truthy entries form at most one contiguous run, the matching
      slice is returned.  As soon as a second run is found, the mask is
      returned instead, viewed as ``np.bool_``.
      """
      cdef:
          Py_ssize_t pos, count = len(mask)
          Py_ssize_t run_start = 0, run_stop = 0
          bint in_run_seen = False, run_closed = False

      for pos in range(count):
          if mask[pos]:
              if run_closed:
                  # A second run of True values: not expressible as a slice.
                  return mask.view(np.bool_)
              if not in_run_seen:
                  in_run_seen = True
                  run_start = pos
          elif in_run_seen and not run_closed:
              # First False after the run marks its (exclusive) end.
              run_stop = pos
              run_closed = True

      if not in_run_seen:
          return slice(0, 0)
      if run_closed:
          return slice(run_start, run_stop)
      return slice(run_start, None)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def array_equivalent_object(left: object[:], right: object[:]) -> bool:
      """
      Compare two 1-d object arrays element by element, treating matching
      NA values as equal.

      Pairs of ndarrays are compared recursively.  A pair where exactly one
      side is the NA singleton is unequal.  Everything else is compared with
      ``==`` and, failing that, ``is_matching_na(..., nan_matches_none=True)``.
      """
      cdef:
          Py_ssize_t pos, count = left.shape[0]
          object lhs, rhs

      for pos in range(count):
          lhs = left[pos]
          rhs = right[pos]

          # The pair is either unequal or both sides are NA-like.
          try:
              if PyArray_Check(lhs) and PyArray_Check(rhs):
                  if not array_equivalent_object(lhs, rhs):
                      return False
              elif (lhs is C_NA) ^ (rhs is C_NA):
                  return False
              elif not (
                  PyObject_RichCompareBool(lhs, rhs, Py_EQ)
                  or is_matching_na(lhs, rhs, nan_matches_none=True)
              ):
                  return False
          except ValueError:
              # Comparing a NumPy array with some other type can raise
              # ValueError; report "not equal" in the cases below and
              # re-raise otherwise.
              if cnp.PyArray_IsAnyScalar(lhs) != cnp.PyArray_IsAnyScalar(rhs):
                  # Scalars are only ever compared with scalars.
                  return False
              elif not (
                  cnp.PyArray_IsPythonScalar(lhs)
                  or cnp.PyArray_IsPythonScalar(rhs)
                  or isinstance(lhs, type(rhs))
                  or isinstance(rhs, type(lhs))
              ):
                  # Two non-scalars of unrelated types.
                  return False
              raise
      return True
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
      """
      Copy an object array element by element into a new array of `new_dtype`.

      When `new_dtype` equals ``'m8[ns]'``, null elements (per ``checknull``)
      are written as ``NPY_NAT``; otherwise each element is assigned as is.
      """
      cdef:
          Py_ssize_t pos, count = len(arr)
          object item
          bint is_datelike
          ndarray result

      is_datelike = new_dtype == 'm8[ns]'
      result = np.empty(count, dtype=new_dtype)
      for pos in range(count):
          item = arr[pos]
          if not is_datelike or not checknull(item):
              result[pos] = item
          else:
              result[pos] = NPY_NAT

      return result
  @cython.wraparound(False)
  @cython.boundscheck(False)
  cpdef ndarray[object] ensure_string_array(
          arr,
          object na_value=np.nan,
          bint convert_na_value=True,
          bint copy=True,
          bint skipna=True,
  ):
      """
      Returns a new numpy array with object dtype and only strings and na values.

      Parameters
      ----------
      arr : array-like
          The values to be converted to str, if needed.
      na_value : Any, default np.nan
          The value to use for na. For example, np.nan or pd.NA.
      convert_na_value : bool, default True
          If False, existing na values will be used unchanged in the new array.
      copy : bool, default True
          Whether to ensure that a new array is returned.
      skipna : bool, default True
          Whether or not to coerce nulls to their stringified form
          (e.g. if False, NaN becomes 'nan').

      Returns
      -------
      np.ndarray[object]
          An array with the input array's elements casted to str or nan-like.
      """
      cdef:
          # Length is taken from the input before any conversion below.
          Py_ssize_t pos = 0, count = len(arr)

      if hasattr(arr, "to_numpy"):
          # The dtype check excludes DataFrame.
          # GH#41409 TODO: not a great place for this
          if hasattr(arr, "dtype") and arr.dtype.kind in ("m", "M"):
              converted = arr.astype(str).astype(object)
              converted[arr.isna()] = na_value
              return converted
          arr = arr.to_numpy()
      elif not isinstance(arr, np.ndarray):
          arr = np.array(arr, dtype="object")

      result = np.asarray(arr, dtype="object")

      # np.asarray may hand back the very same object; copy if asked to.
      if copy and result is arr:
          result = result.copy()

      for pos in range(count):
          item = arr[pos]

          if isinstance(item, str):
              continue

          if checknull(item):
              if convert_na_value:
                  item = na_value
              result[pos] = item if skipna else str(item)
          else:
              result[pos] = str(item)

      return result
  def is_all_arraylike(obj: list) -> bool:
      """
      Should we treat these as levels of a MultiIndex, as opposed to Index items?

      True when every element is a list, an array (``util.is_array``) or has
      a ``_data`` attribute; an empty list also yields True.
      """
      cdef:
          Py_ssize_t pos, count = len(obj)
          object item

      for pos in range(count):
          item = obj[pos]
          # TODO: EA?
          # Tuples and frozensets are excluded on purpose: they may be
          # contained in an Index.
          if isinstance(item, list):
              continue
          if util.is_array(item) or hasattr(item, '_data'):
              continue
          return False

      return True
  635. # ------------------------------------------------------------------------------
  636. # Groupby-related functions
  637. # TODO: could do even better if we know something about the data. eg, index has
  638. # 1-min data, binner has 5-min data, then bins are just strides in index. This
  639. # is a general, O(max(len(values), len(binner))) method.
  @cython.boundscheck(False)
  @cython.wraparound(False)
  def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner,
                         object closed='left', bint hasnans=False):
      """
      Int64 (datetime64) version of generic python version in ``groupby.py``.

      Parameters
      ----------
      values : np.ndarray[np.int64]
          Presumably sorted ascending -- the linear scan relies on it, but
          only the first and last elements are validated; TODO confirm.
      binner : np.ndarray[np.int64]
          Bin edges; ``len(binner) - 1`` bins are produced.
      closed : {'left', 'right'}, default 'left'
          With ``'right'`` a value equal to a bin's right edge is counted in
          that bin; with any other value it goes to the next bin.
      hasnans : bool, default False
          If True, entries equal to ``NPY_NAT`` are removed from `values`
          before binning and accounted for as described below.

      Returns
      -------
      np.ndarray[np.int64]
          For each bin, the position in `values` one past its last element.
          When NaT entries were removed, every position is shifted by their
          count and that count is inserted at the front.

      Raises
      ------
      ValueError
          If `values` (after NaT removal) or `binner` is empty, if the first
          value lies before the first edge, or if the last value lies after
          the last edge.
      """
      cdef:
          Py_ssize_t lenidx, lenbin, i, j
          ndarray[int64_t] bins
          int64_t r_bin, nat_count
          bint right_closed = closed == 'right'

      nat_count = 0
      if hasnans:
          mask = values == NPY_NAT
          nat_count = np.sum(mask)
          values = values[~mask]

      lenidx = len(values)
      lenbin = len(binner)

      if lenidx <= 0 or lenbin <= 0:
          raise ValueError("Invalid length for values or for binner")

      # check binner fits data
      if values[0] < binner[0]:
          raise ValueError("Values falls before first bin")

      if values[lenidx - 1] > binner[lenbin - 1]:
          raise ValueError("Values falls after last bin")

      bins = np.empty(lenbin - 1, dtype=np.int64)

      j = 0  # index into values

      # Linear scan.  The two loops differ only in whether the right edge is
      # inclusive; they are kept separate so the inner loop has no branch.
      if right_closed:
          for i in range(lenbin - 1):
              r_bin = binner[i + 1]
              # advance past every value belonging to the current bin
              while j < lenidx and values[j] <= r_bin:
                  j += 1
              bins[i] = j
      else:
          for i in range(lenbin - 1):
              r_bin = binner[i + 1]
              # advance past every value belonging to the current bin
              while j < lenidx and values[j] < r_bin:
                  j += 1
              bins[i] = j

      if nat_count > 0:
          # shift bins by the number of NaT
          bins = bins + nat_count
          bins = np.insert(bins, 0, nat_count)

      return bins
  691. @cython.boundscheck(False)
  692. @cython.wraparound(False)
  693. def get_level_sorter(
  694. ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
  695. ) -> ndarray:
  696. """
  697. Argsort for a single level of a multi-index, keeping the order of higher
  698. levels unchanged. `starts` points to starts of same-key indices w.r.t
  699. to leading levels; equivalent to:
  700. np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
  701. + starts[i] for i in range(len(starts) - 1)])
  702. Parameters
  703. ----------
  704. codes : np.ndarray[int64_t, ndim=1]
  705. starts : np.ndarray[intp, ndim=1]
  706. Returns
  707. -------
  708. np.ndarray[np.int, ndim=1]
  709. """
  710. cdef:
  711. Py_ssize_t i, l, r
  712. ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp)
  713. for i in range(len(starts) - 1):
  714. l, r = starts[i], starts[i + 1]
  715. out[l:r] = l + codes[l:r].argsort(kind='mergesort')
  716. return out
  717. @cython.boundscheck(False)
  718. @cython.wraparound(False)
  719. def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
  720. const intp_t[:] labels,
  721. Py_ssize_t max_bin,
  722. int axis):
  723. cdef:
  724. Py_ssize_t i, j, k, n
  725. ndarray[int64_t, ndim=2] counts
  726. assert (axis == 0 or axis == 1)
  727. n, k = (<object>mask).shape
  728. if axis == 0:
  729. counts = np.zeros((max_bin, k), dtype='i8')
  730. with nogil:
  731. for i in range(n):
  732. for j in range(k):
  733. if mask[i, j]:
  734. counts[labels[i], j] += 1
  735. else: # axis == 1
  736. counts = np.zeros((n, max_bin), dtype='i8')
  737. with nogil:
  738. for i in range(n):
  739. for j in range(k):
  740. if mask[i, j]:
  741. counts[i, labels[j]] += 1
  742. return counts
  743. @cython.wraparound(False)
  744. @cython.boundscheck(False)
  745. def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
  746. cdef:
  747. Py_ssize_t i, group_size, n, start
  748. intp_t lab
  749. int64_t[::1] starts, ends
  750. n = len(labels)
  751. starts = np.zeros(ngroups, dtype=np.int64)
  752. ends = np.zeros(ngroups, dtype=np.int64)
  753. start = 0
  754. group_size = 0
  755. with nogil:
  756. for i in range(n):
  757. lab = labels[i]
  758. if lab < 0:
  759. start += 1
  760. else:
  761. group_size += 1
  762. if i == n - 1 or lab != labels[i + 1]:
  763. starts[lab] = start
  764. ends[lab] = start + group_size
  765. start += group_size
  766. group_size = 0
  767. return np.asarray(starts), np.asarray(ends)
  768. def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys,
  769. list sorted_labels) -> dict:
  770. """
  771. Parameters
  772. ----------
  773. index : ndarray[intp]
  774. labels : ndarray[int64]
  775. keys : list
  776. sorted_labels : list[ndarray[int64]]
  777. """
  778. cdef:
  779. Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
  780. dict result = {}
  781. object tup
  782. k = len(keys)
  783. # Start at the first non-null entry
  784. j = 0
  785. for j in range(0, n):
  786. if labels[j] != -1:
  787. break
  788. else:
  789. return result
  790. cur = labels[j]
  791. start = j
  792. for i in range(j+1, n):
  793. lab = labels[i]
  794. if lab != cur:
  795. if lab != -1:
  796. if k == 1:
  797. # When k = 1 we do not want to return a tuple as key
  798. tup = keys[0][sorted_labels[0][i - 1]]
  799. else:
  800. tup = PyTuple_New(k)
  801. for j in range(k):
  802. val = keys[j][sorted_labels[j][i - 1]]
  803. PyTuple_SET_ITEM(tup, j, val)
  804. Py_INCREF(val)
  805. result[tup] = index[start:i]
  806. start = i
  807. cur = lab
  808. if k == 1:
  809. # When k = 1 we do not want to return a tuple as key
  810. tup = keys[0][sorted_labels[0][n - 1]]
  811. else:
  812. tup = PyTuple_New(k)
  813. for j in range(k):
  814. val = keys[j][sorted_labels[j][n - 1]]
  815. PyTuple_SET_ITEM(tup, j, val)
  816. Py_INCREF(val)
  817. result[tup] = index[start:]
  818. return result
  819. # core.common import for fast inference checks
  820. def is_float(obj: object) -> bool:
  821. """
  822. Return True if given object is float.
  823. Returns
  824. -------
  825. bool
  826. """
  827. return util.is_float_object(obj)
  828. def is_integer(obj: object) -> bool:
  829. """
  830. Return True if given object is integer.
  831. Returns
  832. -------
  833. bool
  834. """
  835. return util.is_integer_object(obj)
  836. def is_bool(obj: object) -> bool:
  837. """
  838. Return True if given object is boolean.
  839. Returns
  840. -------
  841. bool
  842. """
  843. return util.is_bool_object(obj)
  844. def is_complex(obj: object) -> bool:
  845. """
  846. Return True if given object is complex.
  847. Returns
  848. -------
  849. bool
  850. """
  851. return util.is_complex_object(obj)
  852. cpdef bint is_decimal(object obj):
  853. return isinstance(obj, Decimal)
  854. cpdef bint is_interval(object obj):
  855. return getattr(obj, '_typ', '_typ') == 'interval'
  856. def is_period(val: object) -> bool:
  857. """
  858. Return True if given object is Period.
  859. Returns
  860. -------
  861. bool
  862. """
  863. return is_period_object(val)
  864. def is_list_like(obj: object, allow_sets: bool = True) -> bool:
  865. """
  866. Check if the object is list-like.
  867. Objects that are considered list-like are for example Python
  868. lists, tuples, sets, NumPy arrays, and Pandas Series.
  869. Strings and datetime objects, however, are not considered list-like.
  870. Parameters
  871. ----------
  872. obj : object
  873. Object to check.
  874. allow_sets : bool, default True
  875. If this parameter is False, sets will not be considered list-like.
  876. Returns
  877. -------
  878. bool
  879. Whether `obj` has list-like properties.
  880. Examples
  881. --------
  882. >>> is_list_like([1, 2, 3])
  883. True
  884. >>> is_list_like({1, 2, 3})
  885. True
  886. >>> is_list_like(datetime(2017, 1, 1))
  887. False
  888. >>> is_list_like("foo")
  889. False
  890. >>> is_list_like(1)
  891. False
  892. >>> is_list_like(np.array([2]))
  893. True
  894. >>> is_list_like(np.array(2))
  895. False
  896. """
  897. return c_is_list_like(obj, allow_sets)
  898. cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
  899. return (
  900. # equiv: `isinstance(obj, abc.Iterable)`
  901. hasattr(obj, "__iter__") and not isinstance(obj, type)
  902. # we do not count strings/unicode/bytes as list-like
  903. and not isinstance(obj, (str, bytes))
  904. # exclude zero-dimensional numpy arrays, effectively scalars
  905. and not cnp.PyArray_IsZeroDim(obj)
  906. # exclude sets if allow_sets is False
  907. and not (allow_sets is False and isinstance(obj, abc.Set))
  908. )
  909. _TYPE_MAP = {
  910. "categorical": "categorical",
  911. "category": "categorical",
  912. "int8": "integer",
  913. "int16": "integer",
  914. "int32": "integer",
  915. "int64": "integer",
  916. "i": "integer",
  917. "uint8": "integer",
  918. "uint16": "integer",
  919. "uint32": "integer",
  920. "uint64": "integer",
  921. "u": "integer",
  922. "float32": "floating",
  923. "float64": "floating",
  924. "f": "floating",
  925. "complex64": "complex",
  926. "complex128": "complex",
  927. "c": "complex",
  928. "string": "string",
  929. str: "string",
  930. "S": "bytes",
  931. "U": "string",
  932. "bool": "boolean",
  933. "b": "boolean",
  934. "datetime64[ns]": "datetime64",
  935. "M": "datetime64",
  936. "timedelta64[ns]": "timedelta64",
  937. "m": "timedelta64",
  938. "interval": "interval",
  939. Period: "period",
  940. }
  941. # types only exist on certain platform
  942. try:
  943. np.float128
  944. _TYPE_MAP['float128'] = 'floating'
  945. except AttributeError:
  946. pass
  947. try:
  948. np.complex256
  949. _TYPE_MAP['complex256'] = 'complex'
  950. except AttributeError:
  951. pass
  952. try:
  953. np.float16
  954. _TYPE_MAP['float16'] = 'floating'
  955. except AttributeError:
  956. pass
  957. @cython.internal
  958. cdef class Seen:
  959. """
  960. Class for keeping track of the types of elements
  961. encountered when trying to perform type conversions.
  962. """
  963. cdef:
  964. bint int_ # seen_int
  965. bint nat_ # seen nat
  966. bint bool_ # seen_bool
  967. bint null_ # seen_null
  968. bint nan_ # seen_np.nan
  969. bint uint_ # seen_uint (unsigned integer)
  970. bint sint_ # seen_sint (signed integer)
  971. bint float_ # seen_float
  972. bint object_ # seen_object
  973. bint complex_ # seen_complex
  974. bint datetime_ # seen_datetime
  975. bint coerce_numeric # coerce data to numeric
  976. bint timedelta_ # seen_timedelta
  977. bint datetimetz_ # seen_datetimetz
  978. bint period_ # seen_period
  979. bint interval_ # seen_interval
  980. def __cinit__(self, bint coerce_numeric=False):
  981. """
  982. Initialize a Seen instance.
  983. Parameters
  984. ----------
  985. coerce_numeric : bool, default False
  986. Whether or not to force conversion to a numeric data type if
  987. initial methods to convert to numeric fail.
  988. """
  989. self.int_ = False
  990. self.nat_ = False
  991. self.bool_ = False
  992. self.null_ = False
  993. self.nan_ = False
  994. self.uint_ = False
  995. self.sint_ = False
  996. self.float_ = False
  997. self.object_ = False
  998. self.complex_ = False
  999. self.datetime_ = False
  1000. self.timedelta_ = False
  1001. self.datetimetz_ = False
  1002. self.period_ = False
  1003. self.interval_ = False
  1004. self.coerce_numeric = coerce_numeric
  1005. cdef inline bint check_uint64_conflict(self) except -1:
  1006. """
  1007. Check whether we can safely convert a uint64 array to a numeric dtype.
  1008. There are two cases when conversion to numeric dtype with a uint64
  1009. array is not safe (and will therefore not be performed)
  1010. 1) A NaN element is encountered.
  1011. uint64 cannot be safely cast to float64 due to truncation issues
  1012. at the extreme ends of the range.
  1013. 2) A negative number is encountered.
  1014. There is no numerical dtype that can hold both negative numbers
  1015. and numbers greater than INT64_MAX. Hence, at least one number
  1016. will be improperly cast if we convert to a numeric dtype.
  1017. Returns
  1018. -------
  1019. bool
  1020. Whether or not we should return the original input array to avoid
  1021. data truncation.
  1022. Raises
  1023. ------
  1024. ValueError
  1025. uint64 elements were detected, and at least one of the
  1026. two conflict cases was also detected. However, we are
  1027. trying to force conversion to a numeric dtype.
  1028. """
  1029. return (self.uint_ and (self.null_ or self.sint_)
  1030. and not self.coerce_numeric)
  1031. cdef inline saw_null(self):
  1032. """
  1033. Set flags indicating that a null value was encountered.
  1034. """
  1035. self.null_ = True
  1036. self.float_ = True
  1037. cdef saw_int(self, object val):
  1038. """
  1039. Set flags indicating that an integer value was encountered.
  1040. In addition to setting a flag that an integer was seen, we
  1041. also set two flags depending on the type of integer seen:
  1042. 1) sint_ : a negative (signed) number in the
  1043. range of [-2**63, 0) was encountered
  1044. 2) uint_ : a positive number in the range of
  1045. [2**63, 2**64) was encountered
  1046. Parameters
  1047. ----------
  1048. val : Python int
  1049. Value with which to set the flags.
  1050. """
  1051. self.int_ = True
  1052. self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
  1053. self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
  1054. @property
  1055. def numeric_(self):
  1056. return self.complex_ or self.float_ or self.int_
  1057. @property
  1058. def is_bool(self):
  1059. return not (self.datetime_ or self.numeric_ or self.timedelta_
  1060. or self.nat_)
  1061. @property
  1062. def is_float_or_complex(self):
  1063. return not (self.bool_ or self.datetime_ or self.timedelta_
  1064. or self.nat_)
  1065. cdef object _try_infer_map(object dtype):
  1066. """
  1067. If its in our map, just return the dtype.
  1068. """
  1069. cdef:
  1070. object val
  1071. str attr
  1072. for attr in ["name", "kind", "base", "type"]:
  1073. val = getattr(dtype, attr, None)
  1074. if val in _TYPE_MAP:
  1075. return _TYPE_MAP[val]
  1076. return None
  1077. def infer_dtype(value: object, skipna: bool = True) -> str:
  1078. """
  1079. Efficiently infer the type of a passed val, or list-like
  1080. array of values. Return a string describing the type.
  1081. Parameters
  1082. ----------
  1083. value : scalar, list, ndarray, or pandas type
  1084. skipna : bool, default True
  1085. Ignore NaN values when inferring the type.
  1086. Returns
  1087. -------
  1088. str
  1089. Describing the common type of the input data.
  1090. Results can include:
  1091. - string
  1092. - bytes
  1093. - floating
  1094. - integer
  1095. - mixed-integer
  1096. - mixed-integer-float
  1097. - decimal
  1098. - complex
  1099. - categorical
  1100. - boolean
  1101. - datetime64
  1102. - datetime
  1103. - date
  1104. - timedelta64
  1105. - timedelta
  1106. - time
  1107. - period
  1108. - mixed
  1109. - unknown-array
  1110. Raises
  1111. ------
  1112. TypeError
  1113. If ndarray-like but cannot infer the dtype
  1114. Notes
  1115. -----
  1116. - 'mixed' is the catchall for anything that is not otherwise
  1117. specialized
  1118. - 'mixed-integer-float' are floats and integers
  1119. - 'mixed-integer' are integers mixed with non-integers
  1120. - 'unknown-array' is the catchall for something that *is* an array (has
  1121. a dtype attribute), but has a dtype unknown to pandas (e.g. external
  1122. extension array)
  1123. Examples
  1124. --------
  1125. >>> infer_dtype(['foo', 'bar'])
  1126. 'string'
  1127. >>> infer_dtype(['a', np.nan, 'b'], skipna=True)
  1128. 'string'
  1129. >>> infer_dtype(['a', np.nan, 'b'], skipna=False)
  1130. 'mixed'
  1131. >>> infer_dtype([b'foo', b'bar'])
  1132. 'bytes'
  1133. >>> infer_dtype([1, 2, 3])
  1134. 'integer'
  1135. >>> infer_dtype([1, 2, 3.5])
  1136. 'mixed-integer-float'
  1137. >>> infer_dtype([1.0, 2.0, 3.5])
  1138. 'floating'
  1139. >>> infer_dtype(['a', 1])
  1140. 'mixed-integer'
  1141. >>> infer_dtype([Decimal(1), Decimal(2.0)])
  1142. 'decimal'
  1143. >>> infer_dtype([True, False])
  1144. 'boolean'
  1145. >>> infer_dtype([True, False, np.nan])
  1146. 'boolean'
  1147. >>> infer_dtype([pd.Timestamp('20130101')])
  1148. 'datetime'
  1149. >>> infer_dtype([datetime.date(2013, 1, 1)])
  1150. 'date'
  1151. >>> infer_dtype([np.datetime64('2013-01-01')])
  1152. 'datetime64'
  1153. >>> infer_dtype([datetime.timedelta(0, 1, 1)])
  1154. 'timedelta'
  1155. >>> infer_dtype(pd.Series(list('aabc')).astype('category'))
  1156. 'categorical'
  1157. """
  1158. cdef:
  1159. Py_ssize_t i, n
  1160. object val
  1161. ndarray values
  1162. bint seen_pdnat = False
  1163. bint seen_val = False
  1164. if util.is_array(value):
  1165. values = value
  1166. elif hasattr(value, "inferred_type") and skipna is False:
  1167. # Index, use the cached attribute if possible, populate the cache otherwise
  1168. return value.inferred_type
  1169. elif hasattr(value, "dtype"):
  1170. # this will handle ndarray-like
  1171. # e.g. categoricals
  1172. dtype = value.dtype
  1173. if not isinstance(dtype, np.dtype):
  1174. inferred = _try_infer_map(value.dtype)
  1175. if inferred is not None:
  1176. return inferred
  1177. return "unknown-array"
  1178. # Unwrap Series/Index
  1179. values = np.asarray(value)
  1180. else:
  1181. if not isinstance(value, list):
  1182. value = list(value)
  1183. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  1184. values = construct_1d_object_array_from_listlike(value)
  1185. # make contiguous
  1186. # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup
  1187. values = values.ravel(order="K")
  1188. val = _try_infer_map(values.dtype)
  1189. if val is not None:
  1190. return val
  1191. if values.dtype != np.object_:
  1192. values = values.astype("O")
  1193. if skipna:
  1194. values = values[~isnaobj(values)]
  1195. n = len(values)
  1196. if n == 0:
  1197. return "empty"
  1198. # try to use a valid value
  1199. for i in range(n):
  1200. val = values[i]
  1201. # do not use is_null_datetimelike to keep
  1202. # np.datetime64('nat') and np.timedelta64('nat')
  1203. if val is None or util.is_nan(val):
  1204. pass
  1205. elif val is NaT:
  1206. seen_pdnat = True
  1207. else:
  1208. seen_val = True
  1209. break
  1210. # if all values are nan/NaT
  1211. if seen_val is False and seen_pdnat is True:
  1212. return "datetime"
  1213. # float/object nan is handled in latter logic
  1214. if util.is_datetime64_object(val):
  1215. if is_datetime64_array(values):
  1216. return "datetime64"
  1217. elif is_timedelta(val):
  1218. if is_timedelta_or_timedelta64_array(values):
  1219. return "timedelta"
  1220. elif util.is_integer_object(val):
  1221. # ordering matters here; this check must come after the is_timedelta
  1222. # check otherwise numpy timedelta64 objects would come through here
  1223. if is_integer_array(values):
  1224. return "integer"
  1225. elif is_integer_float_array(values):
  1226. if is_integer_na_array(values):
  1227. return "integer-na"
  1228. else:
  1229. return "mixed-integer-float"
  1230. return "mixed-integer"
  1231. elif PyDateTime_Check(val):
  1232. if is_datetime_array(values, skipna=skipna):
  1233. return "datetime"
  1234. elif is_date_array(values, skipna=skipna):
  1235. return "date"
  1236. elif PyDate_Check(val):
  1237. if is_date_array(values, skipna=skipna):
  1238. return "date"
  1239. elif PyTime_Check(val):
  1240. if is_time_array(values, skipna=skipna):
  1241. return "time"
  1242. elif is_decimal(val):
  1243. if is_decimal_array(values):
  1244. return "decimal"
  1245. elif util.is_complex_object(val):
  1246. if is_complex_array(values):
  1247. return "complex"
  1248. elif util.is_float_object(val):
  1249. if is_float_array(values):
  1250. return "floating"
  1251. elif is_integer_float_array(values):
  1252. if is_integer_na_array(values):
  1253. return "integer-na"
  1254. else:
  1255. return "mixed-integer-float"
  1256. elif util.is_bool_object(val):
  1257. if is_bool_array(values, skipna=skipna):
  1258. return "boolean"
  1259. elif isinstance(val, str):
  1260. if is_string_array(values, skipna=skipna):
  1261. return "string"
  1262. elif isinstance(val, bytes):
  1263. if is_bytes_array(values, skipna=skipna):
  1264. return "bytes"
  1265. elif is_period_object(val):
  1266. if is_period_array(values):
  1267. return "period"
  1268. elif is_interval(val):
  1269. if is_interval_array(values):
  1270. return "interval"
  1271. for i in range(n):
  1272. val = values[i]
  1273. if util.is_integer_object(val):
  1274. return "mixed-integer"
  1275. return "mixed"
  1276. def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
  1277. """
  1278. Infer if we have a datetime or timedelta array.
  1279. - date: we have *only* date and maybe strings, nulls
  1280. - datetime: we have *only* datetimes and maybe strings, nulls
  1281. - timedelta: we have *only* timedeltas and maybe strings, nulls
  1282. - nat: we do not have *any* date, datetimes or timedeltas, but do have
  1283. at least a NaT
  1284. - mixed: other objects (strings, a mix of tz-aware and tz-naive, or
  1285. actual objects)
  1286. Parameters
  1287. ----------
  1288. arr : ndarray[object]
  1289. Returns
  1290. -------
  1291. str: {datetime, timedelta, date, nat, mixed}
  1292. bool
  1293. """
  1294. cdef:
  1295. Py_ssize_t i, n = len(arr)
  1296. bint seen_timedelta = False, seen_date = False, seen_datetime = False
  1297. bint seen_tz_aware = False, seen_tz_naive = False
  1298. bint seen_nat = False, seen_str = False
  1299. bint seen_period = False, seen_interval = False
  1300. list objs = []
  1301. object v
  1302. for i in range(n):
  1303. v = arr[i]
  1304. if isinstance(v, str):
  1305. objs.append(v)
  1306. seen_str = True
  1307. if len(objs) == 3:
  1308. break
  1309. elif v is None or util.is_nan(v):
  1310. # nan or None
  1311. pass
  1312. elif v is NaT:
  1313. seen_nat = True
  1314. elif PyDateTime_Check(v):
  1315. # datetime
  1316. seen_datetime = True
  1317. # disambiguate between tz-naive and tz-aware
  1318. if v.tzinfo is None:
  1319. seen_tz_naive = True
  1320. else:
  1321. seen_tz_aware = True
  1322. if seen_tz_naive and seen_tz_aware:
  1323. return "mixed", seen_str
  1324. elif util.is_datetime64_object(v):
  1325. # np.datetime64
  1326. seen_datetime = True
  1327. elif PyDate_Check(v):
  1328. seen_date = True
  1329. elif is_timedelta(v):
  1330. # timedelta, or timedelta64
  1331. seen_timedelta = True
  1332. elif is_period_object(v):
  1333. seen_period = True
  1334. break
  1335. elif is_interval(v):
  1336. seen_interval = True
  1337. break
  1338. else:
  1339. return "mixed", seen_str
  1340. if seen_period:
  1341. if is_period_array(arr):
  1342. return "period", seen_str
  1343. return "mixed", seen_str
  1344. if seen_interval:
  1345. if is_interval_array(arr):
  1346. return "interval", seen_str
  1347. return "mixed", seen_str
  1348. if seen_date and not (seen_datetime or seen_timedelta):
  1349. return "date", seen_str
  1350. elif seen_datetime and not seen_timedelta:
  1351. return "datetime", seen_str
  1352. elif seen_timedelta and not seen_datetime:
  1353. return "timedelta", seen_str
  1354. elif seen_nat:
  1355. return "nat", seen_str
  1356. # short-circuit by trying to
  1357. # actually convert these strings
  1358. # this is for performance as we don't need to try
  1359. # convert *every* string array
  1360. if len(objs):
  1361. try:
  1362. # require_iso8601 as in maybe_infer_to_datetimelike
  1363. array_to_datetime(objs, errors="raise", require_iso8601=True)
  1364. return "datetime", seen_str
  1365. except (ValueError, TypeError):
  1366. pass
  1367. # we are *not* going to infer from strings
  1368. # for timedelta as too much ambiguity
  1369. return "mixed", seen_str
  1370. cdef inline bint is_timedelta(object o):
  1371. return PyDelta_Check(o) or util.is_timedelta64_object(o)
  1372. @cython.internal
  1373. cdef class Validator:
  1374. cdef:
  1375. Py_ssize_t n
  1376. dtype dtype
  1377. bint skipna
  1378. def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
  1379. bint skipna=False):
  1380. self.n = n
  1381. self.dtype = dtype
  1382. self.skipna = skipna
  1383. cdef bint validate(self, ndarray values) except -1:
  1384. if not self.n:
  1385. return False
  1386. if self.is_array_typed():
  1387. # i.e. this ndarray is already of the desired dtype
  1388. return True
  1389. elif self.dtype.type_num == NPY_OBJECT:
  1390. if self.skipna:
  1391. return self._validate_skipna(values)
  1392. else:
  1393. return self._validate(values)
  1394. else:
  1395. return False
  1396. @cython.wraparound(False)
  1397. @cython.boundscheck(False)
  1398. cdef bint _validate(self, ndarray values) except -1:
  1399. cdef:
  1400. Py_ssize_t i
  1401. Py_ssize_t n = self.n
  1402. for i in range(n):
  1403. if not self.is_valid(values[i]):
  1404. return False
  1405. return self.finalize_validate()
  1406. @cython.wraparound(False)
  1407. @cython.boundscheck(False)
  1408. cdef bint _validate_skipna(self, ndarray values) except -1:
  1409. cdef:
  1410. Py_ssize_t i
  1411. Py_ssize_t n = self.n
  1412. for i in range(n):
  1413. if not self.is_valid_skipna(values[i]):
  1414. return False
  1415. return self.finalize_validate_skipna()
  1416. cdef bint is_valid(self, object value) except -1:
  1417. return self.is_value_typed(value)
  1418. cdef bint is_valid_skipna(self, object value) except -1:
  1419. return self.is_valid(value) or self.is_valid_null(value)
  1420. cdef bint is_value_typed(self, object value) except -1:
  1421. raise NotImplementedError(f"{type(self).__name__} child class "
  1422. "must define is_value_typed")
  1423. cdef bint is_valid_null(self, object value) except -1:
  1424. return value is None or value is C_NA or util.is_nan(value)
  1425. cdef bint is_array_typed(self) except -1:
  1426. return False
  1427. cdef inline bint finalize_validate(self):
  1428. return True
  1429. cdef bint finalize_validate_skipna(self):
  1430. """
  1431. If we _only_ saw non-dtype-specific NA values, even if they are valid
  1432. for this dtype, we do not infer this dtype.
  1433. """
  1434. # TODO(phillipc): Remove the existing validate methods and replace them
  1435. # with the skipna versions upon full deprecation of skipna=False
  1436. return True
  1437. @cython.internal
  1438. cdef class BoolValidator(Validator):
  1439. cdef inline bint is_value_typed(self, object value) except -1:
  1440. return util.is_bool_object(value)
  1441. cdef inline bint is_array_typed(self) except -1:
  1442. return issubclass(self.dtype.type, np.bool_)
  1443. cpdef bint is_bool_array(ndarray values, bint skipna=False):
  1444. cdef:
  1445. BoolValidator validator = BoolValidator(len(values),
  1446. values.dtype,
  1447. skipna=skipna)
  1448. return validator.validate(values)
  1449. @cython.internal
  1450. cdef class IntegerValidator(Validator):
  1451. cdef inline bint is_value_typed(self, object value) except -1:
  1452. return util.is_integer_object(value)
  1453. cdef inline bint is_array_typed(self) except -1:
  1454. return issubclass(self.dtype.type, np.integer)
  1455. # Note: only python-exposed for tests
  1456. cpdef bint is_integer_array(ndarray values):
  1457. cdef:
  1458. IntegerValidator validator = IntegerValidator(len(values),
  1459. values.dtype)
  1460. return validator.validate(values)
  1461. @cython.internal
  1462. cdef class IntegerNaValidator(Validator):
  1463. cdef inline bint is_value_typed(self, object value) except -1:
  1464. return (util.is_integer_object(value)
  1465. or (util.is_nan(value) and util.is_float_object(value)))
  1466. cdef bint is_integer_na_array(ndarray values):
  1467. cdef:
  1468. IntegerNaValidator validator = IntegerNaValidator(len(values),
  1469. values.dtype)
  1470. return validator.validate(values)
  1471. @cython.internal
  1472. cdef class IntegerFloatValidator(Validator):
  1473. cdef inline bint is_value_typed(self, object value) except -1:
  1474. return util.is_integer_object(value) or util.is_float_object(value)
  1475. cdef inline bint is_array_typed(self) except -1:
  1476. return issubclass(self.dtype.type, np.integer)
  1477. cdef bint is_integer_float_array(ndarray values):
  1478. cdef:
  1479. IntegerFloatValidator validator = IntegerFloatValidator(len(values),
  1480. values.dtype)
  1481. return validator.validate(values)
  1482. @cython.internal
  1483. cdef class FloatValidator(Validator):
  1484. cdef inline bint is_value_typed(self, object value) except -1:
  1485. return util.is_float_object(value)
  1486. cdef inline bint is_array_typed(self) except -1:
  1487. return issubclass(self.dtype.type, np.floating)
  1488. # Note: only python-exposed for tests
  1489. cpdef bint is_float_array(ndarray values):
  1490. cdef:
  1491. FloatValidator validator = FloatValidator(len(values), values.dtype)
  1492. return validator.validate(values)
  1493. @cython.internal
  1494. cdef class ComplexValidator(Validator):
  1495. cdef inline bint is_value_typed(self, object value) except -1:
  1496. return (
  1497. util.is_complex_object(value)
  1498. or (util.is_float_object(value) and is_nan(value))
  1499. )
  1500. cdef inline bint is_array_typed(self) except -1:
  1501. return issubclass(self.dtype.type, np.complexfloating)
  1502. cdef bint is_complex_array(ndarray values):
  1503. cdef:
  1504. ComplexValidator validator = ComplexValidator(len(values), values.dtype)
  1505. return validator.validate(values)
  1506. @cython.internal
  1507. cdef class DecimalValidator(Validator):
  1508. cdef inline bint is_value_typed(self, object value) except -1:
  1509. return is_decimal(value)
  1510. cdef bint is_decimal_array(ndarray values):
  1511. cdef:
  1512. DecimalValidator validator = DecimalValidator(len(values), values.dtype)
  1513. return validator.validate(values)
  1514. @cython.internal
  1515. cdef class StringValidator(Validator):
  1516. cdef inline bint is_value_typed(self, object value) except -1:
  1517. return isinstance(value, str)
  1518. cdef inline bint is_array_typed(self) except -1:
  1519. return issubclass(self.dtype.type, np.str_)
  1520. cdef bint is_valid_null(self, object value) except -1:
  1521. # We deliberately exclude None / NaN here since StringArray uses NA
  1522. return value is C_NA
  1523. cpdef bint is_string_array(ndarray values, bint skipna=False):
  1524. cdef:
  1525. StringValidator validator = StringValidator(len(values),
  1526. values.dtype,
  1527. skipna=skipna)
  1528. return validator.validate(values)
  1529. @cython.internal
  1530. cdef class BytesValidator(Validator):
  1531. cdef inline bint is_value_typed(self, object value) except -1:
  1532. return isinstance(value, bytes)
  1533. cdef inline bint is_array_typed(self) except -1:
  1534. return issubclass(self.dtype.type, np.bytes_)
  1535. cdef bint is_bytes_array(ndarray values, bint skipna=False):
  1536. cdef:
  1537. BytesValidator validator = BytesValidator(len(values), values.dtype,
  1538. skipna=skipna)
  1539. return validator.validate(values)
  1540. @cython.internal
  1541. cdef class TemporalValidator(Validator):
  1542. cdef:
  1543. Py_ssize_t generic_null_count
  1544. def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
  1545. bint skipna=False):
  1546. self.n = n
  1547. self.dtype = dtype
  1548. self.skipna = skipna
  1549. self.generic_null_count = 0
  1550. cdef inline bint is_valid(self, object value) except -1:
  1551. return self.is_value_typed(value) or self.is_valid_null(value)
  1552. cdef bint is_valid_null(self, object value) except -1:
  1553. raise NotImplementedError(f"{type(self).__name__} child class "
  1554. "must define is_valid_null")
  1555. cdef inline bint is_valid_skipna(self, object value) except -1:
  1556. cdef:
  1557. bint is_typed_null = self.is_valid_null(value)
  1558. bint is_generic_null = value is None or util.is_nan(value)
  1559. self.generic_null_count += is_typed_null and is_generic_null
  1560. return self.is_value_typed(value) or is_typed_null or is_generic_null
  1561. cdef inline bint finalize_validate_skipna(self):
  1562. """
  1563. If we _only_ saw non-dtype-specific NA values, even if they are valid
  1564. for this dtype, we do not infer this dtype.
  1565. """
  1566. return self.generic_null_count != self.n
  1567. @cython.internal
  1568. cdef class DatetimeValidator(TemporalValidator):
  1569. cdef bint is_value_typed(self, object value) except -1:
  1570. return PyDateTime_Check(value)
  1571. cdef inline bint is_valid_null(self, object value) except -1:
  1572. return is_null_datetime64(value)
  1573. cpdef bint is_datetime_array(ndarray values, bint skipna=True):
  1574. cdef:
  1575. DatetimeValidator validator = DatetimeValidator(len(values),
  1576. skipna=skipna)
  1577. return validator.validate(values)
  1578. @cython.internal
  1579. cdef class Datetime64Validator(DatetimeValidator):
  1580. cdef inline bint is_value_typed(self, object value) except -1:
  1581. return util.is_datetime64_object(value)
  1582. # Note: only python-exposed for tests
  1583. cpdef bint is_datetime64_array(ndarray values):
  1584. cdef:
  1585. Datetime64Validator validator = Datetime64Validator(len(values),
  1586. skipna=True)
  1587. return validator.validate(values)
  1588. @cython.internal
  1589. cdef class AnyDatetimeValidator(DatetimeValidator):
  1590. cdef inline bint is_value_typed(self, object value) except -1:
  1591. return util.is_datetime64_object(value) or (
  1592. PyDateTime_Check(value) and value.tzinfo is None
  1593. )
  1594. cdef bint is_datetime_or_datetime64_array(ndarray values):
  1595. cdef:
  1596. AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
  1597. skipna=True)
  1598. return validator.validate(values)
  1599. # Note: only python-exposed for tests
  1600. def is_datetime_with_singletz_array(values: ndarray) -> bool:
  1601. """
  1602. Check values have the same tzinfo attribute.
  1603. Doesn't check values are datetime-like types.
  1604. """
  1605. cdef:
  1606. Py_ssize_t i = 0, j, n = len(values)
  1607. object base_val, base_tz, val, tz
  1608. if n == 0:
  1609. return False
  1610. # Get a reference timezone to compare with the rest of the tzs in the array
  1611. for i in range(n):
  1612. base_val = values[i]
  1613. if base_val is not NaT and base_val is not None and not util.is_nan(base_val):
  1614. base_tz = getattr(base_val, 'tzinfo', None)
  1615. break
  1616. for j in range(i, n):
  1617. # Compare val's timezone with the reference timezone
  1618. # NaT can coexist with tz-aware datetimes, so skip if encountered
  1619. val = values[j]
  1620. if val is not NaT and val is not None and not util.is_nan(val):
  1621. tz = getattr(val, 'tzinfo', None)
  1622. if not tz_compare(base_tz, tz):
  1623. return False
  1624. # Note: we should only be called if a tzaware datetime has been seen,
  1625. # so base_tz should always be set at this point.
  1626. return True
  1627. @cython.internal
  1628. cdef class TimedeltaValidator(TemporalValidator):
  1629. cdef bint is_value_typed(self, object value) except -1:
  1630. return PyDelta_Check(value)
  1631. cdef inline bint is_valid_null(self, object value) except -1:
  1632. return is_null_timedelta64(value)
  1633. @cython.internal
  1634. cdef class AnyTimedeltaValidator(TimedeltaValidator):
  1635. cdef inline bint is_value_typed(self, object value) except -1:
  1636. return is_timedelta(value)
  1637. # Note: only python-exposed for tests
  1638. cpdef bint is_timedelta_or_timedelta64_array(ndarray values):
  1639. """
  1640. Infer with timedeltas and/or nat/none.
  1641. """
  1642. cdef:
  1643. AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
  1644. skipna=True)
  1645. return validator.validate(values)
  1646. @cython.internal
  1647. cdef class DateValidator(Validator):
  1648. cdef inline bint is_value_typed(self, object value) except -1:
  1649. return PyDate_Check(value)
  1650. # Note: only python-exposed for tests
  1651. cpdef bint is_date_array(ndarray values, bint skipna=False):
  1652. cdef:
  1653. DateValidator validator = DateValidator(len(values), skipna=skipna)
  1654. return validator.validate(values)
  1655. @cython.internal
  1656. cdef class TimeValidator(Validator):
  1657. cdef inline bint is_value_typed(self, object value) except -1:
  1658. return PyTime_Check(value)
  1659. # Note: only python-exposed for tests
  1660. cpdef bint is_time_array(ndarray values, bint skipna=False):
  1661. cdef:
  1662. TimeValidator validator = TimeValidator(len(values), skipna=skipna)
  1663. return validator.validate(values)
  1664. cdef bint is_period_array(ndarray[object] values):
  1665. """
  1666. Is this an ndarray of Period objects (or NaT) with a single `freq`?
  1667. """
  1668. cdef:
  1669. Py_ssize_t i, n = len(values)
  1670. int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
  1671. object val
  1672. if len(values) == 0:
  1673. return False
  1674. for val in values:
  1675. if is_period_object(val):
  1676. if dtype_code == -10000:
  1677. dtype_code = val._dtype._dtype_code
  1678. elif dtype_code != val._dtype._dtype_code:
  1679. # mismatched freqs
  1680. return False
  1681. elif checknull_with_nat(val):
  1682. pass
  1683. else:
  1684. # Not a Period or NaT-like
  1685. return False
  1686. if dtype_code == -10000:
  1687. # we saw all-NaTs, no actual Periods
  1688. return False
  1689. return True
  1690. # Note: only python-exposed for tests
  1691. cpdef bint is_interval_array(ndarray values):
  1692. """
  1693. Is this an ndarray of Interval (or np.nan) with a single dtype?
  1694. """
  1695. cdef:
  1696. Py_ssize_t i, n = len(values)
  1697. str closed = None
  1698. bint numeric = False
  1699. bint dt64 = False
  1700. bint td64 = False
  1701. object val
  1702. if len(values) == 0:
  1703. return False
  1704. for val in values:
  1705. if is_interval(val):
  1706. if closed is None:
  1707. closed = val.closed
  1708. numeric = (
  1709. util.is_float_object(val.left)
  1710. or util.is_integer_object(val.left)
  1711. )
  1712. td64 = is_timedelta(val.left)
  1713. dt64 = PyDateTime_Check(val.left)
  1714. elif val.closed != closed:
  1715. # mismatched closedness
  1716. return False
  1717. elif numeric:
  1718. if not (
  1719. util.is_float_object(val.left)
  1720. or util.is_integer_object(val.left)
  1721. ):
  1722. # i.e. datetime64 or timedelta64
  1723. return False
  1724. elif td64:
  1725. if not is_timedelta(val.left):
  1726. return False
  1727. elif dt64:
  1728. if not PyDateTime_Check(val.left):
  1729. return False
  1730. else:
  1731. raise ValueError(val)
  1732. elif util.is_nan(val) or val is None:
  1733. pass
  1734. else:
  1735. return False
  1736. if closed is None:
  1737. # we saw all-NAs, no actual Intervals
  1738. return False
  1739. return True
@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_numeric(
    ndarray[object] values,
    set na_values,
    bint convert_empty=True,
    bint coerce_numeric=False,
    bint convert_to_masked_nullable=False,
) -> tuple[np.ndarray, np.ndarray | None]:
    """
    Convert object array to a numeric array if possible.

    Parameters
    ----------
    values : ndarray[object]
        Array of object elements to convert.
    na_values : set
        Set of values that should be interpreted as NaN.
    convert_empty : bool, default True
        If an empty array-like object is encountered, whether to interpret
        that element as NaN or not. If set to False, a ValueError will be
        raised if such an element is encountered and 'coerce_numeric' is False.
    coerce_numeric : bool, default False
        If initial attempts to convert to numeric have failed, whether to
        force conversion to numeric via alternative methods or by setting the
        element to NaN. Otherwise, an Exception will be raised when such an
        element is encountered.

        This boolean also has an impact on how conversion behaves when a
        numeric array has no suitable numerical dtype to return (i.e. uint64,
        int32, uint8). If set to False, the original object array will be
        returned. Otherwise, a ValueError will be raised.
    convert_to_masked_nullable : bool, default False
        Whether to return a mask for the converted values. This also disables
        upcasting for ints with nulls to float64.

    Returns
    -------
    np.ndarray
        Array of converted object values to numerical ones. An empty input
        gives an empty int64 array.
    Optional[np.ndarray]
        A boolean mask for the converted values. It is only returned when
        convert_to_masked_nullable is True, a null was seen, and the result
        is the float or integer array; in every other case None is returned.

    Raises
    ------
    ValueError
        "Empty string encountered" for a zero-length element when neither
        convert_empty nor coerce_numeric is set; "Integer out of range." for
        a parsed integer outside [INT64_MIN, UINT64_MAX] when coerce_numeric
        is False.
    TypeError, ValueError
        Re-raised with " at position {i}" appended when parsing an element
        fails and coerce_numeric is False.
    """
    if len(values) == 0:
        return (np.array([], dtype='i8'), None)

    # fastpath for ints - try to convert all based on first value
    cdef:
        object val = values[0]

    if util.is_integer_object(val):
        try:
            maybe_ints = values.astype('i8')
            # only accept the cast when it compares equal element-wise to
            # the original objects
            if (maybe_ints == values).all():
                return (maybe_ints, None)
        except (ValueError, OverflowError, TypeError):
            pass

    # Otherwise, iterate and do full inference.
    cdef:
        int status, maybe_int
        Py_ssize_t i, n = values.size
        Seen seen = Seen(coerce_numeric)
        # One candidate output buffer per possible result dtype; the flags
        # accumulated in `seen` decide at the end which one is returned.
        ndarray[float64_t] floats = np.empty(n, dtype='f8')
        ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
        ndarray[int64_t] ints = np.empty(n, dtype='i8')
        ndarray[uint64_t] uints = np.empty(n, dtype='u8')
        ndarray[uint8_t] bools = np.empty(n, dtype='u1')
        ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
        float64_t fval
        bint allow_null_in_int = convert_to_masked_nullable

    for i in range(n):
        val = values[i]
        # We only want to disable NaNs showing as float if
        # a) convert_to_masked_nullable = True
        # b) no floats have been seen ( assuming an int shows up later )
        # However, if no ints present (all null array), we need to return floats
        allow_null_in_int = convert_to_masked_nullable and not seen.float_

        # unhashable values (``__hash__`` is None) cannot be looked up in the set
        if val.__hash__ is not None and val in na_values:
            if allow_null_in_int:
                seen.null_ = True
                mask[i] = 1
            else:
                if convert_to_masked_nullable:
                    mask[i] = 1
                seen.saw_null()
            floats[i] = complexes[i] = NaN
        elif util.is_float_object(val):
            fval = val
            # NaN is the only float that is not equal to itself
            if fval != fval:
                seen.null_ = True
                if allow_null_in_int:
                    mask[i] = 1
                else:
                    if convert_to_masked_nullable:
                        mask[i] = 1
                    seen.float_ = True
            else:
                seen.float_ = True
            floats[i] = complexes[i] = fval
        elif util.is_integer_object(val):
            floats[i] = complexes[i] = val

            val = int(val)
            seen.saw_int(val)

            if val >= 0:
                if val <= oUINT64_MAX:
                    uints[i] = val
                else:
                    # too large for uint64: fall back to float
                    seen.float_ = True

            if oINT64_MIN <= val <= oINT64_MAX:
                ints[i] = val

            if val < oINT64_MIN or (seen.sint_ and seen.uint_):
                seen.float_ = True

        elif util.is_bool_object(val):
            floats[i] = uints[i] = ints[i] = bools[i] = val
            seen.bool_ = True
        elif val is None or val is C_NA:
            if allow_null_in_int:
                seen.null_ = True
                mask[i] = 1
            else:
                if convert_to_masked_nullable:
                    mask[i] = 1
                seen.saw_null()
            floats[i] = complexes[i] = NaN
        elif hasattr(val, '__len__') and len(val) == 0:
            # zero-length sized object (the error message below calls it an
            # empty string)
            if convert_empty or seen.coerce_numeric:
                seen.saw_null()
                floats[i] = complexes[i] = NaN
            else:
                raise ValueError("Empty string encountered")
        elif util.is_complex_object(val):
            complexes[i] = val
            seen.complex_ = True
        elif is_decimal(val):
            floats[i] = complexes[i] = val
            seen.float_ = True
        else:
            try:
                # NOTE(review): floatify presumably parses `val` into `fval`
                # and flags via `maybe_int` whether it may be an integer --
                # confirm against its definition.
                status = floatify(val, &fval, &maybe_int)

                if fval in na_values:
                    seen.saw_null()
                    floats[i] = complexes[i] = NaN
                    mask[i] = 1
                else:
                    if fval != fval:
                        seen.null_ = True
                        mask[i] = 1

                    floats[i] = fval

                if maybe_int:
                    as_int = int(val)

                    if as_int in na_values:
                        mask[i] = 1
                        seen.null_ = True
                        if not allow_null_in_int:
                            seen.float_ = True
                    else:
                        seen.saw_int(as_int)

                    if as_int not in na_values:
                        if as_int < oINT64_MIN or as_int > oUINT64_MAX:
                            if seen.coerce_numeric:
                                seen.float_ = True
                            else:
                                raise ValueError("Integer out of range.")
                        else:
                            if as_int >= 0:
                                uints[i] = as_int

                            if as_int <= oINT64_MAX:
                                ints[i] = as_int

                    seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
                else:
                    seen.float_ = True
            except (TypeError, ValueError) as err:
                if not seen.coerce_numeric:
                    raise type(err)(f"{err} at position {i}")

                # coercing: treat the unparseable element as missing
                seen.saw_null()
                floats[i] = NaN

    if seen.check_uint64_conflict():
        # hand back the untouched object array
        return (values, None)

    # This occurs since we disabled float nulls showing as null in anticipation
    # of seeing ints that were never seen. So then, we return float
    if allow_null_in_int and seen.null_ and not seen.int_:
        seen.float_ = True

    # Pick the result buffer: complex, then float, then int, then bool
    if seen.complex_:
        return (complexes, None)
    elif seen.float_:
        if seen.null_ and convert_to_masked_nullable:
            return (floats, mask.view(np.bool_))
        return (floats, None)
    elif seen.int_:
        if seen.null_ and convert_to_masked_nullable:
            if seen.uint_:
                return (uints, mask.view(np.bool_))
            else:
                return (ints, mask.view(np.bool_))
        if seen.uint_:
            return (uints, None)
        else:
            return (ints, None)
    elif seen.bool_:
        return (bools.view(np.bool_), None)
    elif seen.uint_:
        return (uints, None)
    return (ints, None)
@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects,
                          *,
                          bint try_float=False,
                          bint safe=False,
                          bint convert_datetime=False,
                          bint convert_timedelta=False,
                          bint convert_period=False,
                          bint convert_interval=False,
                          bint convert_to_nullable_integer=False,
                          object dtype_if_all_nat=None) -> "ArrayLike":
    """
    Type inference function-- convert object array to proper dtype

    Parameters
    ----------
    objects : ndarray[object]
        Array of object elements to convert.
    try_float : bool, default False
        If an array-like object contains only float or NaN values is
        encountered, whether to convert and return an array of float dtype.
    safe : bool, default False
        Whether to upcast numeric type (e.g. int cast to float). If set to
        True, no upcasting will be performed.
    convert_datetime : bool, default False
        If an array-like object contains only datetime values or NaT is
        encountered, whether to convert and return an array of M8[ns] dtype.
    convert_timedelta : bool, default False
        If an array-like object contains only timedelta values or NaT is
        encountered, whether to convert and return an array of m8[ns] dtype.
    convert_period : bool, default False
        If an array-like object contains only (homogeneous-freq) Period values
        or NaT, whether to convert and return a PeriodArray.
    convert_interval : bool, default False
        If an array-like object contains only Interval objects (with matching
        dtypes and closedness) or NaN, whether to convert to IntervalArray.
    convert_to_nullable_integer : bool, default False
        If an array-like object contains only integer values (and NaN) is
        encountered, whether to convert and return an IntegerArray.
    dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
        Dtype to cast to if we have all-NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
        Array of converted object values to more specific dtypes if applicable.
        When no more specific dtype is settled on, the input ``objects``
        array itself is returned.
    """
    cdef:
        Py_ssize_t i, n, itemsize_max = 0
        ndarray[float64_t] floats
        ndarray[complex128_t] complexes
        ndarray[int64_t] ints
        ndarray[uint64_t] uints
        ndarray[uint8_t] bools
        int64_t[:] idatetimes
        int64_t[:] itimedeltas
        Seen seen = Seen()
        object val
        float64_t fval, fnan = np.nan

    n = len(objects)

    # One candidate output buffer per possible result dtype; the flags
    # accumulated in `seen` decide at the end which one (if any) is returned.
    floats = np.empty(n, dtype='f8')
    complexes = np.empty(n, dtype='c16')
    ints = np.empty(n, dtype='i8')
    uints = np.empty(n, dtype='u8')
    bools = np.empty(n, dtype=np.uint8)
    mask = np.full(n, False)

    if convert_datetime:
        datetimes = np.empty(n, dtype='M8[ns]')
        # int64 view sharing memory with `datetimes`
        idatetimes = datetimes.view(np.int64)

    if convert_timedelta:
        timedeltas = np.empty(n, dtype='m8[ns]')
        # int64 view sharing memory with `timedeltas`
        itimedeltas = timedeltas.view(np.int64)

    for i in range(n):
        val = objects[i]
        # Track the largest itemsize reported by get_itemsize; once a -1 has
        # been reported it sticks and no further values are inspected.
        # NOTE(review): -1 presumably means "not a NumPy scalar" -- confirm
        # against get_itemsize.
        if itemsize_max != -1:
            itemsize = get_itemsize(val)
            if itemsize > itemsize_max or itemsize == -1:
                itemsize_max = itemsize

        if val is None:
            seen.null_ = True
            floats[i] = complexes[i] = fnan
            mask[i] = True
        elif val is NaT:
            seen.nat_ = True
            if convert_datetime:
                idatetimes[i] = NPY_NAT
            if convert_timedelta:
                itimedeltas[i] = NPY_NAT
            if not (convert_datetime or convert_timedelta or convert_period):
                # nothing NaT-compatible was requested: stay object
                seen.object_ = True
                break
        elif val is np.nan:
            seen.nan_ = True
            mask[i] = True
            floats[i] = complexes[i] = val
        elif util.is_bool_object(val):
            seen.bool_ = True
            bools[i] = val
        elif util.is_float_object(val):
            floats[i] = complexes[i] = val
            seen.float_ = True
        elif is_timedelta(val):
            if convert_timedelta:
                seen.timedelta_ = True
                try:
                    itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8")
                except OutOfBoundsTimedelta:
                    seen.object_ = True
                    break
                # Stop scanning after the first timedelta; the post-loop
                # `seen.timedelta_` branch validates and converts the whole
                # array.
                break
            else:
                seen.object_ = True
                break
        elif util.is_integer_object(val):
            seen.int_ = True
            floats[i] = <float64_t>val
            complexes[i] = <double complex>val
            # the integer buffers are only filled while no None has been seen
            if not seen.null_:
                val = int(val)
                seen.saw_int(val)

                if ((seen.uint_ and seen.sint_) or
                        val > oUINT64_MAX or val < oINT64_MIN):
                    seen.object_ = True
                    break

                if seen.uint_:
                    uints[i] = val
                elif seen.sint_:
                    ints[i] = val
                else:
                    uints[i] = val
                    ints[i] = val

        elif util.is_complex_object(val):
            complexes[i] = val
            seen.complex_ = True
        elif PyDateTime_Check(val) or util.is_datetime64_object(val):

            # if we have an tz's attached then return the objects
            if convert_datetime:
                if getattr(val, 'tzinfo', None) is not None:
                    # tz-aware: handled after the loop via the
                    # `seen.datetimetz_` branch
                    seen.datetimetz_ = True
                    break
                else:
                    seen.datetime_ = True
                    try:
                        idatetimes[i] = convert_to_tsobject(
                            val, None, None, 0, 0).value
                    except OutOfBoundsDatetime:
                        seen.object_ = True
                        break
            else:
                seen.object_ = True
                break
        elif is_period_object(val):
            if convert_period:
                # handled after the loop via the `seen.period_` branch
                seen.period_ = True
                break
            else:
                seen.object_ = True
                break
        elif try_float and not isinstance(val, str):
            # this will convert Decimal objects
            try:
                floats[i] = float(val)
                complexes[i] = complex(val)
                seen.float_ = True
            except (ValueError, TypeError):
                seen.object_ = True
                break
        elif is_interval(val):
            if convert_interval:
                # handled after the loop via the `seen.interval_` branch
                seen.interval_ = True
                break
            else:
                seen.object_ = True
                break
        else:
            seen.object_ = True
            break

    # we try to coerce datetime w/tz but must all have the same tz
    if seen.datetimetz_:
        if is_datetime_with_singletz_array(objects):
            from pandas import DatetimeIndex
            dti = DatetimeIndex(objects)

            # unbox to DatetimeArray
            return dti._data
        seen.object_ = True

    elif seen.datetime_:
        if is_datetime_or_datetime64_array(objects):
            from pandas import DatetimeIndex

            try:
                dti = DatetimeIndex(objects)
            except OutOfBoundsDatetime:
                pass
            else:
                # unbox to ndarray[datetime64[ns]]
                return dti._data._ndarray
        seen.object_ = True

    elif seen.timedelta_:
        if is_timedelta_or_timedelta64_array(objects):
            from pandas import TimedeltaIndex

            try:
                tdi = TimedeltaIndex(objects)
            except OutOfBoundsTimedelta:
                pass
            else:
                # unbox to ndarray[timedelta64[ns]]
                return tdi._data._ndarray
        seen.object_ = True

    if seen.period_:
        if is_period_array(objects):
            from pandas import PeriodIndex
            pi = PeriodIndex(objects)

            # unbox to PeriodArray
            return pi._data
        seen.object_ = True

    if seen.interval_:
        if is_interval_array(objects):
            from pandas import IntervalIndex
            ii = IntervalIndex(objects)

            # unbox to IntervalArray
            return ii._data

        seen.object_ = True

    if not seen.object_:
        result = None
        if not safe:
            # upcasting allowed (e.g. ints alongside None/NaN become floats)
            if seen.null_ or seen.nan_:
                if seen.is_float_or_complex:
                    if seen.complex_:
                        result = complexes
                    elif seen.float_:
                        result = floats
                    elif seen.int_:
                        if convert_to_nullable_integer:
                            from pandas.core.arrays import IntegerArray
                            result = IntegerArray(ints, mask)
                        else:
                            result = floats
                    elif seen.nan_:
                        result = floats
            else:
                if not seen.bool_:
                    if seen.datetime_:
                        if not seen.numeric_ and not seen.timedelta_:
                            result = datetimes
                    elif seen.timedelta_:
                        if not seen.numeric_:
                            result = timedeltas
                    elif seen.nat_:
                        if not seen.numeric_:
                            if convert_datetime and convert_timedelta:
                                dtype = dtype_if_all_nat
                                if dtype is not None:
                                    # otherwise we keep object dtype
                                    result = _infer_all_nats(
                                        dtype, datetimes, timedeltas
                                    )

                            elif convert_datetime:
                                result = datetimes
                            elif convert_timedelta:
                                result = timedeltas
                    else:
                        if seen.complex_:
                            result = complexes
                        elif seen.float_:
                            result = floats
                        elif seen.int_:
                            if seen.uint_:
                                result = uints
                            else:
                                result = ints
                elif seen.is_bool:
                    result = bools.view(np.bool_)

        else:
            # don't cast int to float, etc.
            if seen.null_:
                if seen.is_float_or_complex:
                    if seen.complex_:
                        if not seen.int_:
                            result = complexes
                    elif seen.float_ or seen.nan_:
                        if not seen.int_:
                            result = floats
            else:
                if not seen.bool_:
                    if seen.datetime_:
                        if not seen.numeric_ and not seen.timedelta_:
                            result = datetimes
                    elif seen.timedelta_:
                        if not seen.numeric_:
                            result = timedeltas
                    elif seen.nat_:
                        if not seen.numeric_:
                            if convert_datetime and convert_timedelta:
                                dtype = dtype_if_all_nat
                                if dtype is not None:
                                    # otherwise we keep object dtype
                                    result = _infer_all_nats(
                                        dtype, datetimes, timedeltas
                                    )

                            elif convert_datetime:
                                result = datetimes
                            elif convert_timedelta:
                                result = timedeltas
                    else:
                        if seen.complex_:
                            if not seen.int_:
                                result = complexes
                        elif seen.float_ or seen.nan_:
                            if not seen.int_:
                                result = floats
                        elif seen.int_:
                            if seen.uint_:
                                result = uints
                            else:
                                result = ints
                elif seen.is_bool and not seen.nan_:
                    result = bools.view(np.bool_)

        if result is uints or result is ints or result is floats or result is complexes:
            # cast to the largest itemsize when all values are NumPy scalars
            if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
                result = result.astype(result.dtype.kind + str(itemsize_max))
            return result
        elif result is not None:
            return result

    # nothing more specific was inferred: hand back the input unchanged
    return objects
  2262. cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas):
  2263. """
  2264. If we have all-NaT values, cast these to the given dtype.
  2265. """
  2266. if isinstance(dtype, np.dtype):
  2267. if dtype == "M8[ns]":
  2268. result = datetimes
  2269. elif dtype == "m8[ns]":
  2270. result = timedeltas
  2271. else:
  2272. raise ValueError(dtype)
  2273. else:
  2274. # ExtensionDtype
  2275. cls = dtype.construct_array_type()
  2276. i8vals = np.empty(len(datetimes), dtype="i8")
  2277. i8vals.fill(NPY_NAT)
  2278. result = cls(i8vals, dtype=dtype)
  2279. return result
  2280. class NoDefault(Enum):
  2281. # We make this an Enum
  2282. # 1) because it round-trips through pickle correctly (see GH#40397)
  2283. # 2) because mypy does not understand singletons
  2284. no_default = "NO_DEFAULT"
  2285. def __repr__(self) -> str:
  2286. return "<no_default>"
  2287. # Note: no_default is exported to the public API in pandas.api.extensions
  2288. no_default = NoDefault.no_default # Sentinel indicating the default value.
  2289. @cython.boundscheck(False)
  2290. @cython.wraparound(False)
  2291. def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
  2292. object na_value=no_default, cnp.dtype dtype=np.dtype(object)
  2293. ) -> np.ndarray:
  2294. """
  2295. Substitute for np.vectorize with pandas-friendly dtype inference.
  2296. Parameters
  2297. ----------
  2298. arr : ndarray
  2299. f : function
  2300. mask : ndarray
  2301. uint8 dtype ndarray indicating values not to apply `f` to.
  2302. convert : bool, default True
  2303. Whether to call `maybe_convert_objects` on the resulting ndarray
  2304. na_value : Any, optional
  2305. The result value to use for masked values. By default, the
  2306. input value is used
  2307. dtype : numpy.dtype
  2308. The numpy dtype to use for the result ndarray.
  2309. Returns
  2310. -------
  2311. np.ndarray
  2312. """
  2313. cdef:
  2314. Py_ssize_t i, n
  2315. ndarray result
  2316. object val
  2317. n = len(arr)
  2318. result = np.empty(n, dtype=dtype)
  2319. for i in range(n):
  2320. if mask[i]:
  2321. if na_value is no_default:
  2322. val = arr[i]
  2323. else:
  2324. val = na_value
  2325. else:
  2326. val = f(arr[i])
  2327. if cnp.PyArray_IsZeroDim(val):
  2328. # unbox 0-dim arrays, GH#690
  2329. val = val.item()
  2330. result[i] = val
  2331. if convert:
  2332. return maybe_convert_objects(result,
  2333. try_float=False,
  2334. convert_datetime=False,
  2335. convert_timedelta=False)
  2336. return result
  2337. @cython.boundscheck(False)
  2338. @cython.wraparound(False)
  2339. def map_infer(
  2340. ndarray arr, object f, bint convert=True, bint ignore_na=False
  2341. ) -> np.ndarray:
  2342. """
  2343. Substitute for np.vectorize with pandas-friendly dtype inference.
  2344. Parameters
  2345. ----------
  2346. arr : ndarray
  2347. f : function
  2348. convert : bint
  2349. ignore_na : bint
  2350. If True, NA values will not have f applied
  2351. Returns
  2352. -------
  2353. np.ndarray
  2354. """
  2355. cdef:
  2356. Py_ssize_t i, n
  2357. ndarray[object] result
  2358. object val
  2359. n = len(arr)
  2360. result = np.empty(n, dtype=object)
  2361. for i in range(n):
  2362. if ignore_na and checknull(arr[i]):
  2363. result[i] = arr[i]
  2364. continue
  2365. val = f(arr[i])
  2366. if cnp.PyArray_IsZeroDim(val):
  2367. # unbox 0-dim arrays, GH#690
  2368. val = val.item()
  2369. result[i] = val
  2370. if convert:
  2371. return maybe_convert_objects(result,
  2372. try_float=False,
  2373. convert_datetime=False,
  2374. convert_timedelta=False)
  2375. return result
  2376. def to_object_array(rows: object, min_width: int = 0) -> ndarray:
  2377. """
  2378. Convert a list of lists into an object array.
  2379. Parameters
  2380. ----------
  2381. rows : 2-d array (N, K)
  2382. List of lists to be converted into an array.
  2383. min_width : int
  2384. Minimum width of the object array. If a list
  2385. in `rows` contains fewer than `width` elements,
  2386. the remaining elements in the corresponding row
  2387. will all be `NaN`.
  2388. Returns
  2389. -------
  2390. np.ndarray[object, ndim=2]
  2391. """
  2392. cdef:
  2393. Py_ssize_t i, j, n, k, tmp
  2394. ndarray[object, ndim=2] result
  2395. list row
  2396. rows = list(rows)
  2397. n = len(rows)
  2398. k = min_width
  2399. for i in range(n):
  2400. tmp = len(rows[i])
  2401. if tmp > k:
  2402. k = tmp
  2403. result = np.empty((n, k), dtype=object)
  2404. for i in range(n):
  2405. row = list(rows[i])
  2406. for j in range(len(row)):
  2407. result[i, j] = row[j]
  2408. return result
  2409. def tuples_to_object_array(ndarray[object] tuples):
  2410. cdef:
  2411. Py_ssize_t i, j, n, k, tmp
  2412. ndarray[object, ndim=2] result
  2413. tuple tup
  2414. n = len(tuples)
  2415. k = len(tuples[0])
  2416. result = np.empty((n, k), dtype=object)
  2417. for i in range(n):
  2418. tup = tuples[i]
  2419. for j in range(k):
  2420. result[i, j] = tup[j]
  2421. return result
  2422. def to_object_array_tuples(rows: object) -> np.ndarray:
  2423. """
  2424. Convert a list of tuples into an object array. Any subclass of
  2425. tuple in `rows` will be casted to tuple.
  2426. Parameters
  2427. ----------
  2428. rows : 2-d array (N, K)
  2429. List of tuples to be converted into an array.
  2430. Returns
  2431. -------
  2432. np.ndarray[object, ndim=2]
  2433. """
  2434. cdef:
  2435. Py_ssize_t i, j, n, k, tmp
  2436. ndarray[object, ndim=2] result
  2437. tuple row
  2438. rows = list(rows)
  2439. n = len(rows)
  2440. k = 0
  2441. for i in range(n):
  2442. tmp = 1 if checknull(rows[i]) else len(rows[i])
  2443. if tmp > k:
  2444. k = tmp
  2445. result = np.empty((n, k), dtype=object)
  2446. try:
  2447. for i in range(n):
  2448. row = rows[i]
  2449. for j in range(len(row)):
  2450. result[i, j] = row[j]
  2451. except TypeError:
  2452. # e.g. "Expected tuple, got list"
  2453. # upcast any subclasses to tuple
  2454. for i in range(n):
  2455. row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
  2456. for j in range(len(row)):
  2457. result[i, j] = row[j]
  2458. return result
  2459. @cython.wraparound(False)
  2460. @cython.boundscheck(False)
  2461. def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
  2462. cdef:
  2463. Py_ssize_t i, n = len(keys)
  2464. object val
  2465. ndarray[object] output = np.empty(n, dtype='O')
  2466. if n == 0:
  2467. # kludge, for Series
  2468. return np.empty(0, dtype='f8')
  2469. for i in range(n):
  2470. val = keys[i]
  2471. if val in mapping:
  2472. output[i] = mapping[val]
  2473. else:
  2474. output[i] = default
  2475. return maybe_convert_objects(output)
  2476. def is_bool_list(obj: list) -> bool:
  2477. """
  2478. Check if this list contains only bool or np.bool_ objects.
  2479. This is appreciably faster than checking `np.array(obj).dtype == bool`
  2480. obj1 = [True, False] * 100
  2481. obj2 = obj1 * 100
  2482. obj3 = obj2 * 100
  2483. obj4 = [True, None] + obj1
  2484. for obj in [obj1, obj2, obj3, obj4]:
  2485. %timeit is_bool_list(obj)
  2486. %timeit np.array(obj).dtype.kind == "b"
  2487. 340 ns ± 8.22 ns
  2488. 8.78 µs ± 253 ns
  2489. 28.8 µs ± 704 ns
  2490. 813 µs ± 17.8 µs
  2491. 3.4 ms ± 168 µs
  2492. 78.4 ms ± 1.05 ms
  2493. 48.1 ns ± 1.26 ns
  2494. 8.1 µs ± 198 ns
  2495. """
  2496. cdef:
  2497. object item
  2498. for item in obj:
  2499. if not util.is_bool_object(item):
  2500. return False
  2501. # Note: we return True for empty list
  2502. return True