  1. """
  2. Read SAS7BDAT files
  3. Based on code written by Jared Hobbs:
  4. https://bitbucket.org/jaredhobbs/sas7bdat
  5. See also:
  6. https://github.com/BioStatMatt/sas7bdat
  7. Partial documentation of the file format:
  8. https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
  9. Reference for binary data compression:
  10. http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
  11. """
from __future__ import annotations

from collections import abc
from datetime import (
    datetime,
    timedelta,
)
import struct
from typing import (
    IO,
    Any,
    cast,
)

import numpy as np

from pandas.errors import (
    EmptyDataError,
    OutOfBoundsDatetime,
)

import pandas as pd
from pandas import (
    DataFrame,
    isna,
)

from pandas.io.common import get_handle
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const
from pandas.io.sas.sasreader import ReaderBase


def _parse_datetime(sas_datetime: float, unit: str):
    if isna(sas_datetime):
        return pd.NaT

    if unit == "s":
        return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)

    elif unit == "d":
        return datetime(1960, 1, 1) + timedelta(days=sas_datetime)

    else:
        raise ValueError("unit must be 'd' or 's'")


def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
    """
    Convert to Timestamp if possible, otherwise to datetime.datetime.

    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
        Dates or datetimes in SAS
    unit : {str}
        "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
        Series of datetime64 dtype or datetime.datetime.
    """
    try:
        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
    except OutOfBoundsDatetime:
        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
        s_series = cast(pd.Series, s_series)
        return s_series
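
# Illustrative check of the SAS epoch handling above (example values chosen
# here, not taken from the original source):
#
#     >>> _parse_datetime(0.0, "d")
#     datetime.datetime(1960, 1, 1, 0, 0)
#     >>> _parse_datetime(86400.0, "s")
#     datetime.datetime(1960, 1, 2, 0, 0)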


class _SubheaderPointer:
    offset: int
    length: int
    compression: int
    ptype: int

    def __init__(self, offset: int, length: int, compression: int, ptype: int):
        self.offset = offset
        self.length = length
        self.compression = compression
        self.ptype = ptype


class _Column:
    col_id: int
    name: str | bytes
    label: str | bytes
    format: str | bytes  # TODO: i think allowing bytes is from py2 days
    ctype: bytes
    length: int

    def __init__(
        self,
        col_id: int,
        name: str | bytes,
        label: str | bytes,
        format: str | bytes,
        ctype: bytes,
        length: int,
    ):
        self.col_id = col_id
        self.name = name
        self.label = label
        self.format = format
        self.ctype = ctype
        self.length = length


# SAS7BDATReader reads a SAS data file in SAS7BDAT format.
class SAS7BDATReader(ReaderBase, abc.Iterator):
    """
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values. Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iteration; returns chunks
        with the given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, is left as raw
        bytes.
    """
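
    # Illustrative direct-use sketch ("example.sas7bdat" is a hypothetical
    # path).  Iterating the reader yields DataFrame chunks of ``chunksize``
    # rows:
    #
    #     rdr = SAS7BDATReader("example.sas7bdat", chunksize=100)
    #     for chunk in rdr:
    #         ...  # each chunk is a DataFrame with up to 100 rows
    #     rdr.close()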

    _int_length: int
    _cached_page: bytes | None

    def __init__(
        self,
        path_or_buf,
        index=None,
        convert_dates=True,
        blank_missing=True,
        chunksize=None,
        encoding=None,
        convert_text=True,
        convert_header_text=True,
    ):
        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = b""
        self.column_names_strings = []
        self.column_names = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._column_types = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0

        self.handles = get_handle(path_or_buf, "rb", is_text=False)

        self._path_or_buf = cast(IO[Any], self.handles.handle)

        try:
            self._get_properties()
            self._parse_metadata()
        except Exception:
            self.close()
            raise

    def column_data_lengths(self) -> np.ndarray:
        """Return a numpy int64 array of the column data lengths"""
        return np.asarray(self._column_data_lengths, dtype=np.int64)

    def column_data_offsets(self) -> np.ndarray:
        """Return a numpy int64 array of the column offsets"""
        return np.asarray(self._column_data_offsets, dtype=np.int64)

    def column_types(self) -> np.ndarray:
        """
        Returns a numpy character array of the column types:
        s (string) or d (double)
        """
        return np.asarray(self._column_types, dtype=np.dtype("S1"))

    def close(self) -> None:
        self.handles.close()

    def _get_properties(self) -> None:

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = cast(bytes, self._path_or_buf.read(288))
        if self._cached_page[0 : len(const.magic)] != const.magic:
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        align1, align2 = 0, 0
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            align2 = const.align_2_value
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        total_align = align1 + align2

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset, const.endianness_length)
        if buf == b"\x01":
            self.byte_order = "<"
        else:
            self.byte_order = ">"

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.file_encoding = const.encoding_names[buf]
        else:
            self.file_encoding = f"unknown (code={buf})"

        # Get platform information
        buf = self._read_bytes(const.platform_offset, const.platform_length)
        if buf == b"1":
            self.platform = "unix"
        elif buf == b"2":
            self.platform = "windows"
        else:
            self.platform = "unknown"

        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
        self.name = buf.rstrip(b"\x00 ")
        if self.convert_header_text:
            self.name = self.name.decode(self.encoding or self.default_encoding)

        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
        self.file_type = buf.rstrip(b"\x00 ")
        if self.convert_header_text:
            self.file_type = self.file_type.decode(
                self.encoding or self.default_encoding
            )

        # Timestamp is epoch 01/01/1960
        epoch = datetime(1960, 1, 1)
        x = self._read_float(
            const.date_created_offset + align1, const.date_created_length
        )
        self.date_created = epoch + pd.to_timedelta(x, unit="s")
        x = self._read_float(
            const.date_modified_offset + align1, const.date_modified_length
        )
        self.date_modified = epoch + pd.to_timedelta(x, unit="s")

        self.header_length = self._read_int(
            const.header_size_offset + align1, const.header_size_length
        )

        # Read the rest of the header into cached_page.
        buf = cast(bytes, self._path_or_buf.read(self.header_length - 288))
        self._cached_page += buf
        # error: Argument 1 to "len" has incompatible type "Optional[bytes]";
        # expected "Sized"
        if len(self._cached_page) != self.header_length:  # type: ignore[arg-type]
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(
            const.page_size_offset + align1, const.page_size_length
        )
        self._page_count = self._read_int(
            const.page_count_offset + align1, const.page_count_length
        )

        buf = self._read_bytes(
            const.sas_release_offset + total_align, const.sas_release_length
        )
        self.sas_release = buf.rstrip(b"\x00 ")
        if self.convert_header_text:
            self.sas_release = self.sas_release.decode(
                self.encoding or self.default_encoding
            )

        buf = self._read_bytes(
            const.sas_server_type_offset + total_align, const.sas_server_type_length
        )
        self.server_type = buf.rstrip(b"\x00 ")
        if self.convert_header_text:
            self.server_type = self.server_type.decode(
                self.encoding or self.default_encoding
            )

        buf = self._read_bytes(
            const.os_version_number_offset + total_align, const.os_version_number_length
        )
        self.os_version = buf.rstrip(b"\x00 ")
        if self.convert_header_text:
            self.os_version = self.os_version.decode(
                self.encoding or self.default_encoding
            )

        buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
        buf = buf.rstrip(b"\x00 ")
        if len(buf) > 0:
            self.os_name = buf.decode(self.encoding or self.default_encoding)
        else:
            buf = self._read_bytes(
                const.os_maker_offset + total_align, const.os_maker_length
            )
            self.os_name = buf.rstrip(b"\x00 ")
            if self.convert_header_text:
                self.os_name = self.os_name.decode(
                    self.encoding or self.default_encoding
                )

    def __next__(self):
        da = self.read(nrows=self.chunksize or 1)
        if da is None:
            self.close()
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset: int, width: int):
        if width not in (4, 8):
            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
        return struct.unpack(self.byte_order + fd, buf)[0]

    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset: int, width: int) -> int:
        if width not in (1, 2, 4, 8):
            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
        iv = struct.unpack(self.byte_order + it, buf)[0]
        return iv
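
    # Illustrative example (values chosen here, not from the original source):
    # with little-endian byte order and width 4, the format string is "<l", so
    #     struct.unpack("<l", b"\x2a\x00\x00\x00")[0] == 42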

    def _read_bytes(self, offset: int, length: int):
        if self._cached_page is None:
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
                self.close()
                msg = f"Unable to read {length:d} bytes from file position {offset:d}."
                raise ValueError(msg)
            return buf
        else:
            if offset + length > len(self._cached_page):
                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset : offset + length]

    def _parse_metadata(self) -> None:
        done = False
        while not done:
            self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                raise ValueError("Failed to read a meta data page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self) -> bool:
        self._read_page_header()
        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
        if self._current_page_type in pt:
            self._process_page_metadata()
        is_data_page = self._current_page_type & const.page_data_type
        is_mix_page = self._current_page_type in const.page_mix_types
        return bool(
            is_data_page
            or is_mix_page
            or self._current_page_data_subheader_pointers != []
        )

    def _read_page_header(self):
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = self._read_int(tx, const.page_type_length)
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_int(tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = self._read_int(
            tx, const.subheader_count_length
        )

    def _process_page_metadata(self) -> None:
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            pointer = self._process_subheader_pointers(
                const.subheader_pointers_offset + bit_offset, i
            )
            if pointer.length == 0:
                continue
            if pointer.compression == const.truncated_subheader_id:
                continue
            subheader_signature = self._read_subheader_signature(pointer.offset)
            subheader_index = self._get_subheader_index(
                subheader_signature, pointer.compression, pointer.ptype
            )
            self._process_subheader(subheader_index, pointer)

    def _get_subheader_index(self, signature: bytes, compression, ptype) -> int:
        # TODO: return here could be made an enum
        index = const.subheader_signature_to_index.get(signature)
        if index is None:
            f1 = (compression == const.compressed_subheader_id) or (compression == 0)
            f2 = ptype == const.compressed_subheader_type
            if (self.compression != b"") and f1 and f2:
                index = const.SASIndex.data_subheader_index
            else:
                self.close()
                raise ValueError("Unknown subheader signature")
        return index

    def _process_subheader_pointers(
        self, offset: int, subheader_pointer_index: int
    ) -> _SubheaderPointer:

        subheader_pointer_length = self._subheader_pointer_length
        total_offset = offset + subheader_pointer_length * subheader_pointer_index

        subheader_offset = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_int(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_int(total_offset, 1)

        x = _SubheaderPointer(
            subheader_offset, subheader_length, subheader_compression, subheader_type
        )

        return x

    def _read_subheader_signature(self, offset: int) -> bytes:
        subheader_signature = self._read_bytes(offset, self._int_length)
        return subheader_signature

    def _process_subheader(
        self, subheader_index: int, pointer: _SubheaderPointer
    ) -> None:
        offset = pointer.offset
        length = pointer.length

        if subheader_index == const.SASIndex.row_size_index:
            processor = self._process_rowsize_subheader
        elif subheader_index == const.SASIndex.column_size_index:
            processor = self._process_columnsize_subheader
        elif subheader_index == const.SASIndex.column_text_index:
            processor = self._process_columntext_subheader
        elif subheader_index == const.SASIndex.column_name_index:
            processor = self._process_columnname_subheader
        elif subheader_index == const.SASIndex.column_attributes_index:
            processor = self._process_columnattributes_subheader
        elif subheader_index == const.SASIndex.format_and_label_index:
            processor = self._process_format_subheader
        elif subheader_index == const.SASIndex.column_list_index:
            processor = self._process_columnlist_subheader
        elif subheader_index == const.SASIndex.subheader_counts_index:
            processor = self._process_subheader_counts
        elif subheader_index == const.SASIndex.data_subheader_index:
            self._current_page_data_subheader_pointers.append(pointer)
            return
        else:
            raise ValueError("unknown subheader index")

        processor(offset, length)

    def _process_rowsize_subheader(self, offset: int, length: int) -> None:

        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_int(
            offset + const.row_length_offset_multiplier * int_len, int_len
        )
        self.row_count = self._read_int(
            offset + const.row_count_offset_multiplier * int_len, int_len
        )
        self.col_count_p1 = self._read_int(
            offset + const.col_count_p1_multiplier * int_len, int_len
        )
        self.col_count_p2 = self._read_int(
            offset + const.col_count_p2_multiplier * int_len, int_len
        )
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_int(offset + mx, int_len)
        self._lcs = self._read_int(lcs_offset, 2)
        self._lcp = self._read_int(lcp_offset, 2)

    def _process_columnsize_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_int(offset, int_len)
        if self.col_count_p1 + self.col_count_p2 != self.column_count:
            print(
                f"Warning: column count mismatch ({self.col_count_p1} + "
                f"{self.col_count_p2} != {self.column_count})\n"
            )

    # Unknown purpose
    def _process_subheader_counts(self, offset: int, length: int) -> None:
        pass

    def _process_columntext_subheader(self, offset: int, length: int) -> None:

        offset += self._int_length
        text_block_size = self._read_int(offset, const.text_block_size_length)

        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        cname = cname_raw
        if self.convert_header_text:
            cname = cname.decode(self.encoding or self.default_encoding)
        self.column_names_strings.append(cname)
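
        # The first text block also contains the compression literal (one of
        # const.compression_literals), which identifies whether and how the
        # file is compressed.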
        if len(self.column_names_strings) == 1:
            compression_literal = b""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == b"":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0 : self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0 : self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0 : self._lcp]

            if self.convert_header_text:
                if hasattr(self, "creator_proc"):
                    self.creator_proc = self.creator_proc.decode(
                        self.encoding or self.default_encoding
                    )

    def _process_columnname_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_text_subheader_offset
            )
            col_name_offset = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_offset_offset
            )
            col_name_length = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_length_offset
            )

            idx = self._read_int(
                text_subheader, const.column_name_text_subheader_length
            )
            col_offset = self._read_int(
                col_name_offset, const.column_name_offset_length
            )
            col_len = self._read_int(col_name_length, const.column_name_length_length)

            name_str = self.column_names_strings[idx]
            self.column_names.append(name_str[col_offset : col_offset + col_len])

    def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
        for i in range(column_attributes_vectors_count):

            col_data_offset = (
                offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
            )
            col_data_len = (
                offset
                + 2 * int_len
                + const.column_data_length_offset
                + i * (int_len + 8)
            )
            col_types = (
                offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
            )

            x = self._read_int(col_data_offset, int_len)
            self._column_data_offsets.append(x)

            x = self._read_int(col_data_len, const.column_data_length_length)
            self._column_data_lengths.append(x)

            x = self._read_int(col_types, const.column_type_length)
            self._column_types.append(b"d" if x == 1 else b"s")

    def _process_columnlist_subheader(self, offset: int, length: int) -> None:
        # unknown purpose
        pass

    def _process_format_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        text_subheader_format = (
            offset + const.column_format_text_subheader_index_offset + 3 * int_len
        )
        col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
        col_format_len = offset + const.column_format_length_offset + 3 * int_len
        text_subheader_label = (
            offset + const.column_label_text_subheader_index_offset + 3 * int_len
        )
        col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_int(
            text_subheader_format, const.column_format_text_subheader_index_length
        )
        format_idx = min(x, len(self.column_names_strings) - 1)

        format_start = self._read_int(
            col_format_offset, const.column_format_offset_length
        )
        format_len = self._read_int(col_format_len, const.column_format_length_length)

        label_idx = self._read_int(
            text_subheader_label, const.column_label_text_subheader_index_length
        )
        label_idx = min(label_idx, len(self.column_names_strings) - 1)

        label_start = self._read_int(col_label_offset, const.column_label_offset_length)
        label_len = self._read_int(col_label_len, const.column_label_length_length)

        label_names = self.column_names_strings[label_idx]
        column_label = label_names[label_start : label_start + label_len]
        format_names = self.column_names_strings[format_idx]
        column_format = format_names[format_start : format_start + format_len]
        current_column_number = len(self.columns)

        col = _Column(
            current_column_number,
            self.column_names[current_column_number],
            column_label,
            column_format,
            self._column_types[current_column_number],
            self._column_data_lengths[current_column_number],
        )

        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows: int | None = None) -> DataFrame | None:

        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if self._current_row_in_file_index >= self.row_count:
            return None

        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        nd = self._column_types.count(b"d")
        ns = self._column_types.count(b"s")

        self._string_chunk = np.empty((ns, nrows), dtype=object)
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = (
                "failed to read complete page from file (read "
                f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
            )
            raise ValueError(msg)

        self._read_page_header()
        page_type = self._current_page_type
        if page_type == const.page_meta_type:
            self._process_page_metadata()

        is_data_page = page_type & const.page_data_type
        pt = [const.page_meta_type] + const.page_mix_types
        if not is_data_page and self._current_page_type not in pt:
            return self._read_next_page()

        return False

    def _chunk_to_dataframe(self) -> DataFrame:

        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = DataFrame(index=ix)

        js, jb = 0, 0
        for j in range(self.column_count):

            name = self.column_names[j]

            if self._column_types[j] == b"d":
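                # Reinterpret the packed bytes gathered by the Cython parser
                # as float64 values in the file's byte order.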
                rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
                rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                if self.convert_dates:
                    if self.column_formats[j] in const.sas_date_formats:
                        rslt[name] = _convert_datetimes(rslt[name], "d")
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        rslt[name] = _convert_datetimes(rslt[name], "s")
                jb += 1
            elif self._column_types[j] == b"s":
                rslt[name] = self._string_chunk[js, :]
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = rslt[name].str.decode(
                        self.encoding or self.default_encoding
                    )
                if self.blank_missing:
                    ii = rslt[name].str.len() == 0
                    rslt.loc[ii, name] = np.nan
                js += 1
            else:
                self.close()
                raise ValueError(f"unknown column type {self._column_types[j]}")

        return rslt