# sasreader.py (4.3 KB)
  1. """
  2. Read SAS sas7bdat or xport files.
  3. """
  4. from __future__ import annotations
  5. from abc import (
  6. ABCMeta,
  7. abstractmethod,
  8. )
  9. from typing import (
  10. TYPE_CHECKING,
  11. Hashable,
  12. overload,
  13. )
  14. from pandas._typing import FilePathOrBuffer
  15. from pandas.io.common import stringify_path
  16. if TYPE_CHECKING:
  17. from pandas import DataFrame
  18. # TODO(PY38): replace with Protocol in Python 3.8
  19. class ReaderBase(metaclass=ABCMeta):
  20. """
  21. Protocol for XportReader and SAS7BDATReader classes.
  22. """
  23. @abstractmethod
  24. def read(self, nrows=None):
  25. pass
  26. @abstractmethod
  27. def close(self):
  28. pass
  29. def __enter__(self):
  30. return self
  31. def __exit__(self, exc_type, exc_value, traceback):
  32. self.close()
  33. @overload
  34. def read_sas(
  35. filepath_or_buffer: FilePathOrBuffer,
  36. format: str | None = ...,
  37. index: Hashable | None = ...,
  38. encoding: str | None = ...,
  39. chunksize: int = ...,
  40. iterator: bool = ...,
  41. ) -> ReaderBase:
  42. ...
  43. @overload
  44. def read_sas(
  45. filepath_or_buffer: FilePathOrBuffer,
  46. format: str | None = ...,
  47. index: Hashable | None = ...,
  48. encoding: str | None = ...,
  49. chunksize: None = ...,
  50. iterator: bool = ...,
  51. ) -> DataFrame | ReaderBase:
  52. ...
  53. def read_sas(
  54. filepath_or_buffer: FilePathOrBuffer,
  55. format: str | None = None,
  56. index: Hashable | None = None,
  57. encoding: str | None = None,
  58. chunksize: int | None = None,
  59. iterator: bool = False,
  60. ) -> DataFrame | ReaderBase:
  61. """
  62. Read SAS files stored as either XPORT or SAS7BDAT format files.
  63. Parameters
  64. ----------
  65. filepath_or_buffer : str, path object or file-like object
  66. Any valid string path is acceptable. The string could be a URL. Valid
  67. URL schemes include http, ftp, s3, and file. For file URLs, a host is
  68. expected. A local file could be:
  69. ``file://localhost/path/to/table.sas``.
  70. If you want to pass in a path object, pandas accepts any
  71. ``os.PathLike``.
  72. By file-like object, we refer to objects with a ``read()`` method,
  73. such as a file handle (e.g. via builtin ``open`` function)
  74. or ``StringIO``.
  75. format : str {'xport', 'sas7bdat'} or None
  76. If None, file format is inferred from file extension. If 'xport' or
  77. 'sas7bdat', uses the corresponding format.
  78. index : identifier of index column, defaults to None
  79. Identifier of column that should be used as index of the DataFrame.
  80. encoding : str, default is None
  81. Encoding for text data. If None, text data are stored as raw bytes.
  82. chunksize : int
  83. Read file `chunksize` lines at a time, returns iterator.
  84. .. versionchanged:: 1.2
  85. ``TextFileReader`` is a context manager.
  86. iterator : bool, defaults to False
  87. If True, returns an iterator for reading the file incrementally.
  88. .. versionchanged:: 1.2
  89. ``TextFileReader`` is a context manager.
  90. Returns
  91. -------
  92. DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
  93. or XportReader
  94. """
  95. if format is None:
  96. buffer_error_msg = (
  97. "If this is a buffer object rather "
  98. "than a string name, you must specify a format string"
  99. )
  100. filepath_or_buffer = stringify_path(filepath_or_buffer)
  101. if not isinstance(filepath_or_buffer, str):
  102. raise ValueError(buffer_error_msg)
  103. fname = filepath_or_buffer.lower()
  104. if fname.endswith(".xpt"):
  105. format = "xport"
  106. elif fname.endswith(".sas7bdat"):
  107. format = "sas7bdat"
  108. else:
  109. raise ValueError("unable to infer format of SAS file")
  110. reader: ReaderBase
  111. if format.lower() == "xport":
  112. from pandas.io.sas.sas_xport import XportReader
  113. reader = XportReader(
  114. filepath_or_buffer,
  115. index=index,
  116. encoding=encoding,
  117. chunksize=chunksize,
  118. )
  119. elif format.lower() == "sas7bdat":
  120. from pandas.io.sas.sas7bdat import SAS7BDATReader
  121. reader = SAS7BDATReader(
  122. filepath_or_buffer,
  123. index=index,
  124. encoding=encoding,
  125. chunksize=chunksize,
  126. )
  127. else:
  128. raise ValueError("unknown SAS format")
  129. if iterator or chunksize:
  130. return reader
  131. with reader:
  132. return reader.read()