# gbq.py
  1. """ Google BigQuery support """
  2. from __future__ import annotations
  3. from typing import (
  4. TYPE_CHECKING,
  5. Any,
  6. )
  7. from pandas.compat._optional import import_optional_dependency
  8. if TYPE_CHECKING:
  9. from pandas import DataFrame
  10. def _try_import():
  11. # since pandas is a dependency of pandas-gbq
  12. # we need to import on first use
  13. msg = (
  14. "pandas-gbq is required to load data from Google BigQuery. "
  15. "See the docs: https://pandas-gbq.readthedocs.io."
  16. )
  17. pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
  18. return pandas_gbq
  19. def read_gbq(
  20. query: str,
  21. project_id: str | None = None,
  22. index_col: str | None = None,
  23. col_order: list[str] | None = None,
  24. reauth: bool = False,
  25. auth_local_webserver: bool = False,
  26. dialect: str | None = None,
  27. location: str | None = None,
  28. configuration: dict[str, Any] | None = None,
  29. credentials=None,
  30. use_bqstorage_api: bool | None = None,
  31. max_results: int | None = None,
  32. progress_bar_type: str | None = None,
  33. ) -> DataFrame:
  34. """
  35. Load data from Google BigQuery.
  36. This function requires the `pandas-gbq package
  37. <https://pandas-gbq.readthedocs.io>`__.
  38. See the `How to authenticate with Google BigQuery
  39. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  40. guide for authentication instructions.
  41. Parameters
  42. ----------
  43. query : str
  44. SQL-Like Query to return data values.
  45. project_id : str, optional
  46. Google BigQuery Account project ID. Optional when available from
  47. the environment.
  48. index_col : str, optional
  49. Name of result column to use for index in results DataFrame.
  50. col_order : list(str), optional
  51. List of BigQuery column names in the desired order for results
  52. DataFrame.
  53. reauth : bool, default False
  54. Force Google BigQuery to re-authenticate the user. This is useful
  55. if multiple accounts are used.
  56. auth_local_webserver : bool, default False
  57. Use the `local webserver flow`_ instead of the `console flow`_
  58. when getting user credentials.
  59. .. _local webserver flow:
  60. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  61. .. _console flow:
  62. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  63. *New in version 0.2.0 of pandas-gbq*.
  64. dialect : str, default 'legacy'
  65. Note: The default value is changing to 'standard' in a future version.
  66. SQL syntax dialect to use. Value can be one of:
  67. ``'legacy'``
  68. Use BigQuery's legacy SQL dialect. For more information see
  69. `BigQuery Legacy SQL Reference
  70. <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
  71. ``'standard'``
  72. Use BigQuery's standard SQL, which is
  73. compliant with the SQL 2011 standard. For more information
  74. see `BigQuery Standard SQL Reference
  75. <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
  76. location : str, optional
  77. Location where the query job should run. See the `BigQuery locations
  78. documentation
  79. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  80. list of available locations. The location must match that of any
  81. datasets used in the query.
  82. *New in version 0.5.0 of pandas-gbq*.
  83. configuration : dict, optional
  84. Query config parameters for job processing.
  85. For example:
  86. configuration = {'query': {'useQueryCache': False}}
  87. For more information see `BigQuery REST API Reference
  88. <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
  89. credentials : google.auth.credentials.Credentials, optional
  90. Credentials for accessing Google APIs. Use this parameter to override
  91. default credentials, such as to use Compute Engine
  92. :class:`google.auth.compute_engine.Credentials` or Service Account
  93. :class:`google.oauth2.service_account.Credentials` directly.
  94. *New in version 0.8.0 of pandas-gbq*.
  95. use_bqstorage_api : bool, default False
  96. Use the `BigQuery Storage API
  97. <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
  98. download query results quickly, but at an increased cost. To use this
  99. API, first `enable it in the Cloud Console
  100. <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
  101. You must also have the `bigquery.readsessions.create
  102. <https://cloud.google.com/bigquery/docs/access-control#roles>`__
  103. permission on the project you are billing queries to.
  104. This feature requires version 0.10.0 or later of the ``pandas-gbq``
  105. package. It also requires the ``google-cloud-bigquery-storage`` and
  106. ``fastavro`` packages.
  107. .. versionadded:: 0.25.0
  108. max_results : int, optional
  109. If set, limit the maximum number of rows to fetch from the query
  110. results.
  111. *New in version 0.12.0 of pandas-gbq*.
  112. .. versionadded:: 1.1.0
  113. progress_bar_type : Optional, str
  114. If set, use the `tqdm <https://tqdm.github.io/>`__ library to
  115. display a progress bar while the data downloads. Install the
  116. ``tqdm`` package to use this feature.
  117. Possible values of ``progress_bar_type`` include:
  118. ``None``
  119. No progress bar.
  120. ``'tqdm'``
  121. Use the :func:`tqdm.tqdm` function to print a progress bar
  122. to :data:`sys.stderr`.
  123. ``'tqdm_notebook'``
  124. Use the :func:`tqdm.tqdm_notebook` function to display a
  125. progress bar as a Jupyter notebook widget.
  126. ``'tqdm_gui'``
  127. Use the :func:`tqdm.tqdm_gui` function to display a
  128. progress bar as a graphical dialog box.
  129. Note that this feature requires version 0.12.0 or later of the
  130. ``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly
  131. different than ``pandas-gbq``, here the default is ``None``.
  132. .. versionadded:: 1.0.0
  133. Returns
  134. -------
  135. df: DataFrame
  136. DataFrame representing results of query.
  137. See Also
  138. --------
  139. pandas_gbq.read_gbq : This function in the pandas-gbq library.
  140. DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
  141. """
  142. pandas_gbq = _try_import()
  143. kwargs: dict[str, str | bool | int | None] = {}
  144. # START: new kwargs. Don't populate unless explicitly set.
  145. if use_bqstorage_api is not None:
  146. kwargs["use_bqstorage_api"] = use_bqstorage_api
  147. if max_results is not None:
  148. kwargs["max_results"] = max_results
  149. kwargs["progress_bar_type"] = progress_bar_type
  150. # END: new kwargs
  151. return pandas_gbq.read_gbq(
  152. query,
  153. project_id=project_id,
  154. index_col=index_col,
  155. col_order=col_order,
  156. reauth=reauth,
  157. auth_local_webserver=auth_local_webserver,
  158. dialect=dialect,
  159. location=location,
  160. configuration=configuration,
  161. credentials=credentials,
  162. **kwargs,
  163. )
  164. def to_gbq(
  165. dataframe: DataFrame,
  166. destination_table: str,
  167. project_id: str | None = None,
  168. chunksize: int | None = None,
  169. reauth: bool = False,
  170. if_exists: str = "fail",
  171. auth_local_webserver: bool = False,
  172. table_schema: list[dict[str, str]] | None = None,
  173. location: str | None = None,
  174. progress_bar: bool = True,
  175. credentials=None,
  176. ) -> None:
  177. pandas_gbq = _try_import()
  178. pandas_gbq.to_gbq(
  179. dataframe,
  180. destination_table,
  181. project_id=project_id,
  182. chunksize=chunksize,
  183. reauth=reauth,
  184. if_exists=if_exists,
  185. auth_local_webserver=auth_local_webserver,
  186. table_schema=table_schema,
  187. location=location,
  188. progress_bar=progress_bar,
  189. credentials=credentials,
  190. )