Update to_gbq and read_gbq to pandas-gbq 0.5.0 #21628

Merged: 1 commit, Jun 26, 2018
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -19,6 +19,11 @@ Other Enhancements
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)
- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`)
+- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to
+  reflect changes from the `Pandas-GBQ library version 0.5.0
+  <https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
+  (:issue:`21627`)


.. _whatsnew_0240.api_breaking:

59 changes: 32 additions & 27 deletions pandas/core/frame.py
@@ -1102,37 +1102,27 @@ def to_dict(self, orient='dict', into=dict):
else:
raise ValueError("orient '{o}' not understood".format(o=orient))

-def to_gbq(self, destination_table, project_id, chunksize=None,
-           verbose=None, reauth=False, if_exists='fail', private_key=None,
-           auth_local_webserver=False, table_schema=None):
+def to_gbq(self, destination_table, project_id=None, chunksize=None,
+           reauth=False, if_exists='fail', private_key=None,
+           auth_local_webserver=False, table_schema=None, location=None,
+           progress_bar=True, verbose=None):
"""
Write a DataFrame to a Google BigQuery table.

This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.

-Authentication to the Google BigQuery service is via OAuth 2.0.
-
-- If ``private_key`` is provided, the library loads the JSON service
-account credentials and uses those to authenticate.
-
-- If no ``private_key`` is provided, the library tries `application
-default credentials`_.
-
-.. _application default credentials:
-https://cloud.google.com/docs/authentication/production#providing_credentials_to_your_application
-
-- If application default credentials are not found or cannot be used
-with BigQuery, the library authenticates with user account
-credentials. In this case, you will be asked to grant permissions
-for product name 'pandas GBQ'.
+See the `How to authenticate with Google BigQuery
+<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
+guide for authentication instructions.

Parameters
----------
destination_table : str
-Name of table to be written, in the form 'dataset.tablename'.
-project_id : str
-Google BigQuery Account project ID.
+Name of table to be written, in the form ``dataset.tablename``.
[Review comment, Contributor] I think this needs to be single backticks.

[Reply, Author] Two backticks is code font in Sphinx RST, which is what I want.
+project_id : str, optional
+Google BigQuery Account project ID. Optional when available from
+the environment.
chunksize : int, optional
Number of rows to be inserted in each chunk from the dataframe.
Set to ``None`` to load the whole dataframe at once.
@@ -1170,8 +1160,21 @@ def to_gbq(self, destination_table, project_id, chunksize=None,
BigQuery API documentation on available names of a field.

*New in version 0.3.1 of pandas-gbq*.
-verbose : boolean, deprecated
-*Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
+location : str, optional
+Location where the load job should run. See the `BigQuery locations
+documentation
+<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
+list of available locations. The location must match that of the
+target dataset.
+
+*New in version 0.5.0 of pandas-gbq*.
+progress_bar : bool, default True
+Use the library `tqdm` to show the progress bar for the upload,
+chunk by chunk.
+
+*New in version 0.5.0 of pandas-gbq*.
+verbose : bool, deprecated
+Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
to adjust verbosity instead
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

@@ -1182,10 +1185,12 @@ def to_gbq(self, destination_table, project_id, chunksize=None,
"""
from pandas.io import gbq
return gbq.to_gbq(
-self, destination_table, project_id, chunksize=chunksize,
-verbose=verbose, reauth=reauth, if_exists=if_exists,
-private_key=private_key, auth_local_webserver=auth_local_webserver,
-table_schema=table_schema)
+self, destination_table, project_id=project_id,
+chunksize=chunksize, reauth=reauth,
+if_exists=if_exists, private_key=private_key,
+auth_local_webserver=auth_local_webserver,
+table_schema=table_schema, location=location,
+progress_bar=progress_bar, verbose=verbose)

@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
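To make the updated surface concrete, here is a minimal sketch of a call against the new DataFrame.to_gbq signature shown above. It assumes pandas-gbq 0.5.0 is installed and authenticated; the project, dataset, and table names are placeholders:

    import pandas as pd

    df = pd.DataFrame({'my_string': list('abc'), 'my_int64': [1, 2, 3]})

    # project_id is now optional when it can be inferred from the
    # environment; location and progress_bar are new in pandas-gbq 0.5.0.
    df.to_gbq(
        'my_dataset.my_table',    # placeholder destination table
        project_id='my-project',  # placeholder project ID
        if_exists='fail',         # raise if the table already exists
        location='US',            # must match the target dataset's location
        progress_bar=True,        # show a per-chunk tqdm progress bar
    )

With chunksize=None (the default), the whole dataframe is loaded in a single job, per the chunksize documentation above.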
86 changes: 48 additions & 38 deletions pandas/io/gbq.py
@@ -22,34 +22,26 @@ def _try_import():


def read_gbq(query, project_id=None, index_col=None, col_order=None,
-             reauth=False, verbose=None, private_key=None, dialect='legacy',
-             **kwargs):
+             reauth=False, private_key=None, auth_local_webserver=False,
+             dialect='legacy', location=None, configuration=None,
+             verbose=None):
"""
Load data from Google BigQuery.

This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.

-Authentication to the Google BigQuery service is via OAuth 2.0.
-
-- If "private_key" is not provided:
-
-By default "application default credentials" are used.
-
-If default application credentials are not found or are restrictive,
-user account credentials are used. In this case, you will be asked to
-grant permissions for product name 'pandas GBQ'.
-
-- If "private_key" is provided:
-
-Service account credentials will be used to authenticate.
+See the `How to authenticate with Google BigQuery
+<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
+guide for authentication instructions.

Parameters
----------
query : str
SQL-Like Query to return data values.
-project_id : str
-Google BigQuery Account project ID.
+project_id : str, optional
+Google BigQuery Account project ID. Optional when available from
+the environment.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
@@ -62,6 +54,16 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
Service account private key in JSON format. Can be file path
or string contents. This is useful for remote server
authentication (eg. Jupyter/IPython notebook on remote host).
+auth_local_webserver : boolean, default False
+Use the `local webserver flow`_ instead of the `console flow`_
+when getting user credentials.
+
+.. _local webserver flow:
[Review comment, Contributor] I think these break linting and need a # noqa.

[Reply, Author] Adding # noqa breaks the link. Sphinx generates a link like pandas/doc/build/html/generated_single/pandas.read_gbq.html#noqahttp://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server when I add this. I do not get any lint errors when I run `git diff upstream/master -u -- "*.py" | flake8 --diff`, and `scripts/validate_docstrings.py pandas.read_gbq` also passes.
+http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+.. _console flow:
+http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+
+*New in version 0.2.0 of pandas-gbq*.
dialect : str, default 'legacy'
SQL syntax dialect to use. Value can be one of:

@@ -74,19 +76,26 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
-verbose : boolean, deprecated
-*Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
-to adjust verbosity instead
-<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
-kwargs : dict
-Arbitrary keyword arguments.
-configuration (dict): query config parameters for job processing.
+location : str, optional
+Location where the query job should run. See the `BigQuery locations
+documentation
+<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
+list of available locations. The location must match that of any
+datasets used in the query.
+
+*New in version 0.5.0 of pandas-gbq*.
+configuration : dict, optional
+Query config parameters for job processing.
For example:

configuration = {'query': {'useQueryCache': False}}

-For more information see `BigQuery SQL Reference
-<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__
+For more information see `BigQuery REST API Reference
+<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
+verbose : None, deprecated
+Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
+to adjust verbosity instead
+<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

Returns
-------
@@ -100,20 +109,21 @@
"""
pandas_gbq = _try_import()
return pandas_gbq.read_gbq(
-query, project_id=project_id,
-index_col=index_col, col_order=col_order,
-reauth=reauth, verbose=verbose,
-private_key=private_key,
-dialect=dialect,
-**kwargs)
+query, project_id=project_id, index_col=index_col,
+col_order=col_order, reauth=reauth, verbose=verbose,
+private_key=private_key, auth_local_webserver=auth_local_webserver,
+dialect=dialect, location=location, configuration=configuration)


-def to_gbq(dataframe, destination_table, project_id, chunksize=None,
+def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
           verbose=None, reauth=False, if_exists='fail', private_key=None,
-          auth_local_webserver=False, table_schema=None):
+          auth_local_webserver=False, table_schema=None, location=None,
+          progress_bar=True):
pandas_gbq = _try_import()
return pandas_gbq.to_gbq(
-dataframe, destination_table, project_id, chunksize=chunksize,
-verbose=verbose, reauth=reauth, if_exists=if_exists,
-private_key=private_key, auth_local_webserver=auth_local_webserver,
-table_schema=table_schema)
+dataframe, destination_table, project_id=project_id,
+chunksize=chunksize, verbose=verbose, reauth=reauth,
+if_exists=if_exists, private_key=private_key,
+auth_local_webserver=auth_local_webserver,
+table_schema=table_schema, location=location,
+progress_bar=progress_bar)
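Likewise, a minimal sketch of the updated read_gbq call, including the logging-based replacement for the deprecated verbose argument. The query and project ID are placeholders, and the 'pandas_gbq' logger name follows the pandas-gbq logging documentation linked in the docstring:

    import logging

    import pandas as pd

    # Instead of the deprecated verbose=True, raise the pandas_gbq
    # logger's verbosity via the standard logging module.
    logger = logging.getLogger('pandas_gbq')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())

    df = pd.read_gbq(
        'SELECT name, SUM(number) AS total '
        'FROM `my_dataset.my_table` GROUP BY name',  # placeholder query
        project_id='my-project',  # placeholder; optional when available from the environment
        dialect='standard',       # standard SQL instead of the 'legacy' default
        location='US',            # must match the datasets used in the query
        configuration={'query': {'useQueryCache': False}},
    )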
15 changes: 7 additions & 8 deletions pandas/tests/io/test_gbq.py
@@ -2,7 +2,6 @@
from datetime import datetime
import pytz
import platform
-from time import sleep
import os

import numpy as np
@@ -48,16 +47,18 @@ def _in_travis_environment():
def _get_project_id():
if _in_travis_environment():
return os.environ.get('GBQ_PROJECT_ID')
-else:
-return PROJECT_ID
+return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID')


def _get_private_key_path():
if _in_travis_environment():
return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci',
'travis_gbq.json'])
-else:
-return PRIVATE_KEY_JSON_PATH
+
+private_key_path = PRIVATE_KEY_JSON_PATH
+if not private_key_path:
+private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS')
+return private_key_path


def clean_gbq_environment(private_key=None):
@@ -123,11 +124,9 @@ def test_roundtrip(self):
test_size = 20001
df = make_mixed_dataframe_v2(test_size)

-df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
+df.to_gbq(destination_table, _get_project_id(), chunksize=None,
private_key=_get_private_key_path())

-sleep(30)  # <- Curses Google!!!

result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
.format(destination_table),
project_id=_get_project_id(),