Skip to content

Commit

Permalink
fix!: to_gbq uploads ArrowDtype(pa.timestamp(...) without timezon…
Browse files Browse the repository at this point in the history
…e as `DATETIME` type (#832)

* fix!: `to_gbq` uploads `ArrowDtype(pa.timestamp(...))` without timezone as `DATETIME` type

Release-As: 0.25.0

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
tswast and gcf-owl-bot[bot] authored Dec 11, 2024
1 parent 78aa01e commit 2104b71
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
9 changes: 9 additions & 0 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@


def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
# Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
# a special case to disambiguate them. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/450
if pyarrow.types.is_timestamp(type_):
if type_.tz is None:
return schema.SchemaField(name, "DATETIME")
else:
return schema.SchemaField(name, "TIMESTAMP")

detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None)
if detected_type is not None:
return schema.SchemaField(name, detected_type)
Expand Down
36 changes: 34 additions & 2 deletions tests/unit/schema/test_pyarrow_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,53 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

from google.cloud import bigquery
import pyarrow
import pytest

from pandas_gbq.schema import pyarrow_to_bigquery


@pytest.mark.parametrize(
    ("pyarrow_type", "bigquery_type"),
    [
        # Every integer width, signed or unsigned, is expected to come back
        # as BigQuery INT64 -- spelled INTEGER here because SchemaField uses
        # the legacy SQL names. See:
        # https://github.com/googleapis/python-bigquery-pandas/issues/616
        (pyarrow.int8(), "INTEGER"),
        (pyarrow.int16(), "INTEGER"),
        (pyarrow.int32(), "INTEGER"),
        (pyarrow.int64(), "INTEGER"),
        (pyarrow.uint8(), "INTEGER"),
        (pyarrow.uint16(), "INTEGER"),
        (pyarrow.uint32(), "INTEGER"),
        (pyarrow.uint64(), "INTEGER"),
        # A timestamp without a timezone is treated as a naive
        # (timezone-less) DATETIME; one with a timezone is a TIMESTAMP. See:
        # https://github.com/googleapis/python-bigquery-pandas/issues/450
        (pyarrow.timestamp("ns"), "DATETIME"),
        (pyarrow.timestamp("ns", tz="UTC"), "TIMESTAMP"),
    ],
)
def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type):
    """Check the name and BigQuery type of the field built for a scalar Arrow type."""
    result: bigquery.SchemaField = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "test_name", pyarrow_type
    )
    assert result.name == "test_name"
    assert result.field_type == bigquery_type


def test_arrow_type_to_bigquery_field_unknown():
    """Check that no field is produced for the Arrow null type."""
    # Default types should be picked at a higher layer.
    result = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "test_name", pyarrow.null()
    )
    assert result is None


def test_arrow_type_to_bigquery_field_list_of_unknown():
# Default types should be picked at a higher layer.
assert (
pyarrow_to_bigquery.arrow_type_to_bigquery_field(
"test_name", pyarrow.list_(pyarrow.null())
Expand Down

0 comments on commit 2104b71

Please sign in to comment.