diff --git a/ibis/backends/clickhouse/__init__.py b/ibis/backends/clickhouse/__init__.py
index e8eaa7b67a1f..89a98e61275f 100644
--- a/ibis/backends/clickhouse/__init__.py
+++ b/ibis/backends/clickhouse/__init__.py
@@ -247,6 +247,12 @@ def _normalize_external_tables(self, external_tables=None) -> ExternalData | None:
             n += 1
             if not (schema := obj.schema):
                 raise TypeError(f"Schema is empty for external table {name}")
+            if null_fields := schema.null_fields:
+                raise com.IbisTypeError(
+                    "ClickHouse doesn't support NULL-typed fields. "
+                    "Consider assigning a type through casting or on construction. "
+                    f"Got null typed fields: {null_fields}"
+                )
 
             structure = [
                 f"{name} {type_mapper.to_string(typ.copy(nullable=not typ.is_nested()))}"
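A note on the remedy the new ClickHouse message suggests: "assigning a type through casting or on construction" translates to the sketch below. This is a minimal illustration using `ibis.memtable`; the `null_fields` property it checks is introduced in `ibis/expr/schema.py` later in this diff.

```python
import ibis

t = ibis.memtable({"a": [None]})         # column "a" is inferred as the NULL type
assert t.schema().null_fields == ("a",)  # Schema.null_fields is added in this diff

# assigning a type through casting:
fixed = t.mutate(a=t.a.cast("float64"))
assert fixed.schema().null_fields == ()

# ...or on construction:
fixed2 = ibis.memtable({"a": [None]}, schema={"a": "float64"})
assert fixed2.schema().null_fields == ()
```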
" + f"NULL columns: {null_fields}" + ) + if overwrite: temp_name = util.gen_name("duckdb_table") else: temp_name = name initial_table = sg.table(temp_name, catalog=catalog, db=database, quoted=quoted) - target = sge.Schema( - this=initial_table, - expressions=(schema or table.schema()).to_sqlglot(dialect), - ) + target = sge.Schema(this=initial_table, expressions=schema.to_sqlglot(dialect)) create_stmt = sge.Create( kind="TABLE", @@ -254,7 +261,7 @@ def table(self, name: str, database: str | None = None) -> ir.Table: table_schema = self.get_schema(name, catalog=catalog, database=database) # load geospatial only if geo columns - if any(typ.is_geospatial() for typ in table_schema.types): + if table_schema.geospatial: self.load_extension("spatial") return ops.DatabaseTable( name, @@ -1419,7 +1426,7 @@ def to_pyarrow( **_: Any, ) -> pa.Table: table = self._to_duckdb_relation(expr, params=params, limit=limit).arrow() - return expr.__pyarrow_result__(table) + return expr.__pyarrow_result__(table, data_mapper=DuckDBPyArrowData) def execute( self, diff --git a/ibis/backends/duckdb/converter.py b/ibis/backends/duckdb/converter.py index 7d88f3e0cf75..8ddf28d4b97e 100644 --- a/ibis/backends/duckdb/converter.py +++ b/ibis/backends/duckdb/converter.py @@ -1,9 +1,31 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +import pyarrow as pa + from ibis.formats.pandas import PandasData +from ibis.formats.pyarrow import PyArrowData + +if TYPE_CHECKING: + import ibis.expr.datatypes as dt class DuckDBPandasData(PandasData): @staticmethod def convert_Array(s, dtype, pandas_type): return s.replace(float("nan"), None) + + +class DuckDBPyArrowData(PyArrowData): + @classmethod + def convert_scalar(cls, scalar: pa.Scalar, dtype: dt.DataType) -> pa.Scalar: + if dtype.is_null(): + return pa.scalar(None) + return super().convert_scalar(scalar, dtype) + + @classmethod + def convert_column(cls, column: pa.Array, dtype: dt.DataType) -> pa.Array: + if dtype.is_null(): + return pa.nulls(len(column)) + return super().convert_column(column, dtype) diff --git a/ibis/backends/duckdb/tests/test_client.py b/ibis/backends/duckdb/tests/test_client.py index f02af4f040b6..5382a7f0c0f6 100644 --- a/ibis/backends/duckdb/tests/test_client.py +++ b/ibis/backends/duckdb/tests/test_client.py @@ -11,6 +11,7 @@ from pytest import param import ibis +import ibis.common.exceptions as com import ibis.expr.datatypes as dt from ibis.conftest import LINUX, SANDBOXED, not_windows from ibis.util import gen_name @@ -420,3 +421,24 @@ def test_memtable_doesnt_leak(con, monkeypatch): df = ibis.memtable({"a": [1, 2, 3]}, name=name).execute() assert name not in con.list_tables() assert len(df) == 3 + + +def test_create_table_with_nulls(con): + t = ibis.memtable({"a": [None]}) + schema = t.schema() + + assert schema == ibis.schema({"a": "null"}) + assert schema.null_fields == ("a",) + + name = gen_name("duckdb_all_nulls") + + match = "NULL typed columns" + + with pytest.raises(com.IbisTypeError, match=match): + con.create_table(name, obj=t) + + with pytest.raises(com.IbisTypeError, match=match): + con.create_table(name, obj=t, schema=schema) + + with pytest.raises(com.IbisTypeError, match=match): + con.create_table(name, schema=schema) diff --git a/ibis/backends/exasol/__init__.py b/ibis/backends/exasol/__init__.py index e228ce03df9d..fcb7eb894b27 100644 --- a/ibis/backends/exasol/__init__.py +++ b/ibis/backends/exasol/__init__.py @@ -278,7 +278,7 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: def 
diff --git a/ibis/backends/exasol/__init__.py b/ibis/backends/exasol/__init__.py
index e228ce03df9d..fcb7eb894b27 100644
--- a/ibis/backends/exasol/__init__.py
+++ b/ibis/backends/exasol/__init__.py
@@ -278,7 +278,7 @@ def _get_schema_using_query(self, query: str) -> sch.Schema:
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 "Exasol cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/flink/__init__.py b/ibis/backends/flink/__init__.py
index d4964fa8e3e8..327eb86e4591 100644
--- a/ibis/backends/flink/__init__.py
+++ b/ibis/backends/flink/__init__.py
@@ -369,13 +369,22 @@ def compile(
             expr, params=params, pretty=pretty
         )  # Discard `limit` and other kwargs.
 
+    def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
+        if null_columns := op.schema.null_fields:
+            raise exc.IbisTypeError(
+                f"{self.name} cannot yet reliably handle `null` typed columns; "
+                f"got null typed columns: {null_columns}"
+            )
+        self.create_view(op.name, op.data.to_frame(), schema=op.schema, temp=True)
+
+    def _finalize_memtable(self, name: str) -> None:
+        self.drop_view(name, temp=True, force=True)
+
     def execute(self, expr: ir.Expr, **kwargs: Any) -> Any:
         """Execute an expression."""
-        self._verify_in_memory_tables_are_unique(expr)
-        self._register_udfs(expr)
+        self._run_pre_execute_hooks(expr)
 
-        table_expr = expr.as_table()
-        sql = self.compile(table_expr, **kwargs)
+        sql = self.compile(expr.as_table(), **kwargs)
         df = self._table_env.sql_query(sql).to_pandas()
 
         return expr.__pandas_result__(df)
diff --git a/ibis/backends/impala/__init__.py b/ibis/backends/impala/__init__.py
index 88ef7d1d3402..c3d18605df4a 100644
--- a/ibis/backends/impala/__init__.py
+++ b/ibis/backends/impala/__init__.py
@@ -1225,7 +1225,7 @@ def explain(
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 "Impala cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/mssql/__init__.py b/ibis/backends/mssql/__init__.py
index b682bdfce65a..a5c596113208 100644
--- a/ibis/backends/mssql/__init__.py
+++ b/ibis/backends/mssql/__init__.py
@@ -751,7 +751,7 @@ def create_table(
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 "MS SQL cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py
index b27fdb4b70fe..002a3b0634dc 100644
--- a/ibis/backends/mysql/__init__.py
+++ b/ibis/backends/mysql/__init__.py
@@ -462,7 +462,7 @@ def create_table(
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 "MySQL cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/oracle/__init__.py b/ibis/backends/oracle/__init__.py
index cc3b2315cd57..6a9d1a8180fa 100644
--- a/ibis/backends/oracle/__init__.py
+++ b/ibis/backends/oracle/__init__.py
@@ -509,6 +509,11 @@ def drop_table(
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
+        if null_columns := schema.null_fields:
+            raise exc.IbisTypeError(
+                f"{self.name} cannot yet reliably handle `null` typed columns; "
+                f"got null typed columns: {null_columns}"
+            )
         name = op.name
         quoted = self.compiler.quoted
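Each `_register_in_memory_table` edit above (and in the backends that follow) is the same mechanical refactor: the inline comprehension becomes the new `Schema.null_fields` property, which computes the same names as a tuple rather than a list. Flink additionally gains a real `_register_in_memory_table`/`_finalize_memtable` pair so that `execute` can go through the shared `_run_pre_execute_hooks` path. A quick equivalence check:

```python
import ibis

schema = ibis.schema({"a": "null", "b": "string"})

# the old inline check, as removed above
old_style = [col for col, dtype in schema.items() if dtype.is_null()]

# the new property computes the same names, as a tuple
assert tuple(old_style) == schema.null_fields == ("a",)
```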
diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py
index 60fa400ce48b..c6ee554f546c 100644
--- a/ibis/backends/postgres/__init__.py
+++ b/ibis/backends/postgres/__init__.py
@@ -93,7 +93,7 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         from psycopg2.extras import execute_batch
 
         schema = op.schema
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise exc.IbisTypeError(
                 f"{self.name} cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/risingwave/__init__.py b/ibis/backends/risingwave/__init__.py
index fd3400fb79a1..be226b2affb1 100644
--- a/ibis/backends/risingwave/__init__.py
+++ b/ibis/backends/risingwave/__init__.py
@@ -264,7 +264,7 @@ def create_table(
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 f"{self.name} cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py
index 2de264d55cd7..91c9d7390fd2 100644
--- a/ibis/backends/tests/test_array.py
+++ b/ibis/backends/tests/test_array.py
@@ -940,6 +940,9 @@ def test_array_intersect(con, data):
 @pytest.mark.notimpl(
     ["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
 )
+@pytest.mark.notyet(
+    ["flink"], raises=ValueError, reason="array of struct is not supported"
+)
 def test_unnest_struct(con):
     data = {"value": [[{"a": 1}, {"a": 2}], [{"a": 3}, {"a": 4}]]}
     t = ibis.memtable(data, schema=ibis.schema({"value": "!array<!struct<a: !int64>>"}))
@@ -959,8 +962,8 @@ def test_unnest_struct(con):
 @pytest.mark.notimpl(
     ["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
 )
-@pytest.mark.notimpl(
-    ["flink"], reason="flink unnests a and b as separate columns", raises=Py4JJavaError
+@pytest.mark.notyet(
+    ["flink"], raises=ValueError, reason="array of struct is not supported"
 )
 def test_unnest_struct_with_multiple_fields(con):
     data = {
@@ -1051,9 +1054,7 @@ def test_zip_null(con, fn):
     ["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
 )
 @pytest.mark.notyet(
-    ["flink"],
-    raises=Py4JJavaError,
-    reason="does not seem to support field selection on unnest",
+    ["flink"], raises=ValueError, reason="array of struct is not supported"
 )
 def test_array_of_struct_unnest(con):
     jobs = ibis.memtable(
@@ -1573,15 +1574,16 @@ def test_table_unnest_column_expr(backend):
     assert set(result.values) == set(expected.replace({np.nan: None}).values)
 
 
-@pytest.mark.notimpl(
-    ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError
-)
+@pytest.mark.notimpl(["datafusion", "polars"], raises=com.OperationNotDefinedError)
 @pytest.mark.notimpl(["trino"], raises=TrinoUserError)
 @pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError)
 @pytest.mark.notimpl(["risingwave"], raises=PsycoPg2ProgrammingError)
 @pytest.mark.notyet(
     ["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave"
 )
+@pytest.mark.notyet(
+    ["flink"], raises=ValueError, reason="array of struct is not supported"
+)
 def test_table_unnest_array_of_struct_of_array(con):
     t = ibis.memtable(
         {
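For readers of the schema strings in these tests: in ibis's type syntax a leading `!` marks a type as non-nullable, so `!array<!struct<a: !int64>>` is a non-nullable array of non-nullable structs with a non-nullable `int64` field `a`. A sketch:

```python
import ibis.expr.datatypes as dt

typ = dt.dtype("!array<!struct<a: !int64>>")
assert not typ.nullable                         # the array itself
assert not typ.value_type.nullable              # each struct element
assert not typ.value_type.fields["a"].nullable  # the int64 field
```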
diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py
index 00341b3b1ca0..0df04b817d85 100644
--- a/ibis/backends/tests/test_client.py
+++ b/ibis/backends/tests/test_client.py
@@ -1720,9 +1720,7 @@ def test_insert_into_table_missing_columns(con, temp_table):
 @pytest.mark.notyet(["druid"], raises=AssertionError, reason="can't drop tables")
 @pytest.mark.notyet(
-    ["clickhouse", "flink"],
-    raises=AssertionError,
-    reason="memtables are assembled every time",
+    ["clickhouse"], raises=AssertionError, reason="memtables are assembled every time"
 )
 @pytest.mark.notyet(
     ["bigquery"], raises=AssertionError, reason="test is flaky", strict=False
@@ -1768,7 +1766,7 @@ def test_same_name_memtable_is_overwritten(con):
 
 
 @pytest.mark.notimpl(
-    ["clickhouse", "flink"],
+    ["clickhouse"],
     raises=AssertionError,
     reason="backend doesn't use _register_in_memory_table",
 )
diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py
index 1a99367f0a4b..2cee90dc47c1 100644
--- a/ibis/backends/tests/test_export.py
+++ b/ibis/backends/tests/test_export.py
@@ -7,6 +7,7 @@
 from pytest import param
 
 import ibis
+import ibis.common.exceptions as com
 import ibis.expr.datatypes as dt
 from ibis import util
 from ibis.backends.tests.errors import (
@@ -27,6 +28,7 @@
 
 pd = pytest.importorskip("pandas")
 pa = pytest.importorskip("pyarrow")
+pat = pytest.importorskip("pyarrow.types")
 
 limit = [param(42, id="limit")]
 
@@ -593,4 +595,44 @@ def test_scalar_to_memory(limit, awards_players, output_format, converter):
     expr = awards_players.filter(awards_players.awardID == "DEADBEEF").yearID.min()
     res = method(expr)
+    assert converter(res) is None
+
+
+mark_notyet_nulls = pytest.mark.notyet(
+    [
+        "clickhouse",
+        "exasol",
+        "flink",
+        "impala",
+        "mssql",
+        "mysql",
+        "oracle",
+        "postgres",
+        "risingwave",
+        "trino",
+    ],
+    raises=com.IbisTypeError,
+    reason="unable to handle null types as input",
+)
+
+
+@mark_notyet_nulls
+def test_all_null_table(con):
+    t = ibis.memtable({"a": [None]})
+    result = con.to_pyarrow(t)
+    assert pat.is_null(result["a"].type)
+
+
+@mark_notyet_nulls
+def test_all_null_column(con):
+    t = ibis.memtable({"a": [None]})
+    result = con.to_pyarrow(t.a)
+    assert pat.is_null(result.type)
+
+
+@mark_notyet_nulls
+def test_all_null_scalar(con):
+    e = ibis.literal(None)
+    result = con.to_pyarrow(e)
+    assert pat.is_null(result.type)
diff --git a/ibis/backends/trino/__init__.py b/ibis/backends/trino/__init__.py
index 38f541537501..40587ed01368 100644
--- a/ibis/backends/trino/__init__.py
+++ b/ibis/backends/trino/__init__.py
@@ -550,7 +550,7 @@ def _fetch_from_cursor(self, cursor, schema: sch.Schema) -> pd.DataFrame:
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = op.schema
 
-        if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
+        if null_columns := schema.null_fields:
             raise com.IbisTypeError(
                 "Trino cannot yet reliably handle `null` typed columns; "
                 f"got null typed columns: {null_columns}"
diff --git a/ibis/expr/schema.py b/ibis/expr/schema.py
index b47d13076407..51026744065b 100644
--- a/ibis/expr/schema.py
+++ b/ibis/expr/schema.py
@@ -70,6 +70,10 @@ def types(self):
     def geospatial(self) -> tuple[str, ...]:
         return tuple(name for name, typ in self.fields.items() if typ.is_geospatial())
 
+    @attribute
+    def null_fields(self) -> tuple[str, ...]:
+        return tuple(name for name, typ in self.fields.items() if typ.is_null())
+
     @attribute
     def _name_locs(self) -> dict[str, int]:
         return {v: i for i, v in enumerate(self.names)}
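The new export tests reduce to a plain pyarrow fact: a column containing nothing but `None` carries pyarrow's null type, which is exactly what `pat.is_null` asserts. A standalone sketch:

```python
import pyarrow as pa
import pyarrow.types as pat

col = pa.array([None, None])    # no other type information available
assert pat.is_null(col.type)    # inferred as pa.null()
assert col.equals(pa.nulls(2))  # same as an explicit all-null array
```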
diff --git a/ibis/expr/tests/test_schema.py b/ibis/expr/tests/test_schema.py
index 9fa93be67252..a9711c07160f 100644
--- a/ibis/expr/tests/test_schema.py
+++ b/ibis/expr/tests/test_schema.py
@@ -472,3 +472,9 @@ def test_schema_from_to_pandas_dtypes():
         ("d", pd.DatetimeTZDtype(tz="US/Eastern", unit="ns")),
     ]
     assert restored_dtypes == expected_dtypes
+
+
+def test_null_fields():
+    assert sch.schema({"a": "int64", "b": "string"}).null_fields == ()
+    assert sch.schema({"a": "null", "b": "string"}).null_fields == ("a",)
+    assert sch.schema({"a": "null", "b": "null"}).null_fields == ("a", "b")
diff --git a/ibis/formats/pyarrow.py b/ibis/formats/pyarrow.py
index 93c41b7169aa..eec59563ac6a 100644
--- a/ibis/formats/pyarrow.py
+++ b/ibis/formats/pyarrow.py
@@ -323,12 +323,12 @@ def convert_column(cls, column: pa.Array, dtype: dt.DataType) -> pa.Array:
     @classmethod
     def convert_table(cls, table: pa.Table, schema: Schema) -> pa.Table:
         desired_schema = PyArrowSchema.from_ibis(schema)
-        pa_schema = table.schema
-
-        if pa_schema != desired_schema:
-            return table.cast(desired_schema, safe=False)
-        else:
+        if table.schema == desired_schema:
             return table
+        arrays = [
+            cls.convert_column(table[name], dtype) for name, dtype in schema.items()
+        ]
+        return pa.Table.from_arrays(arrays, names=list(schema.keys()))
 
 
 class PyArrowTableProxy(TableProxy[V]):
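On the `convert_table` rewrite: the old whole-table `table.cast(desired_schema, safe=False)` never consulted the per-column `convert_column` hook, so subclass overrides (such as `DuckDBPyArrowData` above) could not intervene. Rebuilding the table column by column routes every column through that hook. A sketch of the new shape, with the hook stubbed as identity:

```python
import pyarrow as pa

table = pa.table({"a": pa.nulls(3), "b": [1, 2, 3]})

def convert_column(column, dtype):
    # stand-in for the PyArrowData.convert_column classmethod; subclasses
    # such as DuckDBPyArrowData can replace null-typed columns here
    return column

rebuilt = pa.Table.from_arrays(
    [convert_column(table[name], None) for name in table.column_names],
    names=table.column_names,
)
assert rebuilt.equals(table)
```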