Skip to content

Commit d048aa8

Browse files
refactor: Switch to using internal schema rules rather than ibis schema (#587)
1 parent eed12c1 commit d048aa8

File tree

5 files changed

+122
-61
lines changed

5 files changed

+122
-61
lines changed

bigframes/core/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ def session(self) -> Session:
106106

107107
@functools.cached_property
108108
def schema(self) -> schemata.ArraySchema:
109-
# TODO: switch to use self.node.schema
110-
return self._compiled_schema
109+
return self.node.schema
111110

112111
@functools.cached_property
113112
def _compiled_schema(self) -> schemata.ArraySchema:

bigframes/core/blocks.py

+30-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import dataclasses
2525
import functools
2626
import itertools
27+
import os
2728
import random
2829
import typing
2930
from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple
@@ -41,10 +42,12 @@
4142
import bigframes.core.guid as guid
4243
import bigframes.core.join_def as join_defs
4344
import bigframes.core.ordering as ordering
45+
import bigframes.core.schema as bf_schema
4446
import bigframes.core.tree_properties as tree_properties
4547
import bigframes.core.utils
4648
import bigframes.core.utils as utils
4749
import bigframes.dtypes
50+
import bigframes.features
4851
import bigframes.operations as ops
4952
import bigframes.operations.aggregations as agg_ops
5053
import bigframes.session._io.pandas
@@ -411,7 +414,32 @@ def _to_dataframe(self, result) -> pd.DataFrame:
411414
"""Convert BigQuery data to pandas DataFrame with specific dtypes."""
412415
dtypes = dict(zip(self.index_columns, self.index.dtypes))
413416
dtypes.update(zip(self.value_columns, self.dtypes))
414-
return self.session._rows_to_dataframe(result, dtypes)
417+
result_dataframe = self.session._rows_to_dataframe(result, dtypes)
418+
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
419+
# Do not execute these validations outside of testing suite.
420+
if "PYTEST_CURRENT_TEST" in os.environ:
421+
self._validate_result_schema(result_dataframe)
422+
return result_dataframe
423+
424+
def _validate_result_schema(self, result_df: pd.DataFrame):
425+
ibis_schema = self.expr._compiled_schema
426+
internal_schema = self.expr.node.schema
427+
actual_schema = bf_schema.ArraySchema(
428+
tuple(
429+
bf_schema.SchemaItem(name, dtype) # type: ignore
430+
for name, dtype in result_df.dtypes.items()
431+
)
432+
)
433+
if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
434+
return
435+
if internal_schema != actual_schema:
436+
raise ValueError(
437+
f"This error should only occur while testing. BigFrames internal schema: {internal_schema} does not match actual schema: {actual_schema}"
438+
)
439+
if ibis_schema != actual_schema:
440+
raise ValueError(
441+
f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}"
442+
)
415443

416444
def to_pandas(
417445
self,
@@ -1204,7 +1232,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp
12041232
# TODO: annotate aggregations themself with this information
12051233
dtype = self.expr.get_column_type(column_id)
12061234
stats: list[agg_ops.UnaryAggregateOp] = [agg_ops.count_op]
1207-
if dtype not in bigframes.dtypes.UNORDERED_DTYPES:
1235+
if bigframes.dtypes.is_orderable(dtype):
12081236
stats += [agg_ops.min_op, agg_ops.max_op]
12091237
if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
12101238
# Notable exclusions:

bigframes/dataframe.py

-6
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from __future__ import annotations
1818

1919
import datetime
20-
import os
2120
import re
2221
import sys
2322
import textwrap
@@ -175,11 +174,6 @@ def __init__(
175174
self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
176175
self._query_job: Optional[bigquery.QueryJob] = None
177176

178-
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
179-
# Do not execute these validations outside of testing suite.
180-
if "PYTEST_CURRENT_TEST" in os.environ:
181-
self._block.expr.validate_schema()
182-
183177
def __dir__(self):
184178
return dir(type(self)) + [
185179
label

bigframes/dtypes.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,11 @@
5757
TIME_DTYPE = pd.ArrowDtype(pa.time64("us"))
5858
DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us"))
5959
TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
60+
GEO_DTYPE = gpd.array.GeometryDtype()
6061

6162
# Used when storing Null expressions
6263
DEFAULT_DTYPE = FLOAT_DTYPE
6364

64-
# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
65-
UNORDERED_DTYPES = [gpd.array.GeometryDtype()]
66-
6765
# Type hints for dtype strings supported by BigQuery DataFrame
6866
DtypeString = Literal[
6967
"boolean",
@@ -134,6 +132,12 @@ def is_array_like(type: ExpressionType) -> bool:
134132
)
135133

136134

135+
def is_struct_like(type: ExpressionType) -> bool:
136+
return isinstance(type, pd.ArrowDtype) and isinstance(
137+
type.pyarrow_dtype, pa.StructType
138+
)
139+
140+
137141
def is_numeric(type: ExpressionType) -> bool:
138142
return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
139143

@@ -143,18 +147,18 @@ def is_iterable(type: ExpressionType) -> bool:
143147

144148

145149
def is_comparable(type: ExpressionType) -> bool:
146-
return (type is not None) and (type not in UNORDERED_DTYPES)
150+
return (type is not None) and is_orderable(type)
147151

148152

149-
# Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame
150-
ReadOnlyIbisDtype = Union[
151-
ibis_dtypes.Binary,
152-
ibis_dtypes.JSON,
153-
ibis_dtypes.Decimal,
154-
ibis_dtypes.GeoSpatial,
155-
ibis_dtypes.Array,
156-
ibis_dtypes.Struct,
157-
]
153+
def is_orderable(type: ExpressionType) -> bool:
154+
# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
155+
return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE)
156+
157+
158+
def is_bool_coercable(type: ExpressionType) -> bool:
159+
# TODO: Implement more bool coercions
160+
return (type is None) or is_numeric(type) or is_string_like(type)
161+
158162

159163
BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = (
160164
(ibis_dtypes.boolean, pd.BooleanDtype()),

0 commit comments

Comments
 (0)