Skip to content

Commit a1c0631

Browse files
feat: specific pyarrow mappings for decimal, bytes types (#283)
* feat: new bytes, json, decimal type mappings * amend tests to reflect new types * add implicit type conversion for df.replace * more type casting tests * skip pandas 1.x for more tests --------- Co-authored-by: Tim Swast <swast@google.com>
1 parent ad67465 commit a1c0631

21 files changed

+267
-101
lines changed

bigframes/core/block_transforms.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
131131
if len(index_columns) != 1:
132132
raise ValueError("only method 'linear' supports multi-index")
133133
xvalues = block.index_columns[0]
134-
if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES:
134+
if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
135135
raise ValueError("Can only interpolate on numeric index.")
136136

137137
for column in original_columns:

bigframes/core/blocks.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1063,7 +1063,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]:
10631063
stats: list[agg_ops.AggregateOp] = [agg_ops.count_op]
10641064
if dtype not in bigframes.dtypes.UNORDERED_DTYPES:
10651065
stats += [agg_ops.min_op, agg_ops.max_op]
1066-
if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES:
1066+
if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
10671067
# Notable exclusions:
10681068
# prod op tends to cause overflows
10691069
# Also, var_op is redundant as can be derived from std

bigframes/core/compile/compiled.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value:
220220
raise ValueError(
221221
"Column name {} not in set of values: {}".format(key, self.column_ids)
222222
)
223-
return typing.cast(ibis_types.Value, self._column_names[key])
223+
return typing.cast(
224+
ibis_types.Value,
225+
bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]),
226+
)
224227

225228
def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
226229
ibis_type = typing.cast(
@@ -1177,7 +1180,14 @@ def _to_ibis_expr(
11771180
# Make sure all dtypes are the "canonical" ones for BigFrames. This is
11781181
# important for operations like UNION where the schema must match.
11791182
table = self._table.select(
1180-
bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns
1183+
bigframes.dtypes.ibis_value_to_canonical_type(
1184+
column.resolve(self._table)
1185+
# TODO(https://github.com/ibis-project/ibis/issues/7613): use
1186+
# public API to refer to Deferred type.
1187+
if isinstance(column, ibis.common.deferred.Deferred)
1188+
else column
1189+
)
1190+
for column in columns
11811191
)
11821192
base_table = table
11831193
if self._reduced_predicate is not None:

bigframes/core/groupby/__init__.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,8 @@ def _convert_index(self, dataframe: df.DataFrame):
359359

360360
def _raise_on_non_numeric(self, op: str):
361361
if not all(
362-
dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes
362+
dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
363+
for dtype in self._block.dtypes
363364
):
364365
raise NotImplementedError(
365366
f"'{op}' does not support non-numeric columns. "
@@ -371,7 +372,9 @@ def _raise_on_non_numeric(self, op: str):
371372
def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]:
372373
valid_agg_cols: list[str] = []
373374
for col_id in self._selected_cols:
374-
is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES
375+
is_numeric = (
376+
self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
377+
)
375378
if is_numeric or not numeric_only:
376379
valid_agg_cols.append(col_id)
377380
return valid_agg_cols

bigframes/dataframe.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -1800,7 +1800,7 @@ def agg(
18001800
) -> DataFrame | bigframes.series.Series:
18011801
if utils.is_list_like(func):
18021802
if any(
1803-
dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES
1803+
dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
18041804
for dtype in self.dtypes
18051805
):
18061806
raise NotImplementedError(
@@ -1867,7 +1867,7 @@ def melt(
18671867
)
18681868

18691869
def describe(self) -> DataFrame:
1870-
df_numeric = self._drop_non_numeric(keep_bool=False)
1870+
df_numeric = self._drop_non_numeric(permissive=False)
18711871
if len(df_numeric.columns) == 0:
18721872
raise NotImplementedError(
18731873
f"df.describe() currently only supports numeric values. {constants.FEEDBACK_LINK}"
@@ -2005,10 +2005,12 @@ def unstack(self, level: LevelsType = -1):
20052005
)
20062006
return DataFrame(pivot_block)
20072007

2008-
def _drop_non_numeric(self, keep_bool=True) -> DataFrame:
2009-
types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES)
2010-
if not keep_bool:
2011-
types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES)
2008+
def _drop_non_numeric(self, permissive=True) -> DataFrame:
2009+
types_to_keep = (
2010+
set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
2011+
if permissive
2012+
else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
2013+
)
20122014
non_numeric_cols = [
20132015
col_id
20142016
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
@@ -2026,7 +2028,7 @@ def _drop_non_bool(self) -> DataFrame:
20262028

20272029
def _raise_on_non_numeric(self, op: str):
20282030
if not all(
2029-
dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES
2031+
dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
20302032
for dtype in self._block.dtypes
20312033
):
20322034
raise NotImplementedError(
@@ -2301,7 +2303,7 @@ def notna(self) -> DataFrame:
23012303

23022304
def cumsum(self):
23032305
is_numeric_types = [
2304-
(dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES)
2306+
(dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
23052307
for _, dtype in self.dtypes.items()
23062308
]
23072309
if not all(is_numeric_types):
@@ -2313,7 +2315,7 @@ def cumsum(self):
23132315

23142316
def cumprod(self) -> DataFrame:
23152317
is_numeric_types = [
2316-
(dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES)
2318+
(dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
23172319
for _, dtype in self.dtypes.items()
23182320
]
23192321
if not all(is_numeric_types):

bigframes/dtypes.py

+102-37
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
1616

1717
import datetime
18+
import decimal
1819
import textwrap
1920
import typing
2021
from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -30,6 +31,7 @@
3031

3132
import bigframes.constants as constants
3233
import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers
34+
import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
3335

3436
# Type hints for Pandas dtypes supported by BigQuery DataFrame
3537
Dtype = Union[
@@ -40,9 +42,6 @@
4042
pd.ArrowDtype,
4143
]
4244

43-
# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
44-
NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()]
45-
4645
# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
4746
UNORDERED_DTYPES = [gpd.array.GeometryDtype()]
4847

@@ -57,6 +56,9 @@
5756
"timestamp[us][pyarrow]",
5857
"date32[day][pyarrow]",
5958
"time64[us][pyarrow]",
59+
"decimal128(38, 9)[pyarrow]",
60+
"decimal256(38, 9)[pyarrow]",
61+
"binary[pyarrow]",
6062
]
6163

6264
# Type hints for Ibis data types supported by BigQuery DataFrame
@@ -72,8 +74,17 @@
7274

7375
BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()]
7476

75-
# Several operations are restricted to these types.
76-
NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()]
77+
# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
78+
# Pandas is inconsistent, so two definitions are provided, each used in different contexts
79+
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
80+
pd.Float64Dtype(),
81+
pd.Int64Dtype(),
82+
]
83+
NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + [
84+
pd.BooleanDtype(),
85+
pd.ArrowDtype(pa.decimal128(38, 9)),
86+
pd.ArrowDtype(pa.decimal256(76, 38)),
87+
]
7788

7889
# Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame
7990
ReadOnlyIbisDtype = Union[
@@ -97,6 +108,15 @@
97108
ibis_dtypes.Timestamp(timezone="UTC"),
98109
pd.ArrowDtype(pa.timestamp("us", tz="UTC")),
99110
),
111+
(ibis_dtypes.binary, pd.ArrowDtype(pa.binary())),
112+
(
113+
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True),
114+
pd.ArrowDtype(pa.decimal128(38, 9)),
115+
),
116+
(
117+
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True),
118+
pd.ArrowDtype(pa.decimal256(76, 38)),
119+
),
100120
)
101121

102122
BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = {
@@ -112,6 +132,9 @@
112132
ibis_dtypes.time: pa.time64("us"),
113133
ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"),
114134
ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"),
135+
ibis_dtypes.binary: pa.binary(),
136+
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9),
137+
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38),
115138
}
116139

117140
ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()}
@@ -125,10 +148,6 @@
125148
)
126149
IBIS_TO_BIGFRAMES.update(
127150
{
128-
ibis_dtypes.binary: np.dtype("O"),
129-
ibis_dtypes.json: np.dtype("O"),
130-
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"),
131-
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"),
132151
ibis_dtypes.GeoSpatial(
133152
geotype="geography", srid=4326, nullable=True
134153
): gpd.array.GeometryDtype(),
@@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype(
178197
# our IO returns them as objects. Eventually, we should support them as
179198
# ArrowDType (and update the IO accordingly)
180199
if isinstance(ibis_dtype, ibis_dtypes.Array):
181-
return np.dtype("O")
200+
return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
182201

183202
if isinstance(ibis_dtype, ibis_dtypes.Struct):
184203
return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
@@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype(
200219

201220
def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType:
202221
if isinstance(ibis_dtype, ibis_dtypes.Array):
203-
return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type))
222+
return pa.list_(
223+
ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True))
224+
)
204225

205226
if isinstance(ibis_dtype, ibis_dtypes.Struct):
206227
return pa.struct(
@@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
224245
This is useful in cases where multiple types correspond to the same BigFrames dtype.
225246
"""
226247
ibis_type = value.type()
248+
name = value.get_name()
249+
if ibis_type.is_json():
250+
value = vendored_ibis_ops.ToJsonString(value).to_expr()
251+
return value.name(name)
227252
# Allow REQUIRED fields to be joined with NULLABLE fields.
228253
nullable_type = ibis_type.copy(nullable=True)
229-
return value.cast(nullable_type).name(value.get_name())
230-
231-
232-
def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table:
233-
"""Converts an Ibis table expression to canonical types.
234-
235-
This is useful in cases where multiple types correspond to the same BigFrames dtype.
236-
"""
237-
casted_columns = []
238-
for column_name in table.columns:
239-
column = typing.cast(ibis_types.Value, table[column_name])
240-
casted_columns.append(ibis_value_to_canonical_type(column))
241-
return table.select(*casted_columns)
254+
return value.cast(nullable_type).name(name)
242255

243256

244257
def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
@@ -386,15 +399,35 @@ def cast_ibis_value(
386399
ibis_dtypes.bool,
387400
ibis_dtypes.float64,
388401
ibis_dtypes.string,
402+
ibis_dtypes.Decimal(precision=38, scale=9),
403+
ibis_dtypes.Decimal(precision=76, scale=38),
404+
),
405+
ibis_dtypes.float64: (
406+
ibis_dtypes.string,
407+
ibis_dtypes.int64,
408+
ibis_dtypes.Decimal(precision=38, scale=9),
409+
ibis_dtypes.Decimal(precision=76, scale=38),
410+
),
411+
ibis_dtypes.string: (
412+
ibis_dtypes.int64,
413+
ibis_dtypes.float64,
414+
ibis_dtypes.Decimal(precision=38, scale=9),
415+
ibis_dtypes.Decimal(precision=76, scale=38),
416+
ibis_dtypes.binary,
389417
),
390-
ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64),
391-
ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64),
392418
ibis_dtypes.date: (ibis_dtypes.string,),
393-
ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,),
394-
ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,),
419+
ibis_dtypes.Decimal(precision=38, scale=9): (
420+
ibis_dtypes.float64,
421+
ibis_dtypes.Decimal(precision=76, scale=38),
422+
),
423+
ibis_dtypes.Decimal(precision=76, scale=38): (
424+
ibis_dtypes.float64,
425+
ibis_dtypes.Decimal(precision=38, scale=9),
426+
),
395427
ibis_dtypes.time: (),
396428
ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
397429
ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
430+
ibis_dtypes.binary: (ibis_dtypes.string,),
398431
}
399432

400433
value = ibis_value_to_canonical_type(value)
@@ -458,30 +491,62 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
458491
return False
459492

460493

494+
# string is binary
461495
def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
462496
"""Determine whether a scalar's type matches a given pyarrow type."""
463497
if pa_type == pa.time64("us"):
464498
return isinstance(scalar, datetime.time)
465-
if pa_type == pa.timestamp("us"):
499+
elif pa_type == pa.timestamp("us"):
466500
if isinstance(scalar, datetime.datetime):
467501
return not scalar.tzinfo
468502
if isinstance(scalar, pd.Timestamp):
469503
return not scalar.tzinfo
470-
if pa_type == pa.timestamp("us", tz="UTC"):
504+
elif pa_type == pa.timestamp("us", tz="UTC"):
471505
if isinstance(scalar, datetime.datetime):
472506
return scalar.tzinfo == datetime.timezone.utc
473507
if isinstance(scalar, pd.Timestamp):
474508
return scalar.tzinfo == datetime.timezone.utc
475-
if pa_type == pa.date32():
509+
elif pa_type == pa.date32():
476510
return isinstance(scalar, datetime.date)
511+
elif pa_type == pa.binary():
512+
return isinstance(scalar, bytes)
513+
elif pa_type == pa.decimal128(38, 9):
514+
# decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
515+
return isinstance(scalar, decimal.Decimal)
516+
elif pa_type == pa.decimal256(76, 38):
517+
# decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
518+
return isinstance(scalar, decimal.Decimal)
477519
return False
478520

479521

480-
def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
481-
"""Whether scalar can be compare to items of dtype (though maybe requiring coercion)"""
522+
def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
523+
"""Whether scalar can be compare to items of dtype (though maybe requiring coercion). Returns the datatype that must be used for the comparison"""
482524
if is_dtype(scalar, dtype):
483-
return True
525+
return dtype
484526
elif pd.api.types.is_numeric_dtype(dtype):
485-
return pd.api.types.is_number(scalar)
486-
else:
487-
return False
527+
# Implicit conversion currently only supported for numeric types
528+
if pd.api.types.is_bool(scalar):
529+
return lcd_type(pd.BooleanDtype(), dtype)
530+
if pd.api.types.is_float(scalar):
531+
return lcd_type(pd.Float64Dtype(), dtype)
532+
if pd.api.types.is_integer(scalar):
533+
return lcd_type(pd.Int64Dtype(), dtype)
534+
if isinstance(scalar, decimal.Decimal):
535+
# TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC
536+
return lcd_type(pd.ArrowDtype(pa.decimal128(76, 38)), dtype)
537+
return None
538+
539+
540+
def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]:
541+
# Implicit conversion currently only supported for numeric types
542+
hierarchy: list[Dtype] = [
543+
pd.BooleanDtype(),
544+
pd.Int64Dtype(),
545+
pd.Float64Dtype(),
546+
pd.ArrowDtype(pa.decimal128(38, 9)),
547+
pd.ArrowDtype(pa.decimal256(76, 38)),
548+
]
549+
if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
550+
return None
551+
lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
552+
return hierarchy[lcd_index]

0 commit comments

Comments
 (0)