Skip to content

Commit 620533e

Browse files
feat: new bytes, json, decimal type mappings
1 parent dab2f2c commit 620533e

File tree

5 files changed

+58
-31
lines changed

5 files changed

+58
-31
lines changed

bigframes/core/compile/compiled.py

+12-2
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value:
220220
raise ValueError(
221221
"Column name {} not in set of values: {}".format(key, self.column_ids)
222222
)
223-
return typing.cast(ibis_types.Value, self._column_names[key])
223+
return typing.cast(
224+
ibis_types.Value,
225+
bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]),
226+
)
224227

225228
def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
226229
ibis_type = typing.cast(
@@ -1177,7 +1180,14 @@ def _to_ibis_expr(
11771180
# Make sure all dtypes are the "canonical" ones for BigFrames. This is
11781181
# important for operations like UNION where the schema must match.
11791182
table = self._table.select(
1180-
bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns
1183+
bigframes.dtypes.ibis_value_to_canonical_type(
1184+
column.resolve(self._table)
1185+
# TODO(https://github.com/ibis-project/ibis/issues/7613): use
1186+
# public API to refer to Deferred type.
1187+
if isinstance(column, ibis.common.deferred.Deferred)
1188+
else column
1189+
)
1190+
for column in columns
11811191
)
11821192
base_table = table
11831193
if self._reduced_predicate is not None:

bigframes/dtypes.py

+23-18
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929

3030
import bigframes.constants as constants
3131
import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers
32+
import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
3233

3334
# Type hints for Pandas dtypes supported by BigQuery DataFrame
3435
Dtype = Union[
@@ -96,6 +97,15 @@
9697
ibis_dtypes.Timestamp(timezone="UTC"),
9798
pd.ArrowDtype(pa.timestamp("us", tz="UTC")),
9899
),
100+
(ibis_dtypes.binary, pd.ArrowDtype(pa.binary())),
101+
(
102+
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True),
103+
pd.ArrowDtype(pa.decimal128(38, 9)),
104+
),
105+
(
106+
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True),
107+
pd.ArrowDtype(pa.decimal256(76, 38)),
108+
),
99109
)
100110

101111
BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = {
@@ -111,6 +121,13 @@
111121
ibis_dtypes.time: pa.time64("us"),
112122
ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"),
113123
ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"),
124+
ibis_dtypes.binary: pd.ArrowDtype(pa.binary()),
125+
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pd.ArrowDtype(
126+
pa.decimal128(38, 9)
127+
),
128+
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pd.ArrowDtype(
129+
pa.decimal256(76, 38)
130+
),
114131
}
115132

116133
ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()}
@@ -124,10 +141,6 @@
124141
)
125142
IBIS_TO_BIGFRAMES.update(
126143
{
127-
ibis_dtypes.binary: np.dtype("O"),
128-
ibis_dtypes.json: np.dtype("O"),
129-
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"),
130-
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"),
131144
ibis_dtypes.GeoSpatial(
132145
geotype="geography", srid=4326, nullable=True
133146
): gpd.array.GeometryDtype(),
@@ -177,7 +190,7 @@ def ibis_dtype_to_bigframes_dtype(
177190
# our IO returns them as objects. Eventually, we should support them as
178191
# ArrowDType (and update the IO accordingly)
179192
if isinstance(ibis_dtype, ibis_dtypes.Array):
180-
return np.dtype("O")
193+
return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
181194

182195
if isinstance(ibis_dtype, ibis_dtypes.Struct):
183196
return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
@@ -223,21 +236,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
223236
This is useful in cases where multiple types correspond to the same BigFrames dtype.
224237
"""
225238
ibis_type = value.type()
239+
name = value.get_name()
240+
if ibis_type.is_json():
241+
value = vendored_ibis_ops.ToJsonString(value).to_expr()
242+
return value.name(name)
226243
# Allow REQUIRED fields to be joined with NULLABLE fields.
227244
nullable_type = ibis_type.copy(nullable=True)
228-
return value.cast(nullable_type).name(value.get_name())
229-
230-
231-
def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table:
232-
"""Converts an Ibis table expression to canonical types.
233-
234-
This is useful in cases where multiple types correspond to the same BigFrames dtype.
235-
"""
236-
casted_columns = []
237-
for column_name in table.columns:
238-
column = typing.cast(ibis_types.Value, table[column_name])
239-
casted_columns.append(ibis_value_to_canonical_type(column))
240-
return table.select(*casted_columns)
245+
return value.cast(nullable_type).name(name)
241246

242247

243248
def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:

tests/system/small/test_dataframe_io.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,8 @@ def test_load_json(session):
9191
expected = pd.DataFrame(
9292
{
9393
"json_column": ['{"bar":true,"foo":10}'],
94-
}
94+
},
95+
dtype=pd.StringDtype(storage="pyarrow"),
9596
)
9697
expected.index = expected.index.astype("Int64")
9798
pd.testing.assert_series_equal(result.dtypes, expected.dtypes)

tests/system/utils.py

+18-6
Original file line number | Diff line number | Diff line change
@@ -133,16 +133,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool):
133133
df["geography_col"].replace({np.nan: None})
134134
)
135135

136-
# Convert bytes types column.
137-
if bytes_col:
136+
if not isinstance(df["bytes_col"].dtype, pd.ArrowDtype):
138137
df["bytes_col"] = df["bytes_col"].apply(
139138
lambda value: base64.b64decode(value) if not pd.isnull(value) else value
140139
)
140+
arrow_table = pa.Table.from_pandas(
141+
pd.DataFrame(df, columns=["bytes_col"]),
142+
schema=pa.schema([("bytes_col", pa.binary())]),
143+
)
144+
df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"]
141145

142-
# Convert numeric types column.
143-
df["numeric_col"] = df["numeric_col"].apply(
144-
lambda value: decimal.Decimal(str(value)) if value else None # type: ignore
145-
)
146+
if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype):
147+
# Convert numeric types column.
148+
df["numeric_col"] = df["numeric_col"].apply(
149+
lambda value: decimal.Decimal(str(value)) if value else None # type: ignore
150+
)
151+
arrow_table = pa.Table.from_pandas(
152+
pd.DataFrame(df, columns=["numeric_col"]),
153+
schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]),
154+
)
155+
df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[
156+
"numeric_col"
157+
]
146158

147159

148160
def assert_pandas_df_equal_pca_components(actual, expected, **kwargs):

tests/unit/test_dtypes.py

+3-4
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,11 @@
3131
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
3232
pytest.param(
3333
ibis_dtypes.Decimal(precision=76, scale=38, nullable=True),
34-
np.dtype("O"),
34+
pd.ArrowDtype(pa.decimal256(76, 38)),
3535
id="bignumeric",
3636
),
3737
pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"),
38-
pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"),
38+
pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"),
3939
pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"),
4040
pytest.param(
4141
ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime"
@@ -49,10 +49,9 @@
4949
pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"),
5050
pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"),
5151
# TODO(tswast): custom dtype (or at least string dtype) for JSON objects
52-
pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"),
5352
pytest.param(
5453
ibis_dtypes.Decimal(precision=38, scale=9, nullable=True),
55-
np.dtype("O"),
54+
pd.ArrowDtype(pa.decimal128(38, 9)),
5655
id="numeric",
5756
),
5857
pytest.param(

0 commit comments

Comments
 (0)