Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: test most relevant dtype for aggregates #595

Merged
merged 8 commits into from
Apr 25, 2024
16 changes: 10 additions & 6 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,10 +658,14 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
return None


def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
"""Get the supertype of the two types."""
if dtype1 == dtype2:
return dtype1
def lcd_type(*dtypes: Dtype) -> Dtype:
if len(dtypes) < 1:
raise ValueError("at least one dypes should be provided")
if len(dtypes) == 1:
return dtypes[0]
unique_dtypes = set(dtypes)
if len(unique_dtypes) == 1:
return unique_dtypes.pop()
# Implicit conversion currently only supported for numeric types
hierarchy: list[Dtype] = [
pd.BooleanDtype(),
Expand All @@ -670,9 +674,9 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
pd.ArrowDtype(pa.decimal256(76, 38)),
pd.Float64Dtype(),
]
if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
if any([dtype not in hierarchy for dtype in dtypes]):
return None
lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
lcd_index = max([hierarchy.index(dtype) for dtype in dtypes])
return hierarchy[lcd_index]


Expand Down
126 changes: 109 additions & 17 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,12 +2390,27 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods
def test_dataframe_agg_single_string(scalars_dfs):
numeric_cols = ["int64_col", "int64_too", "float64_col"]
scalars_df, scalars_pandas_df = scalars_dfs

bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
pd_result = scalars_pandas_df[numeric_cols].agg("sum")

# Pandas may produce narrower numeric types, but bigframes always produces Float64
pd_result = pd_result.astype("Float64")
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
assert bf_result.dtype == "Float64"
pd.testing.assert_series_equal(
pd_result, bf_result, check_dtype=False, check_index_type=False
)


def test_dataframe_agg_int_single_string(scalars_dfs):
    """A single string aggregation over integer/bool columns yields Int64."""
    bf_df, pd_df = scalars_dfs
    cols = ["int64_col", "int64_too", "bool_col"]

    bf_result = bf_df[cols].agg("sum").to_pandas()
    pd_result = pd_df[cols].agg("sum")

    # BigFrames should keep the integer supertype instead of widening to Float64.
    assert bf_result.dtype == "Int64"
    pd.testing.assert_series_equal(
        pd_result, bf_result, check_dtype=False, check_index_type=False
    )


def test_dataframe_agg_multi_string(scalars_dfs):
Expand Down Expand Up @@ -2431,6 +2446,27 @@ def test_dataframe_agg_multi_string(scalars_dfs):
).all()


def test_dataframe_agg_int_multi_string(scalars_dfs):
    """Multiple string aggregations over integer/bool columns all yield Int64."""
    bf_df, pd_df = scalars_dfs
    cols = ["int64_col", "int64_too", "bool_col"]
    aggs = [
        "sum",
        "nunique",
        "count",
    ]

    bf_result = bf_df[cols].agg(aggs).to_pandas()
    pd_result = pd_df[cols].agg(aggs)

    # Every result column should use the integer supertype.
    for result_dtype in bf_result.dtypes:
        assert result_dtype == "Int64"

    # Pandas may produce narrower numeric types
    # Pandas has object index type
    pd.testing.assert_frame_equal(
        pd_result, bf_result, check_dtype=False, check_index_type=False
    )


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
Expand Down Expand Up @@ -2959,6 +2995,58 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99


@pytest.mark.parametrize(
    ("col", "op"),
    [
        # Int aggregates
        pytest.param("int64_col", lambda x: x.sum(), id="int-sum"),
        pytest.param("int64_col", lambda x: x.min(), id="int-min"),
        pytest.param("int64_col", lambda x: x.max(), id="int-max"),
        pytest.param("int64_col", lambda x: x.count(), id="int-count"),
        pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"),
        # Float aggregates
        pytest.param("float64_col", lambda x: x.count(), id="float-count"),
        pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"),
        # Bool aggregates
        pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"),
        pytest.param("bool_col", lambda x: x.count(), id="bool-count"),
        pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"),
        # String aggregates
        pytest.param("string_col", lambda x: x.count(), id="string-count"),
        pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"),
    ],
)
def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op):
    """Aggregations parametrized here should always produce an Int64 result."""
    bf_result = op(scalars_df_index[[col]]).to_pandas()
    pd_result = op(scalars_pandas_df_index[[col]])

    # dtype is asserted on its own because pandas may pick a narrower type.
    assert bf_result.dtype == "Int64"

    # Pandas may produce narrower numeric types and uses an object index,
    # so compare values only.
    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@pytest.mark.parametrize(
    ("col", "op"),
    [
        pytest.param("bool_col", lambda x: x.min(), id="bool-min"),
        pytest.param("bool_col", lambda x: x.max(), id="bool-max"),
    ],
)
def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op):
    """min/max over a boolean column should stay boolean, not widen."""
    bf_result = op(scalars_df_index[[col]]).to_pandas()
    pd_result = op(scalars_pandas_df_index[[col]])

    # dtype is asserted on its own because pandas may pick a narrower type.
    assert bf_result.dtype == "boolean"

    # Pandas may produce narrower numeric types and uses an object index,
    # so compare values only.
    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@pytest.mark.parametrize(
("ordered"),
[
Expand All @@ -2967,34 +3055,38 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
],
)
@pytest.mark.parametrize(
("op"),
("op", "bf_dtype"),
[
(lambda x: x.sum(numeric_only=True)),
(lambda x: x.mean(numeric_only=True)),
(lambda x: x.min(numeric_only=True)),
(lambda x: x.max(numeric_only=True)),
(lambda x: x.std(numeric_only=True)),
(lambda x: x.var(numeric_only=True)),
(lambda x: x.count(numeric_only=False)),
(lambda x: x.nunique()),
(lambda x: x.sum(numeric_only=True), "Float64"),
(lambda x: x.mean(numeric_only=True), "Float64"),
(lambda x: x.min(numeric_only=True), "Float64"),
(lambda x: x.max(numeric_only=True), "Float64"),
(lambda x: x.std(numeric_only=True), "Float64"),
(lambda x: x.var(numeric_only=True), "Float64"),
(lambda x: x.count(numeric_only=False), "Int64"),
(lambda x: x.nunique(), "Int64"),
],
ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"],
)
def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered):
def test_dataframe_aggregates(
scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered
):
col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"]
bf_series = op(scalars_df_index[col_names])
pd_series = op(scalars_pandas_df_index[col_names])
bf_result = bf_series.to_pandas(ordered=ordered)
pd_result = op(scalars_pandas_df_index[col_names])

# Check dtype separately
assert bf_result.dtype == bf_dtype

# Pandas may produce narrower numeric types, but bigframes always produces Float64
# Pandas has object index type
pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow"))
assert_series_equal(
pd_series,
pd_result,
bf_result,
check_dtype=False,
check_index_type=False,
ignore_order=not ordered,
check_dtype=False,
)


Expand Down