test: test most relevant dtype for aggregates (#595)

shobsi · web-flow · commit 96c150a9de9a · 2024-04-25T02:12:43.000Z
* fix: keep most relevant dtype for aggregates

* add aggregate tests for bool result

* refactor and reuse dtypes.lcd_dtype

* check_dtype=False
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
@@ -658,10 +658,14 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
     return None
 
 
-def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
-    """Get the supertype of the two types."""
-    if dtype1 == dtype2:
-        return dtype1
+def lcd_type(*dtypes: Dtype) -> Dtype:
+    if len(dtypes) < 1:
+        raise ValueError("at least one dypes should be provided")
+    if len(dtypes) == 1:
+        return dtypes[0]
+    unique_dtypes = set(dtypes)
+    if len(unique_dtypes) == 1:
+        return unique_dtypes.pop()
     # Implicit conversion currently only supported for numeric types
     hierarchy: list[Dtype] = [
         pd.BooleanDtype(),
@@ -670,9 +674,9 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
         pd.ArrowDtype(pa.decimal256(76, 38)),
         pd.Float64Dtype(),
     ]
-    if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
+    if any([dtype not in hierarchy for dtype in dtypes]):
         return None
-    lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
+    lcd_index = max([hierarchy.index(dtype) for dtype in dtypes])
     return hierarchy[lcd_index]
 
 
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -2390,12 +2390,27 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods
 def test_dataframe_agg_single_string(scalars_dfs):
     numeric_cols = ["int64_col", "int64_too", "float64_col"]
     scalars_df, scalars_pandas_df = scalars_dfs
+
     bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
     pd_result = scalars_pandas_df[numeric_cols].agg("sum")
 
-    # Pandas may produce narrower numeric types, but bigframes always produces Float64
-    pd_result = pd_result.astype("Float64")
-    pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+    assert bf_result.dtype == "Float64"
+    pd.testing.assert_series_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_dataframe_agg_int_single_string(scalars_dfs):
+    numeric_cols = ["int64_col", "int64_too", "bool_col"]
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
+    pd_result = scalars_pandas_df[numeric_cols].agg("sum")
+
+    assert bf_result.dtype == "Int64"
+    pd.testing.assert_series_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
 
 
 def test_dataframe_agg_multi_string(scalars_dfs):
@@ -2431,6 +2446,27 @@ def test_dataframe_agg_multi_string(scalars_dfs):
     ).all()
 
 
+def test_dataframe_agg_int_multi_string(scalars_dfs):
+    numeric_cols = ["int64_col", "int64_too", "bool_col"]
+    aggregations = [
+        "sum",
+        "nunique",
+        "count",
+    ]
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas()
+    pd_result = scalars_pandas_df[numeric_cols].agg(aggregations)
+
+    for dtype in bf_result.dtypes:
+        assert dtype == "Int64"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    pd.testing.assert_frame_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
 @skip_legacy_pandas
 def test_df_describe(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
@@ -2982,6 +3018,58 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
         pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99
 
 
+@pytest.mark.parametrize(
+    ("col", "op"),
+    [
+        # Int aggregates
+        pytest.param("int64_col", lambda x: x.sum(), id="int-sum"),
+        pytest.param("int64_col", lambda x: x.min(), id="int-min"),
+        pytest.param("int64_col", lambda x: x.max(), id="int-max"),
+        pytest.param("int64_col", lambda x: x.count(), id="int-count"),
+        pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"),
+        # Float aggregates
+        pytest.param("float64_col", lambda x: x.count(), id="float-count"),
+        pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"),
+        # Bool aggregates
+        pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"),
+        pytest.param("bool_col", lambda x: x.count(), id="bool-count"),
+        pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"),
+        # String aggregates
+        pytest.param("string_col", lambda x: x.count(), id="string-count"),
+        pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"),
+    ],
+)
+def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op):
+    bf_result = op(scalars_df_index[[col]]).to_pandas()
+    pd_result = op(scalars_pandas_df_index[[col]])
+
+    # Check dtype separately
+    assert bf_result.dtype == "Int64"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+
+
+@pytest.mark.parametrize(
+    ("col", "op"),
+    [
+        pytest.param("bool_col", lambda x: x.min(), id="bool-min"),
+        pytest.param("bool_col", lambda x: x.max(), id="bool-max"),
+    ],
+)
+def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op):
+    bf_result = op(scalars_df_index[[col]]).to_pandas()
+    pd_result = op(scalars_pandas_df_index[[col]])
+
+    # Check dtype separately
+    assert bf_result.dtype == "boolean"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+
+
 @pytest.mark.parametrize(
     ("ordered"),
     [
@@ -2990,34 +3078,38 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
     ],
 )
 @pytest.mark.parametrize(
-    ("op"),
+    ("op", "bf_dtype"),
     [
-        (lambda x: x.sum(numeric_only=True)),
-        (lambda x: x.mean(numeric_only=True)),
-        (lambda x: x.min(numeric_only=True)),
-        (lambda x: x.max(numeric_only=True)),
-        (lambda x: x.std(numeric_only=True)),
-        (lambda x: x.var(numeric_only=True)),
-        (lambda x: x.count(numeric_only=False)),
-        (lambda x: x.nunique()),
+        (lambda x: x.sum(numeric_only=True), "Float64"),
+        (lambda x: x.mean(numeric_only=True), "Float64"),
+        (lambda x: x.min(numeric_only=True), "Float64"),
+        (lambda x: x.max(numeric_only=True), "Float64"),
+        (lambda x: x.std(numeric_only=True), "Float64"),
+        (lambda x: x.var(numeric_only=True), "Float64"),
+        (lambda x: x.count(numeric_only=False), "Int64"),
+        (lambda x: x.nunique(), "Int64"),
     ],
     ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"],
 )
-def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered):
+def test_dataframe_aggregates(
+    scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered
+):
     col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"]
     bf_series = op(scalars_df_index[col_names])
-    pd_series = op(scalars_pandas_df_index[col_names])
     bf_result = bf_series.to_pandas(ordered=ordered)
+    pd_result = op(scalars_pandas_df_index[col_names])
+
+    # Check dtype separately
+    assert bf_result.dtype == bf_dtype
 
     # Pandas may produce narrower numeric types, but bigframes always produces Float64
     # Pandas has object index type
-    pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow"))
     assert_series_equal(
-        pd_series,
+        pd_result,
         bf_result,
+        check_dtype=False,
         check_index_type=False,
         ignore_order=not ordered,
-        check_dtype=False,
     )