From 9faa4609603527d1f236eaa13ae21e72c434e0e1 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 1 Apr 2024 19:04:48 +0800 Subject: [PATCH 01/33] Set preserve_dtype flag for bool type only when result is also bool --- pandas/core/groupby/ops.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8585ae3828247..a5e9036eb7b75 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -914,16 +914,19 @@ def agg_series( np.ndarray or ExtensionArray """ - if not isinstance(obj._values, np.ndarray): + result = self._aggregate_series_pure_python(obj, func) + npvalues = lib.maybe_convert_objects(result, try_float=False) + + if not isinstance(obj._values, np.ndarray) and obj._values.dtype._is_boolean: + if npvalues.dtype == "bool": + preserve_dtype = True + elif not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. 
preserve_dtype = True - result = self._aggregate_series_pure_python(obj, func) - - npvalues = lib.maybe_convert_objects(result, try_float=False) if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: From 969d5b103efe8a68a510dcbac8ba84534181bc96 Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 2 Apr 2024 17:13:26 +0800 Subject: [PATCH 02/33] Update implementation to change type to pyarrow only --- pandas/core/groupby/ops.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a5e9036eb7b75..809053308235d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,6 +18,7 @@ ) import numpy as np +import pyarrow as pa from pandas._libs import ( NaT, @@ -45,12 +46,15 @@ ensure_uint64, is_1d_only_ea_dtype, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, ) from pandas.core.arrays import Categorical +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.construction import array as pd_array from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -917,17 +921,15 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) npvalues = lib.maybe_convert_objects(result, try_float=False) - if not isinstance(obj._values, np.ndarray) and obj._values.dtype._is_boolean: - if npvalues.dtype == "bool": - preserve_dtype = True - elif not isinstance(obj._values, np.ndarray): + if isinstance(obj._values, ArrowExtensionArray): + pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) + pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) + out = pd_array(npvalues, dtype=pandas_pyarrow_dtype) + elif not isinstance(obj._values, np.ndarray) or preserve_dtype: # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # 
with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. - preserve_dtype = True - - if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues From 66114f397b892e2fa30cf4cda641f9c2104f1038 Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 2 Apr 2024 17:15:07 +0800 Subject: [PATCH 03/33] Change import order --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 809053308235d..177ff6cbc69bd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -922,6 +922,7 @@ def agg_series( npvalues = lib.maybe_convert_objects(result, try_float=False) if isinstance(obj._values, ArrowExtensionArray): + # convert to pyarrow extension pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) out = pd_array(npvalues, dtype=pandas_pyarrow_dtype) From b0290ed659060969325c6eb10e2c9cfa5011fba2 Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 3 Apr 2024 18:46:29 +0800 Subject: [PATCH 04/33] Convert numpy array to pandas representation of pyarrow array --- pandas/_libs/lib.pyx | 4 +++- pandas/core/groupby/ops.py | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a2205454a5a46..fdb9eb93181d1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2546,7 +2546,7 @@ def maybe_convert_objects(ndarray[object] objects, if not convert_non_numeric: seen.object_ = True break - elif util.is_nan(val): + elif util.is_nan(val) or is_matching_na(val, C_NA): seen.nan_ = True mask[i] = True if util.is_complex_object(val): @@ -2555,6 +2555,8 @@ def maybe_convert_objects(ndarray[object] objects, seen.complex_ = True if not convert_numeric: break + elif is_matching_na(val, C_NA): + floats[i] = complexes[i] = fnan else: floats[i] = complexes[i] = 
val elif util.is_bool_object(val): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 177ff6cbc69bd..4bd0df7718a62 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -918,20 +918,29 @@ def agg_series( np.ndarray or ExtensionArray """ - result = self._aggregate_series_pure_python(obj, func) - npvalues = lib.maybe_convert_objects(result, try_float=False) - - if isinstance(obj._values, ArrowExtensionArray): - # convert to pyarrow extension - pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) - pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) - out = pd_array(npvalues, dtype=pandas_pyarrow_dtype) - elif not isinstance(obj._values, np.ndarray) or preserve_dtype: + if not isinstance(obj._values, np.ndarray) and not isinstance( + obj._values, ArrowExtensionArray + ): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. 
+ preserve_dtype = True + + result = self._aggregate_series_pure_python(obj, func) + + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + elif ( + isinstance(obj._values, ArrowExtensionArray) + and npvalues.dtype != np.dtype("object") + and npvalues.dtype != np.dtype("complex128") + ): + pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) + pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) + out = pd_array(npvalues, dtype=pandas_pyarrow_dtype) + else: out = npvalues return out From 20c8fa09f62b9e1d95a9ad26dfc043a51dd7f3ea Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 3 Apr 2024 18:47:04 +0800 Subject: [PATCH 05/33] Add tests --- .../tests/groupby/aggregate/test_aggregate.py | 316 ++++++++++++++++++ 1 file changed, 316 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2b9df1b7079da..195d74c2b835b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1663,3 +1663,319 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_agg_simple_lambda_numpy_to_same_data_type(): + df = DataFrame( + {"A": [1, 3, 100, 3, 100, 100], "B": [False, False, False, False, False, True]} + ) + df["B"] = df["B"].astype("bool") + gb = df.groupby("A") + result = gb.agg(lambda x: x.max()) + + expected = DataFrame({"A": [1, 3, 100], "B": [False, False, True]}) + expected["B"] = expected["B"].astype("bool") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_simple_lambda_pyarrow_to_same_data_type(): + df = DataFrame( + {"A": [1, 3, 100, 3, 100, 100], "B": [False, False, False, False, False, True]} + ) + df["B"] = df["B"].astype("bool[pyarrow]") + gb = 
df.groupby("A") + result = gb.agg(lambda x: x.max()) + + expected = DataFrame({"A": [1, 3, 100], "B": [False, False, True]}) + expected["B"] = expected["B"].astype("bool[pyarrow]") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_simple_lambda_numpy_to_diff_data_type(): + df = DataFrame( + {"A": [1, 3, 100, 3, 100, 100], "B": [False, True, True, False, False, True]} + ) + df["B"] = df["B"].astype("bool") + gb = df.groupby("A") + result = gb.agg(lambda x: x.sum()) + + expected = DataFrame({"A": [1, 3, 100], "B": [0, 1, 2]}) + expected["B"] = expected["B"].astype("int64") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_simple_lambda_pyarrow_to_diff_data_type(): + df = DataFrame( + {"A": [1, 3, 100, 3, 100, 100], "B": [False, True, True, False, False, True]} + ) + df["B"] = df["B"].astype("bool[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: x.sum()) + + expected = DataFrame({"A": [1, 3, 100], "B": [0, 1, 2]}) + expected["B"] = expected["B"].astype("int64[pyarrow]") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_numpy_to_diff_data_type(): + df = DataFrame( + { + "A": [ + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat21", + "cat21", + "cat18", + ], + "B": [37, 4958, -4839, 85943, 5490, 1, 0, 945, -943049, -132, 3], + } + ) + df["B"] = df["B"].astype("int32") + gb = df.groupby("A") + result = gb.agg(lambda x: (x.sum() / x.count()) + x.max() - 3 + 5) + + expected = DataFrame( + { + "A": ["cat18", "cat21", "cat39403"], + "B": [8.0, -152216.83333333334, 109048.75], + } + ) + expected["B"] = expected["B"].astype("float64") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, 
expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_pyarrow_to_diff_data_type(): + df = DataFrame( + { + "A": [ + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat21", + "cat21", + "cat18", + ], + "B": [37, 4958, -4839, 85943, 5490, 1, 0, 945, -943049, -132, 3], + } + ) + df["B"] = df["B"].astype("int32[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: (x.sum() / x.count()) + x.max() - 3 + 5) + + expected = DataFrame( + { + "A": ["cat18", "cat21", "cat39403"], + "B": [8.0, -152216.83333333334, 109048.75], + } + ) + expected["B"] = expected["B"].astype("double[pyarrow]") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_numpy_to_same_data_type(): + df = DataFrame( + { + "A": [ + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat21", + "cat21", + "cat18", + ], + "B": [ + 37.0, + 4958.0, + -4839.0, + 85943.0, + 5490.0, + 1.0, + 0.0, + 945.0, + -943049.0, + -132.0, + 3.0, + ], + } + ) + df["B"] = df["B"].astype("float64") + gb = df.groupby("A") + result = gb.agg(lambda x: x.std() / x.var() * 10 / 3 - 32 + 3) + + expected = DataFrame( + {"A": ["cat18", "cat21", "cat39403"], "B": [np.nan, -28.999991, -28.999921]} + ) + expected["B"] = expected["B"].astype("float64") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_pyarrow_to_same_data_type(): + df = DataFrame( + { + "A": [ + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat39403", + "cat21", + "cat21", + "cat39403", + "cat21", + "cat21", + "cat18", + ], + "B": [ + 37.0, + 4958.0, + -4839.0, + 85943.0, + 5490.0, + 1.0, + 0.0, + 945.0, + -943049.0, + -132.0, + 3.0, + ], + } + ) + df["B"] = df["B"].astype("double[pyarrow]") + gb = df.groupby("A") + result = 
gb.agg(lambda x: x.std() / x.var() * 10 / 3 - 32 + 3) + + expected = DataFrame( + {"A": ["cat18", "cat21", "cat39403"], "B": [np.nan, -28.999991, -28.999921]} + ) + expected["B"] = expected["B"].astype("double[pyarrow]") + expected.set_index("A", inplace=True) + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_pyarrow_to_data_type_conversion(): + # test numpy datatype conversion back to pyarrow datatype + # complexes, floats, ints, uints, object + # float64 + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("float64[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: x) + + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("float64[pyarrow]") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + # complex128 + df["B"] = df["B"].astype("int64[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: complex(x.sum(), x.count())) + + expected = DataFrame( + { + "A": ["c1", "c2", "c3"], + "B": [complex(100, 1), complex(200, 1), complex(255, 1)], + } + ) + expected["B"] = expected["B"].astype("complex128") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + # int64 + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("int64[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: x) + + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("int64[pyarrow]") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + # uint64 + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("uint64[pyarrow]") + gb = 
df.groupby("A") + result = gb.agg(lambda x: x) + + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("int64[pyarrow]") + expected.set_index("A", inplace=True) + + # uint64 casted + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("uint64[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: np.uint64(x.sum())) + + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("uint64[pyarrow]") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + # bool + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("bool[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: x) + + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("bool[pyarrow]") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + # object + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df["B"] = df["B"].astype("int64[pyarrow]") + gb = df.groupby("A") + result = gb.agg(lambda x: {"number": 1}) + + expected = DataFrame( + {"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]} + ) + expected["B"] = expected["B"].astype("object") + expected.set_index("A", inplace=True) + + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype From 932d7376188bd7f77346d5f51da2ce58a4ada742 Mon Sep 17 00:00:00 2001 From: Kei Date: Fri, 5 Apr 2024 14:19:01 +0800 Subject: [PATCH 06/33] Change pyarrow to optional import in agg_series() method --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4bd0df7718a62..b96ffd69ca1ac 
100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,7 +18,6 @@ ) import numpy as np -import pyarrow as pa from pandas._libs import ( NaT, @@ -937,6 +936,8 @@ def agg_series( and npvalues.dtype != np.dtype("object") and npvalues.dtype != np.dtype("complex128") ): + import pyarrow as pa + pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) out = pd_array(npvalues, dtype=pandas_pyarrow_dtype) From 82ddeb52ba8b8b5498e97c1c5ffc93eac21a5b40 Mon Sep 17 00:00:00 2001 From: Kei Date: Fri, 5 Apr 2024 14:32:14 +0800 Subject: [PATCH 07/33] Seperate tests --- .../tests/groupby/aggregate/test_aggregate.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 195d74c2b835b..6c94c1a73cd38 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1883,10 +1883,9 @@ def test_agg_lambda_pyarrow_to_same_data_type(): assert result["B"].dtype == expected["B"].dtype -def test_agg_lambda_pyarrow_to_data_type_conversion(): - # test numpy datatype conversion back to pyarrow datatype +def test_agg_lambda_float64_pyarrow_dtype_conversion(): + # test numpy dtype conversion back to pyarrow dtype # complexes, floats, ints, uints, object - # float64 df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("float64[pyarrow]") gb = df.groupby("A") @@ -1899,7 +1898,9 @@ def test_agg_lambda_pyarrow_to_data_type_conversion(): tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype - # complex128 + +def test_agg_lambda_complex128_pyarrow_dtype_conversion(): + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") result = gb.agg(lambda x: complex(x.sum(), x.count())) @@ -1916,7 +1917,8 @@ def 
test_agg_lambda_pyarrow_to_data_type_conversion(): tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype - # int64 + +def test_agg_lambda_int64_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") @@ -1929,7 +1931,8 @@ def test_agg_lambda_pyarrow_to_data_type_conversion(): tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype - # uint64 + +def test_agg_lambda_uint64_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("uint64[pyarrow]") gb = df.groupby("A") @@ -1939,7 +1942,11 @@ def test_agg_lambda_pyarrow_to_data_type_conversion(): expected["B"] = expected["B"].astype("int64[pyarrow]") expected.set_index("A", inplace=True) - # uint64 casted + tm.assert_frame_equal(result, expected) + assert result["B"].dtype == expected["B"].dtype + + +def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("uint64[pyarrow]") gb = df.groupby("A") @@ -1952,7 +1959,8 @@ def test_agg_lambda_pyarrow_to_data_type_conversion(): tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype - # bool + +def test_agg_lambda_bool_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("bool[pyarrow]") gb = df.groupby("A") @@ -1965,7 +1973,8 @@ def test_agg_lambda_pyarrow_to_data_type_conversion(): tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype - # object + +def test_agg_lambda_object_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") From a54bf588459f0d4d1e41a8077eef2a3ce47cda0c Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 8 Apr 2024 
23:25:31 +0800 Subject: [PATCH 08/33] Revert to old implementation --- pandas/_libs/lib.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index fdb9eb93181d1..a2205454a5a46 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2546,7 +2546,7 @@ def maybe_convert_objects(ndarray[object] objects, if not convert_non_numeric: seen.object_ = True break - elif util.is_nan(val) or is_matching_na(val, C_NA): + elif util.is_nan(val): seen.nan_ = True mask[i] = True if util.is_complex_object(val): @@ -2555,8 +2555,6 @@ def maybe_convert_objects(ndarray[object] objects, seen.complex_ = True if not convert_numeric: break - elif is_matching_na(val, C_NA): - floats[i] = complexes[i] = fnan else: floats[i] = complexes[i] = val elif util.is_bool_object(val): From 64330f0c3f5ed9b89cc21531e400525ed0915c0b Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 8 Apr 2024 23:29:41 +0800 Subject: [PATCH 09/33] Update implementation to use pyarrow array method --- pandas/core/dtypes/cast.py | 36 ++++++++++++++++++++++++++++++++++++ pandas/core/groupby/ops.py | 18 ++++-------------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a130983337f64..3b774231e111d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,6 +478,42 @@ def maybe_cast_pointwise_result( return result +def maybe_cast_to_pyarrow_dtype( + result: ArrayLike, converted_result: ArrayLike +) -> ArrayLike: + """ + Try casting result of a pointwise operation to its pyarrow dtype if + appropriate. + + Parameters + ---------- + result : array-like + Result to cast. + + Returns + ------- + result : array-like + result maybe casted to the dtype. 
+ """ + try: + import pyarrow as pa + from pyarrow import ( + ArrowInvalid, + ArrowNotImplementedError, + ) + + from pandas.core.construction import array as pd_array + + result[isna(result)] = np.nan + pyarrow_result = pa.array(result) + pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) + result = pd_array(result, dtype=pandas_pyarrow_dtype) + except (ArrowNotImplementedError, ArrowInvalid): + return converted_result + + return result + + def _maybe_cast_to_extension_array( cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None ) -> ArrayLike: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b96ffd69ca1ac..1212a601ac756 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,6 +36,7 @@ from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, + maybe_cast_to_pyarrow_dtype, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -45,7 +46,6 @@ ensure_uint64, is_1d_only_ea_dtype, ) -from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, @@ -53,7 +53,6 @@ from pandas.core.arrays import Categorical from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.construction import array as pd_array from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -927,21 +926,12 @@ def agg_series( preserve_dtype = True result = self._aggregate_series_pure_python(obj, func) - npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) - elif ( - isinstance(obj._values, ArrowExtensionArray) - and npvalues.dtype != np.dtype("object") - and npvalues.dtype != np.dtype("complex128") - ): - import pyarrow as pa - - pyarrow_dtype = pa.from_numpy_dtype(npvalues.dtype) - pandas_pyarrow_dtype = ArrowDtype(pyarrow_dtype) - out = pd_array(npvalues, 
dtype=pandas_pyarrow_dtype) - + elif isinstance(obj._values, ArrowExtensionArray): + out = maybe_cast_to_pyarrow_dtype(result, npvalues) else: out = npvalues return out From 06477111fd78df967d156b22561bf5e8191bb9a1 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 8 Apr 2024 23:31:11 +0800 Subject: [PATCH 10/33] Update test_aggregate tests --- pandas/tests/groupby/aggregate/test_aggregate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6c94c1a73cd38..4a4dcb5386fef 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1983,7 +1983,11 @@ def test_agg_lambda_object_pyarrow_dtype_conversion(): expected = DataFrame( {"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]} ) - expected["B"] = expected["B"].astype("object") + import pyarrow as pa + + pyarrow_type = pa.struct({"number": pa.int64()}) + pandas_pyarrow_dtype = pd.ArrowDtype(pyarrow_type) + expected["B"] = expected["B"].astype(pandas_pyarrow_dtype) expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) From affde380011c7645d442b83b63e1abeb32bb7e67 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 8 Apr 2024 23:56:28 +0800 Subject: [PATCH 11/33] Move pyarrow import to top of method --- pandas/tests/groupby/aggregate/test_aggregate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4a4dcb5386fef..11790fd8a1fab 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1975,6 +1975,8 @@ def test_agg_lambda_bool_pyarrow_dtype_conversion(): def test_agg_lambda_object_pyarrow_dtype_conversion(): + import pyarrow as pa + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = 
df["B"].astype("int64[pyarrow]") gb = df.groupby("A") @@ -1983,7 +1985,6 @@ def test_agg_lambda_object_pyarrow_dtype_conversion(): expected = DataFrame( {"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]} ) - import pyarrow as pa pyarrow_type = pa.struct({"number": pa.int64()}) pandas_pyarrow_dtype = pd.ArrowDtype(pyarrow_type) From 842f561d6f318a916f85379e7149ac96b6006dc3 Mon Sep 17 00:00:00 2001 From: Kei Date: Fri, 12 Apr 2024 13:36:37 +0800 Subject: [PATCH 12/33] Update according to pr comments --- pandas/core/dtypes/cast.py | 15 ++++++++++----- pandas/core/groupby/ops.py | 10 ++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3b774231e111d..a27b197abf5d4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,9 +478,7 @@ def maybe_cast_pointwise_result( return result -def maybe_cast_to_pyarrow_dtype( - result: ArrayLike, converted_result: ArrayLike -) -> ArrayLike: +def maybe_cast_to_pyarrow_dtype(result: ArrayLike) -> ArrayLike: """ Try casting result of a pointwise operation to its pyarrow dtype if appropriate. 
@@ -499,6 +497,7 @@ def maybe_cast_to_pyarrow_dtype( import pyarrow as pa from pyarrow import ( ArrowInvalid, + ArrowMemoryError, ArrowNotImplementedError, ) @@ -508,8 +507,14 @@ def maybe_cast_to_pyarrow_dtype( pyarrow_result = pa.array(result) pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) result = pd_array(result, dtype=pandas_pyarrow_dtype) - except (ArrowNotImplementedError, ArrowInvalid): - return converted_result + except ( + ArrowNotImplementedError, + ArrowInvalid, + ArrowMemoryError, + TypeError, + ValueError, + ): + result = lib.maybe_convert_objects(result, try_float=False) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1212a601ac756..80f4a120ae981 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -916,6 +916,11 @@ def agg_series( np.ndarray or ExtensionArray """ + result = self._aggregate_series_pure_python(obj, func) + if isinstance(obj._values, ArrowExtensionArray): + out = maybe_cast_to_pyarrow_dtype(result) + return out + if not isinstance(obj._values, np.ndarray) and not isinstance( obj._values, ArrowExtensionArray ): @@ -925,15 +930,12 @@ def agg_series( # is sufficiently strict that it casts appropriately. 
preserve_dtype = True - result = self._aggregate_series_pure_python(obj, func) npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) - elif isinstance(obj._values, ArrowExtensionArray): - out = maybe_cast_to_pyarrow_dtype(result, npvalues) else: out = npvalues + return out @final From 6f35c0e49cec0ccbbc8d2b8e7d0009ad1bebcf19 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 22:48:11 +0800 Subject: [PATCH 13/33] Fallback convert to input dtype is output is all nan or empty array --- pandas/core/dtypes/cast.py | 10 +++++++--- pandas/core/groupby/ops.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a27b197abf5d4..9b87d7be6420f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,7 +478,7 @@ def maybe_cast_pointwise_result( return result -def maybe_cast_to_pyarrow_dtype(result: ArrayLike) -> ArrayLike: +def maybe_cast_to_pyarrow_dtype(result: ArrayLike, obj_dtype: Dtype) -> ArrayLike: """ Try casting result of a pointwise operation to its pyarrow dtype if appropriate. 
@@ -504,8 +504,12 @@ def maybe_cast_to_pyarrow_dtype(result: ArrayLike) -> ArrayLike: from pandas.core.construction import array as pd_array result[isna(result)] = np.nan - pyarrow_result = pa.array(result) - pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) + if result.size == 0 or all(isna(result)): + pandas_pyarrow_dtype = obj_dtype + else: + pyarrow_result = pa.array(result) + pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) + result = pd_array(result, dtype=pandas_pyarrow_dtype) except ( ArrowNotImplementedError, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fb089c86f5f1c..a20f0ae4b7dfc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -918,7 +918,7 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) if isinstance(obj._values, ArrowExtensionArray): - out = maybe_cast_to_pyarrow_dtype(result) + out = maybe_cast_to_pyarrow_dtype(result, obj.dtype) return out if not isinstance(obj._values, np.ndarray) and not isinstance( From abd0adfeb6d06fc70b6eb75bc69488262eecc46c Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 21 Apr 2024 00:10:24 +0800 Subject: [PATCH 14/33] Strip na values when inferring pyarrow dtype --- pandas/core/dtypes/cast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9b87d7be6420f..eee25f7881848 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -503,11 +503,11 @@ def maybe_cast_to_pyarrow_dtype(result: ArrayLike, obj_dtype: Dtype) -> ArrayLik from pandas.core.construction import array as pd_array - result[isna(result)] = np.nan - if result.size == 0 or all(isna(result)): + stripped_result = result[~isna(result)] + if result.size == 0 or all(isna(stripped_result)): pandas_pyarrow_dtype = obj_dtype else: - pyarrow_result = pa.array(result) + pyarrow_result = pa.array(stripped_result) pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) result = 
pd_array(result, dtype=pandas_pyarrow_dtype) From bebc442b10f2fafcc58ee343fb8d021b6ad3f33d Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 21 Apr 2024 00:19:07 +0800 Subject: [PATCH 15/33] Update tests to check expected inferred dtype instead of input dtype --- pandas/tests/extension/base/groupby.py | 10 ++++++++-- pandas/tests/extension/test_arrow.py | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..583b79c2d7afa 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -58,12 +58,18 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = pd.DataFrame({"B": uniques, "A": exp_vals}) tm.assert_frame_equal(result, expected) - def test_groupby_agg_extension(self, data_for_grouping): + def test_groupby_agg_extension( + self, data_for_grouping, expected_inferred_result_dtype + ): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - expected = df.iloc[[0, 2, 4, 7]] + expected_df = pd.DataFrame( + {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} + ) + expected = expected_df.iloc[[0, 2, 4, 7]] expected = expected.set_index("A") + expected["B"] = expected["B"].astype(expected_inferred_result_dtype) result = df.groupby("A").agg({"B": "first"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9b2251d0b7d4a..50235bac6db45 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -225,6 +225,25 @@ def data_for_grouping(dtype): return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) +@pytest.fixture +def expected_inferred_result_dtype(dtype): + """ + When the data pass through aggregate, + the inferred data type that it will become + + """ + + pa_dtype 
= dtype.pyarrow_dtype + if pa.types.is_date(pa_dtype): + return "date32[day][pyarrow]" + elif pa.types.is_time(pa_dtype): + return "time64[us][pyarrow]" + elif pa.types.is_decimal(pa_dtype): + return ArrowDtype(pa.decimal128(4, 3)) + else: + return dtype + + @pytest.fixture def data_for_sorting(data_for_grouping): """ From bb6343b5c1fb6e7ca9217f28f4f770eec0d4c983 Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 21 Apr 2024 16:14:46 +0800 Subject: [PATCH 16/33] Override test case for test_arrow.py --- pandas/tests/extension/base/groupby.py | 10 ++-------- pandas/tests/extension/test_arrow.py | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 583b79c2d7afa..bab8566a06dc2 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -58,18 +58,12 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = pd.DataFrame({"B": uniques, "A": exp_vals}) tm.assert_frame_equal(result, expected) - def test_groupby_agg_extension( - self, data_for_grouping, expected_inferred_result_dtype - ): + def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - expected_df = pd.DataFrame( - {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} - ) - expected = expected_df.iloc[[0, 2, 4, 7]] + expected = df.iloc[[0, 2, 4, 7]] expected = expected.set_index("A") - expected["B"] = expected["B"].astype(expected_inferred_result_dtype) result = df.groupby("A").agg({"B": "first"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 50235bac6db45..1283301eeb5e2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -225,7 +225,6 @@ def 
data_for_grouping(dtype): return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) -@pytest.fixture def expected_inferred_result_dtype(dtype): """ When the data pass through aggregate, @@ -1144,6 +1143,27 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + expected_df = pd.DataFrame( + {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} + ) + expected = expected_df.iloc[[0, 2, 4, 7]] + expected = expected.set_index("A") + expected_dtype = expected_inferred_result_dtype(expected["B"].dtype) + expected["B"] = expected["B"].astype(expected_dtype) + + result = df.groupby("A").agg({"B": "first"}) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + tm.assert_frame_equal(result, expected) + class TestLogicalOps: """Various Series and DataFrame logical ops methods.""" From 6dc40f5e5d6f457b93b84fac8306edb12746ac74 Mon Sep 17 00:00:00 2001 From: Kei Date: Sun, 21 Apr 2024 16:46:38 +0800 Subject: [PATCH 17/33] Empty commit to trigger build run From 4ef96f7301e0a445fb448cbfa495981109f7206c Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 24 Apr 2024 01:28:35 +0800 Subject: [PATCH 18/33] In agg series, convert to np values, then cast to pyarrow dtype, account for missing pyarrow dtypes --- pandas/core/dtypes/cast.py | 78 ++++++++++++++++++++++++-------------- pandas/core/groupby/ops.py | 6 +-- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eee25f7881848..d622e319b756c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,10 +478,11 @@ def 
maybe_cast_pointwise_result( return result -def maybe_cast_to_pyarrow_dtype(result: ArrayLike, obj_dtype: Dtype) -> ArrayLike: +def maybe_cast_to_pyarrow_result(result: ArrayLike) -> ArrayLike: """ - Try casting result of a pointwise operation to its pyarrow dtype if - appropriate. + Try casting result of a pointwise operation to its pyarrow dtype + and arrow extension array if appropriate. If not possible, + returns np.ndarray. Parameters ---------- @@ -493,34 +494,20 @@ def maybe_cast_to_pyarrow_dtype(result: ArrayLike, obj_dtype: Dtype) -> ArrayLik result : array-like result maybe casted to the dtype. """ - try: - import pyarrow as pa - from pyarrow import ( - ArrowInvalid, - ArrowMemoryError, - ArrowNotImplementedError, - ) + from pandas.core.construction import array as pd_array - from pandas.core.construction import array as pd_array - - stripped_result = result[~isna(result)] - if result.size == 0 or all(isna(stripped_result)): - pandas_pyarrow_dtype = obj_dtype - else: - pyarrow_result = pa.array(stripped_result) - pandas_pyarrow_dtype = ArrowDtype(pyarrow_result.type) + # maybe_convert_objects is unable to detect NA as nan + # (detects it as object instead) + stripped_result = result[~isna(result)] + npvalues = lib.maybe_convert_objects(stripped_result, try_float=False) - result = pd_array(result, dtype=pandas_pyarrow_dtype) - except ( - ArrowNotImplementedError, - ArrowInvalid, - ArrowMemoryError, - TypeError, - ValueError, - ): - result = lib.maybe_convert_objects(result, try_float=False) + try: + dtype = convert_dtypes(npvalues, dtype_backend="pyarrow") + out = pd_array(result, dtype=dtype) + except (TypeError, ValueError, np.ComplexWarning): + out = npvalues - return result + return out def _maybe_cast_to_extension_array( @@ -1080,6 +1067,7 @@ def convert_dtypes( inferred_dtype = lib.infer_dtype(input_array) else: inferred_dtype = input_array.dtype + orig_inferred_dtype = inferred_dtype if is_string_dtype(inferred_dtype): if not convert_string or 
inferred_dtype == "bytes": @@ -1177,7 +1165,8 @@ def convert_dtypes( elif isinstance(inferred_dtype, StringDtype): base_dtype = np.dtype(str) else: - base_dtype = inferred_dtype + base_dtype = _infer_pyarrow_dtype(input_array, orig_inferred_dtype) + if ( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 @@ -1188,8 +1177,10 @@ def convert_dtypes( pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) + elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] @@ -1199,6 +1190,35 @@ def convert_dtypes( return inferred_dtype # type: ignore[return-value] +def _infer_pyarrow_dtype( + input_array: ArrayLike, + inferred_dtype: str, +) -> DtypeObj: + if inferred_dtype not in ["time", "date", "decimal", "bytes"]: + return input_array.dtype + + # For a limited set of dtype + # Let pyarrow infer dtype from input_array + import pyarrow as pa + from pyarrow import ( + ArrowInvalid, + ArrowMemoryError, + ArrowNotImplementedError, + ) + + try: + pyarrow_array = pa.array(input_array) + return ArrowDtype(pyarrow_array.type) + except ( + TypeError, + ValueError, + ArrowInvalid, + ArrowMemoryError, + ArrowNotImplementedError, + ): + return input_array.dtype + + def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a20f0ae4b7dfc..1de7b0093bfdf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,7 +36,7 @@ from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, - maybe_cast_to_pyarrow_dtype, + maybe_cast_to_pyarrow_result, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -917,9 +917,9 @@ def agg_series( """ result = self._aggregate_series_pure_python(obj, func) + 
if isinstance(obj._values, ArrowExtensionArray): - out = maybe_cast_to_pyarrow_dtype(result, obj.dtype) - return out + return maybe_cast_to_pyarrow_result(result) if not isinstance(obj._values, np.ndarray) and not isinstance( obj._values, ArrowExtensionArray From c6a98c0e0f4a4cd130aca2cb90f105f8bbf4135e Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 24 Apr 2024 01:32:19 +0800 Subject: [PATCH 19/33] Update tests --- .../tests/groupby/aggregate/test_aggregate.py | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 11790fd8a1fab..d21931be6d760 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1886,46 +1886,48 @@ def test_agg_lambda_pyarrow_to_same_data_type(): def test_agg_lambda_float64_pyarrow_dtype_conversion(): # test numpy dtype conversion back to pyarrow dtype # complexes, floats, ints, uints, object - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]}) df["B"] = df["B"].astype("float64[pyarrow]") gb = df.groupby("A") result = gb.agg(lambda x: x) - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = expected["B"].astype("float64[pyarrow]") + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]}) + expected["B"] = expected["B"].astype("double[pyarrow]") expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype -def test_agg_lambda_complex128_pyarrow_dtype_conversion(): +def test_agg_lambda_int64_pyarrow_dtype_conversion(): + # test numpy dtype conversion back to pyarrow dtype + # complexes, floats, ints, uints, object df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") - 
result = gb.agg(lambda x: complex(x.sum(), x.count())) + result = gb.agg(lambda x: x) - expected = DataFrame( - { - "A": ["c1", "c2", "c3"], - "B": [complex(100, 1), complex(200, 1), complex(255, 1)], - } - ) - expected["B"] = expected["B"].astype("complex128") + expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) + expected["B"] = expected["B"].astype("int64[pyarrow]") expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) assert result["B"].dtype == expected["B"].dtype -def test_agg_lambda_int64_pyarrow_dtype_conversion(): +def test_agg_lambda_complex128_pyarrow_dtype_conversion(): df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") - result = gb.agg(lambda x: x) + result = gb.agg(lambda x: complex(x.sum(), x.count())) - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = expected["B"].astype("int64[pyarrow]") + expected = DataFrame( + { + "A": ["c1", "c2", "c3"], + "B": [complex(100, 1), complex(200, 1), complex(255, 1)], + } + ) + expected["B"] = expected["B"].astype("complex128") expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) @@ -1975,8 +1977,6 @@ def test_agg_lambda_bool_pyarrow_dtype_conversion(): def test_agg_lambda_object_pyarrow_dtype_conversion(): - import pyarrow as pa - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") @@ -1986,9 +1986,7 @@ def test_agg_lambda_object_pyarrow_dtype_conversion(): {"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]} ) - pyarrow_type = pa.struct({"number": pa.int64()}) - pandas_pyarrow_dtype = pd.ArrowDtype(pyarrow_type) - expected["B"] = expected["B"].astype(pandas_pyarrow_dtype) + expected["B"] = expected["B"].astype("object") expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) From 
9181eaf76e846117db14a5f75a3499d0b87d9739 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 20:37:03 +0800 Subject: [PATCH 20/33] Update rst docs --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8618d7d525771..a8653cc76fae1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -430,6 +430,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` that was returning numpy dtype values when input values are pyarrow dtype values, instead of returning pyarrow dtype values. (:issue:`53030`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) From 612d7d08e3144532c59131ed3bc74c8dbdfe6496 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 23:30:10 +0800 Subject: [PATCH 21/33] Update impl to fix tests --- pandas/core/dtypes/cast.py | 53 +++++++++++++++++++++----------------- pandas/core/groupby/ops.py | 2 +- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d622e319b756c..7b5293ebf766b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,7 +478,7 @@ def maybe_cast_pointwise_result( return result -def maybe_cast_to_pyarrow_result(result: ArrayLike) -> ArrayLike: +def maybe_cast_to_pyarrow_result(result: ArrayLike, obj_dtype: DtypeObj) -> ArrayLike: """ Try casting result of a pointwise operation to its pyarrow dtype and arrow extension array if appropriate. If not possible, @@ -501,12 +501,14 @@ def maybe_cast_to_pyarrow_result(result: ArrayLike) -> ArrayLike: stripped_result = result[~isna(result)] npvalues = lib.maybe_convert_objects(stripped_result, try_float=False) + if stripped_result.size == 0: + return maybe_cast_pointwise_result(npvalues, obj_dtype, numeric_only=True) + try: dtype = convert_dtypes(npvalues, dtype_backend="pyarrow") out = pd_array(result, dtype=dtype) except (TypeError, ValueError, np.ComplexWarning): out = npvalues - return out @@ -1194,29 +1196,34 @@ def _infer_pyarrow_dtype( input_array: ArrayLike, inferred_dtype: str, ) -> DtypeObj: - if inferred_dtype not in ["time", "date", "decimal", "bytes"]: - return input_array.dtype - - # For a limited set of dtype - # Let pyarrow infer dtype from input_array import pyarrow as pa - from pyarrow import ( - ArrowInvalid, - ArrowMemoryError, - ArrowNotImplementedError, - ) - try: - pyarrow_array = pa.array(input_array) - return ArrowDtype(pyarrow_array.type) - except ( - TypeError, - ValueError, - ArrowInvalid, - ArrowMemoryError, - ArrowNotImplementedError, - ): - return input_array.dtype + if inferred_dtype == 
"date": + return ArrowDtype(pa.date32()) + elif inferred_dtype == "time": + return ArrowDtype(pa.time64("us")) + elif inferred_dtype == "bytes": + return ArrowDtype(pa.binary()) + elif inferred_dtype == "decimal": + from pyarrow import ( + ArrowInvalid, + ArrowMemoryError, + ArrowNotImplementedError, + ) + + try: + pyarrow_array = pa.array(input_array) + return ArrowDtype(pyarrow_array.type) + except ( + TypeError, + ValueError, + ArrowInvalid, + ArrowMemoryError, + ArrowNotImplementedError, + ): + return input_array.dtype + + return input_array.dtype def maybe_infer_to_datetimelike( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1de7b0093bfdf..97e431ee48f86 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -919,7 +919,7 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) if isinstance(obj._values, ArrowExtensionArray): - return maybe_cast_to_pyarrow_result(result) + return maybe_cast_to_pyarrow_result(result, obj.dtype) if not isinstance(obj._values, np.ndarray) and not isinstance( obj._values, ArrowExtensionArray From 3b6696b1e2ebf7ce57007f6ae687f4227838ca10 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 23:59:55 +0800 Subject: [PATCH 22/33] Declare variable in outer scope --- pandas/core/dtypes/cast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7b5293ebf766b..2986b7c171adc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1061,6 +1061,7 @@ def convert_dtypes( np.dtype, or ExtensionDtype """ inferred_dtype: str | DtypeObj + orig_inferred_dtype = None if ( convert_string or convert_integer or convert_boolean or convert_floating From 680e2387c870137314ff5f37164921df5a2242e5 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 29 Apr 2024 14:56:37 +0800 Subject: [PATCH 23/33] Update impl to use maybe_cast_pointwise_result instead of maybe_cast_to_pyarrow_array --- pandas/core/dtypes/cast.py | 
75 +------------------------------------- pandas/core/groupby/ops.py | 21 ++++++----- 2 files changed, 13 insertions(+), 83 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2986b7c171adc..a130983337f64 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -478,40 +478,6 @@ def maybe_cast_pointwise_result( return result -def maybe_cast_to_pyarrow_result(result: ArrayLike, obj_dtype: DtypeObj) -> ArrayLike: - """ - Try casting result of a pointwise operation to its pyarrow dtype - and arrow extension array if appropriate. If not possible, - returns np.ndarray. - - Parameters - ---------- - result : array-like - Result to cast. - - Returns - ------- - result : array-like - result maybe casted to the dtype. - """ - from pandas.core.construction import array as pd_array - - # maybe_convert_objects is unable to detect NA as nan - # (detects it as object instead) - stripped_result = result[~isna(result)] - npvalues = lib.maybe_convert_objects(stripped_result, try_float=False) - - if stripped_result.size == 0: - return maybe_cast_pointwise_result(npvalues, obj_dtype, numeric_only=True) - - try: - dtype = convert_dtypes(npvalues, dtype_backend="pyarrow") - out = pd_array(result, dtype=dtype) - except (TypeError, ValueError, np.ComplexWarning): - out = npvalues - return out - - def _maybe_cast_to_extension_array( cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None ) -> ArrayLike: @@ -1061,7 +1027,6 @@ def convert_dtypes( np.dtype, or ExtensionDtype """ inferred_dtype: str | DtypeObj - orig_inferred_dtype = None if ( convert_string or convert_integer or convert_boolean or convert_floating @@ -1070,7 +1035,6 @@ def convert_dtypes( inferred_dtype = lib.infer_dtype(input_array) else: inferred_dtype = input_array.dtype - orig_inferred_dtype = inferred_dtype if is_string_dtype(inferred_dtype): if not convert_string or inferred_dtype == "bytes": @@ -1168,8 +1132,7 @@ def convert_dtypes( elif 
isinstance(inferred_dtype, StringDtype): base_dtype = np.dtype(str) else: - base_dtype = _infer_pyarrow_dtype(input_array, orig_inferred_dtype) - + base_dtype = inferred_dtype if ( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 @@ -1180,10 +1143,8 @@ def convert_dtypes( pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) - if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) - elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] @@ -1193,40 +1154,6 @@ def convert_dtypes( return inferred_dtype # type: ignore[return-value] -def _infer_pyarrow_dtype( - input_array: ArrayLike, - inferred_dtype: str, -) -> DtypeObj: - import pyarrow as pa - - if inferred_dtype == "date": - return ArrowDtype(pa.date32()) - elif inferred_dtype == "time": - return ArrowDtype(pa.time64("us")) - elif inferred_dtype == "bytes": - return ArrowDtype(pa.binary()) - elif inferred_dtype == "decimal": - from pyarrow import ( - ArrowInvalid, - ArrowMemoryError, - ArrowNotImplementedError, - ) - - try: - pyarrow_array = pa.array(input_array) - return ArrowDtype(pyarrow_array.type) - except ( - TypeError, - ValueError, - ArrowInvalid, - ArrowMemoryError, - ArrowNotImplementedError, - ): - return input_array.dtype - - return input_array.dtype - - def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 97e431ee48f86..f0041a039cc88 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, - maybe_cast_to_pyarrow_result, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -46,6 +45,7 @@ ensure_uint64, is_1d_only_ea_dtype, ) +from pandas.core.dtypes.dtypes import ArrowDtype 
from pandas.core.dtypes.missing import ( isna, maybe_fill, @@ -917,21 +917,24 @@ def agg_series( """ result = self._aggregate_series_pure_python(obj, func) + npvalues = lib.maybe_convert_objects(result, try_float=False) if isinstance(obj._values, ArrowExtensionArray): - return maybe_cast_to_pyarrow_result(result, obj.dtype) + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True, same_dtype=False + ) + import pyarrow as pa + + if isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( + out.dtype.pyarrow_dtype + ): + out = npvalues - if not isinstance(obj._values, np.ndarray) and not isinstance( - obj._values, ArrowExtensionArray - ): + elif not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. - preserve_dtype = True - - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues From 3a8597e033c3cb7f2c1ec127f0bb840613500c36 Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 29 Apr 2024 16:29:25 +0800 Subject: [PATCH 24/33] Fix tests with nested array --- pandas/tests/groupby/test_groupby.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 54d7895691f3f..4764bcb64fd0c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -28,6 +28,7 @@ ) import pandas._testing as tm from pandas.core.arrays import BooleanArray +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics import pandas.core.common as com pytestmark = pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning") @@ -2476,9 +2477,14 @@ def 
test_by_column_values_with_same_starting_value(dtype): "Mood": [["happy", "sad"], "happy"], "Credit": [2500, 900], "Name": ["Thomas", "Thomas John"], - } + }, ).set_index("Name") + if dtype == "string[pyarrow_numpy]": + import pyarrow as pa + mood_values = ArrowStringArrayNumpySemantics(pa.array(["happy", "sad"])) + expected_result["Mood"] = [mood_values, "happy"] + expected_result["Mood"] = expected_result["Mood"].astype(dtype) tm.assert_frame_equal(result, expected_result) From 6496b15caf4a7fa3a526958f01125274ef9fb5e9 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 13:22:29 +0800 Subject: [PATCH 25/33] Update according to pr comments --- pandas/tests/extension/test_arrow.py | 29 +- .../tests/groupby/aggregate/test_aggregate.py | 324 +++--------------- 2 files changed, 52 insertions(+), 301 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1283301eeb5e2..0f5c2d1ec6199 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -225,24 +225,6 @@ def data_for_grouping(dtype): return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) -def expected_inferred_result_dtype(dtype): - """ - When the data pass through aggregate, - the inferred data type that it will become - - """ - - pa_dtype = dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype): - return "date32[day][pyarrow]" - elif pa.types.is_time(pa_dtype): - return "time64[us][pyarrow]" - elif pa.types.is_decimal(pa_dtype): - return ArrowDtype(pa.decimal128(4, 3)) - else: - return dtype - - @pytest.fixture def data_for_sorting(data_for_grouping): """ @@ -1147,6 +1129,17 @@ def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + def expected_inferred_result_dtype(dtype): + pa_dtype = dtype.pyarrow_dtype + if pa.types.is_date(pa_dtype): + return "date32[day][pyarrow]" + 
elif pa.types.is_time(pa_dtype): + return "time64[us][pyarrow]" + elif pa.types.is_decimal(pa_dtype): + return ArrowDtype(pa.decimal128(4, 3)) + else: + return dtype + expected_df = pd.DataFrame( {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} ) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d21931be6d760..8a4356ee8534d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1665,329 +1665,87 @@ def func(x): df.groupby("A", observed=False).agg(func) -def test_agg_simple_lambda_numpy_to_same_data_type(): - df = DataFrame( - {"A": [1, 3, 100, 3, 100, 100], "B": [False, False, False, False, False, True]} - ) - df["B"] = df["B"].astype("bool") - gb = df.groupby("A") - result = gb.agg(lambda x: x.max()) - - expected = DataFrame({"A": [1, 3, 100], "B": [False, False, True]}) - expected["B"] = expected["B"].astype("bool") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_simple_lambda_pyarrow_to_same_data_type(): - df = DataFrame( - {"A": [1, 3, 100, 3, 100, 100], "B": [False, False, False, False, False, True]} - ) - df["B"] = df["B"].astype("bool[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x.max()) - - expected = DataFrame({"A": [1, 3, 100], "B": [False, False, True]}) - expected["B"] = expected["B"].astype("bool[pyarrow]") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_simple_lambda_numpy_to_diff_data_type(): - df = DataFrame( - {"A": [1, 3, 100, 3, 100, 100], "B": [False, True, True, False, False, True]} - ) - df["B"] = df["B"].astype("bool") - gb = df.groupby("A") - result = gb.agg(lambda x: x.sum()) - - expected = DataFrame({"A": [1, 3, 100], "B": [0, 1, 2]}) - expected["B"] = expected["B"].astype("int64") - 
expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_simple_lambda_pyarrow_to_diff_data_type(): - df = DataFrame( - {"A": [1, 3, 100, 3, 100, 100], "B": [False, True, True, False, False, True]} - ) - df["B"] = df["B"].astype("bool[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x.sum()) - - expected = DataFrame({"A": [1, 3, 100], "B": [0, 1, 2]}) - expected["B"] = expected["B"].astype("int64[pyarrow]") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_numpy_to_diff_data_type(): +@pytest.mark.parametrize( + "input_dtype, output_dtype", + [ + ("float[pyarrow]", "double[pyarrow]"), + ("int64[pyarrow]", "int64[pyarrow]"), + ("uint64[pyarrow]", "int64[pyarrow]"), + ("bool[pyarrow]", "bool[pyarrow]"), + ], +) +def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): + # GH#53030 + # test numpy dtype conversion back to pyarrow dtype + # complexes, floats, ints, uints, object df = DataFrame( { - "A": [ - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat21", - "cat21", - "cat18", - ], - "B": [37, 4958, -4839, 85943, 5490, 1, 0, 945, -943049, -132, 3], + "A": ["c1", "c2", "c3", "c1", "c2", "c3"], + "B": pd.array([100, 200, 255, 0, 199, 40392], dtype=input_dtype), } ) - df["B"] = df["B"].astype("int32") gb = df.groupby("A") - result = gb.agg(lambda x: (x.sum() / x.count()) + x.max() - 3 + 5) + result = gb.agg(lambda x: x.min()) expected = DataFrame( - { - "A": ["cat18", "cat21", "cat39403"], - "B": [8.0, -152216.83333333334, 109048.75], - } + {"B": pd.array([0, 199, 255], dtype=output_dtype)}, + index=Index(["c1", "c2", "c3"], name="A"), ) - expected["B"] = expected["B"].astype("float64") - expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) - assert result["B"].dtype 
== expected["B"].dtype -def test_agg_lambda_pyarrow_to_diff_data_type(): +def test_agg_lambda_complex128_dtype_conversion(): + # GH#53030 df = DataFrame( - { - "A": [ - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat21", - "cat21", - "cat18", - ], - "B": [37, 4958, -4839, 85943, 5490, 1, 0, 945, -943049, -132, 3], - } + {"A": ["c1", "c2", "c3"], "B": pd.array([100, 200, 255], "int64[pyarrow]")} ) - df["B"] = df["B"].astype("int32[pyarrow]") gb = df.groupby("A") - result = gb.agg(lambda x: (x.sum() / x.count()) + x.max() - 3 + 5) + result = gb.agg(lambda x: complex(x.sum(), x.count())) expected = DataFrame( { - "A": ["cat18", "cat21", "cat39403"], - "B": [8.0, -152216.83333333334, 109048.75], - } + "B": pd.array( + [complex(100, 1), complex(200, 1), complex(255, 1)], dtype="complex128" + ), + }, + index=Index(["c1", "c2", "c3"], name="A"), ) - expected["B"] = expected["B"].astype("double[pyarrow]") - expected.set_index("A", inplace=True) tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype -def test_agg_lambda_numpy_to_same_data_type(): +def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): + # GH#53030 df = DataFrame( { - "A": [ - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat21", - "cat21", - "cat18", - ], - "B": [ - 37.0, - 4958.0, - -4839.0, - 85943.0, - 5490.0, - 1.0, - 0.0, - 945.0, - -943049.0, - -132.0, - 3.0, - ], + "A": ["c1", "c2", "c3"], + "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"), } ) - df["B"] = df["B"].astype("float64") gb = df.groupby("A") - result = gb.agg(lambda x: x.std() / x.var() * 10 / 3 - 32 + 3) + result = gb.agg(lambda x: np.uint64(x.sum())) expected = DataFrame( - {"A": ["cat18", "cat21", "cat39403"], "B": [np.nan, -28.999991, -28.999921]} - ) - expected["B"] = expected["B"].astype("float64") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, 
expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_pyarrow_to_same_data_type(): - df = DataFrame( { - "A": [ - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat39403", - "cat21", - "cat21", - "cat39403", - "cat21", - "cat21", - "cat18", - ], - "B": [ - 37.0, - 4958.0, - -4839.0, - 85943.0, - 5490.0, - 1.0, - 0.0, - 945.0, - -943049.0, - -132.0, - 3.0, - ], - } - ) - df["B"] = df["B"].astype("double[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x.std() / x.var() * 10 / 3 - 32 + 3) - - expected = DataFrame( - {"A": ["cat18", "cat21", "cat39403"], "B": [np.nan, -28.999991, -28.999921]} + "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"), + }, + index=Index(["c1", "c2", "c3"], name="A"), ) - expected["B"] = expected["B"].astype("double[pyarrow]") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_float64_pyarrow_dtype_conversion(): - # test numpy dtype conversion back to pyarrow dtype - # complexes, floats, ints, uints, object - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]}) - df["B"] = df["B"].astype("float64[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x) - - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]}) - expected["B"] = expected["B"].astype("double[pyarrow]") - expected.set_index("A", inplace=True) - - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_int64_pyarrow_dtype_conversion(): - # test numpy dtype conversion back to pyarrow dtype - # complexes, floats, ints, uints, object - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - df["B"] = df["B"].astype("int64[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x) - - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = 
expected["B"].astype("int64[pyarrow]") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype -def test_agg_lambda_complex128_pyarrow_dtype_conversion(): - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - df["B"] = df["B"].astype("int64[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: complex(x.sum(), x.count())) - - expected = DataFrame( +def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion(): + # GH#53030 + df = DataFrame( { "A": ["c1", "c2", "c3"], - "B": [complex(100, 1), complex(200, 1), complex(255, 1)], + "B": pd.array([100, 200, 255], dtype="int64[pyarrow]"), } ) - expected["B"] = expected["B"].astype("complex128") - expected.set_index("A", inplace=True) - - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_uint64_pyarrow_dtype_conversion(): - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - df["B"] = df["B"].astype("uint64[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x) - - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = expected["B"].astype("int64[pyarrow]") - expected.set_index("A", inplace=True) - - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - df["B"] = df["B"].astype("uint64[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: np.uint64(x.sum())) - - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = expected["B"].astype("uint64[pyarrow]") - expected.set_index("A", inplace=True) - - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_bool_pyarrow_dtype_conversion(): - df = DataFrame({"A": ["c1", "c2", "c3"], "B": 
[100, 200, 255]}) - df["B"] = df["B"].astype("bool[pyarrow]") - gb = df.groupby("A") - result = gb.agg(lambda x: x) - - expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - expected["B"] = expected["B"].astype("bool[pyarrow]") - expected.set_index("A", inplace=True) - - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype - - -def test_agg_lambda_object_pyarrow_dtype_conversion(): - df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]}) - df["B"] = df["B"].astype("int64[pyarrow]") gb = df.groupby("A") result = gb.agg(lambda x: {"number": 1}) expected = DataFrame( - {"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]} + {"B": pd.array([{"number": 1}, {"number": 1}, {"number": 1}], dtype="object")}, + index=Index(["c1", "c2", "c3"], name="A"), ) - - expected["B"] = expected["B"].astype("object") - expected.set_index("A", inplace=True) - tm.assert_frame_equal(result, expected) - assert result["B"].dtype == expected["B"].dtype From e1ccef68b6a0eda3c709deeae81805bc49b38e57 Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 7 May 2024 12:53:13 +0800 Subject: [PATCH 26/33] Preserve_dtype if argument is passed in, else don't preserve --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f0041a039cc88..7877071838c7a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -921,7 +921,7 @@ def agg_series( if isinstance(obj._values, ArrowExtensionArray): out = maybe_cast_pointwise_result( - npvalues, obj.dtype, numeric_only=True, same_dtype=False + npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype ) import pyarrow as pa From a1d73f5348462a0749742a5d4ef4d836e4669ebe Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 7 May 2024 13:07:11 +0800 Subject: [PATCH 27/33] Update tests --- pandas/tests/extension/test_arrow.py | 13 +------------ 1 file changed, 1 
insertion(+), 12 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index cf1e502a723ee..9ff10c17a7cc5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1129,23 +1129,12 @@ def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - def expected_inferred_result_dtype(dtype): - pa_dtype = dtype.pyarrow_dtype - if pa.types.is_date(pa_dtype): - return "date32[day][pyarrow]" - elif pa.types.is_time(pa_dtype): - return "time64[us][pyarrow]" - elif pa.types.is_decimal(pa_dtype): - return ArrowDtype(pa.decimal128(4, 3)) - else: - return dtype - expected_df = pd.DataFrame( {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} ) expected = expected_df.iloc[[0, 2, 4, 7]] expected = expected.set_index("A") - expected_dtype = expected_inferred_result_dtype(expected["B"].dtype) + expected_dtype = expected["B"].dtype expected["B"] = expected["B"].astype(expected_dtype) result = df.groupby("A").agg({"B": "first"}) From fa257b0568dc9deef766ff6ae3a53c155ff1d710 Mon Sep 17 00:00:00 2001 From: ellaella12 Date: Sun, 12 May 2024 15:39:24 +0800 Subject: [PATCH 28/33] Remove redundant tests --- pandas/tests/extension/test_arrow.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9ff10c17a7cc5..7d31fe6085c3a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1125,27 +1125,6 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) - def test_groupby_agg_extension(self, data_for_grouping): - # GH#38980 groupby agg on extension type fails for non-numeric types - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 
3, 1, 4], "B": data_for_grouping}) - - expected_df = pd.DataFrame( - {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping} - ) - expected = expected_df.iloc[[0, 2, 4, 7]] - expected = expected.set_index("A") - expected_dtype = expected["B"].dtype - expected["B"] = expected["B"].astype(expected_dtype) - - result = df.groupby("A").agg({"B": "first"}) - tm.assert_frame_equal(result, expected) - - result = df.groupby("A").agg("first") - tm.assert_frame_equal(result, expected) - - result = df.groupby("A").first() - tm.assert_frame_equal(result, expected) - class TestLogicalOps: """Various Series and DataFrame logical ops methods.""" From 139319acc321c6dbfc7c2b12de1ba5a4630a7241 Mon Sep 17 00:00:00 2001 From: ellaella12 Date: Sun, 12 May 2024 16:41:15 +0800 Subject: [PATCH 29/33] retrigger pipeline From 283eda9bc0d154e1e73c4c208cf6ffea829326ad Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Mar 2025 12:13:28 -0400 Subject: [PATCH 30/33] Rework --- pandas/core/groupby/ops.py | 20 ++++++++++--------- .../tests/groupby/aggregate/test_aggregate.py | 19 +++++++++++++----- pandas/tests/groupby/test_groupby.py | 17 +++++++--------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0e90442f0c108..b1f94fa71eb25 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -44,7 +44,6 @@ ensure_uint64, is_1d_only_ea_dtype, ) -from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, @@ -956,19 +955,22 @@ def agg_series( ------- np.ndarray or ExtensionArray """ - result = self._aggregate_series_pure_python(obj, func) npvalues = lib.maybe_convert_objects(result, try_float=False) if isinstance(obj._values, ArrowExtensionArray): - out = maybe_cast_pointwise_result( - npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype - ) - import pyarrow as pa + from pandas.core.dtypes.common import is_string_dtype - if 
isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( - out.dtype.pyarrow_dtype - ): + if not is_string_dtype(obj.dtype) or is_string_dtype(npvalues): + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype + ) + + # if isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( + # out.dtype.pyarrow_dtype + # ): + # out = npvalues + else: out = npvalues elif not isinstance(obj._values, np.ndarray): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a722013d2316..9e817b2b8fa1e 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,6 +10,7 @@ import pytest from pandas.errors import SpecificationError +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -23,6 +24,7 @@ to_datetime, ) import pandas._testing as tm +from pandas.arrays import ArrowExtensionArray from pandas.core.groupby.grouper import Grouping @@ -1812,6 +1814,9 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): @pytest.mark.parametrize( "input_dtype, output_dtype", [ + # With NumPy arrays, the results from the UDF would be e.g. np.float32 scalars + # which we can therefore preserve. However with PyArrow arrays, the results are + # Python scalars so we have no information about size or uint vs int. 
("float[pyarrow]", "double[pyarrow]"), ("int64[pyarrow]", "int64[pyarrow]"), ("uint64[pyarrow]", "int64[pyarrow]"), @@ -1819,9 +1824,8 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): ], ) def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): - # GH#53030 - # test numpy dtype conversion back to pyarrow dtype - # complexes, floats, ints, uints, object + # GH#59601 + # Test PyArrow dtype conversion back to PyArrow dtype df = DataFrame( { "A": ["c1", "c2", "c3", "c1", "c2", "c3"], @@ -1839,7 +1843,7 @@ def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): def test_agg_lambda_complex128_dtype_conversion(): - # GH#53030 + # GH#59601 df = DataFrame( {"A": ["c1", "c2", "c3"], "B": pd.array([100, 200, 255], "int64[pyarrow]")} ) @@ -1877,8 +1881,11 @@ def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion(): # GH#53030 + import pyarrow as pa + df = DataFrame( { "A": ["c1", "c2", "c3"], @@ -1888,8 +1895,10 @@ def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion(): gb = df.groupby("A") result = gb.agg(lambda x: {"number": 1}) + arr = pa.array([{"number": 1}, {"number": 1}, {"number": 1}]) expected = DataFrame( - {"B": pd.array([{"number": 1}, {"number": 1}, {"number": 1}], dtype="object")}, + {"B": ArrowExtensionArray(arr)}, index=Index(["c1", "c2", "c3"], name="A"), ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 39d8015a9aafd..4955b1fe0da54 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -26,7 +26,6 @@ ) import pandas._testing as tm from pandas.core.arrays import BooleanArray -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics import pandas.core.common as com pytestmark = 
pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning") @@ -2435,30 +2434,28 @@ def test_rolling_wrong_param_min_period(): def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 + dtype = any_string_dtype df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} result = df.groupby(["Name"]).agg(aggregate_details) - expected_result = DataFrame( + expected = DataFrame( { "Mood": [["happy", "sad"], "happy"], "Credit": [2500, 900], "Name": ["Thomas", "Thomas John"], }, ).set_index("Name") - if dtype == "string[pyarrow_numpy]": - import pyarrow as pa - - mood_values = ArrowStringArrayNumpySemantics(pa.array(["happy", "sad"])) - expected_result["Mood"] = [mood_values, "happy"] - expected_result["Mood"] = expected_result["Mood"].astype(dtype) - tm.assert_frame_equal(result, expected_result) + if getattr(dtype, "storage", None) == "pyarrow": + mood_values = pd.array(["happy", "sad"], dtype=dtype) + expected["Mood"] = [mood_values, "happy"] + tm.assert_frame_equal(result, expected) def test_groupby_none_in_first_mi_level(): From d6edeffbfbed6c01177f41fcaaf30b60f8dc7031 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Mar 2025 12:14:42 -0400 Subject: [PATCH 31/33] Cleanup --- pandas/core/groupby/ops.py | 6 ------ pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b1f94fa71eb25..ac3c4a6bf4f94 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -965,11 +965,6 @@ def agg_series( out = maybe_cast_pointwise_result( npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype ) - - # if isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( - # 
out.dtype.pyarrow_dtype - # ): - # out = npvalues else: out = npvalues @@ -981,7 +976,6 @@ def agg_series( out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues - return out @final diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9e817b2b8fa1e..b27785115b0a3 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1862,7 +1862,7 @@ def test_agg_lambda_complex128_dtype_conversion(): def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): - # GH#53030 + # GH#59601 df = DataFrame( { "A": ["c1", "c2", "c3"], @@ -1883,7 +1883,7 @@ def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): @td.skip_if_no("pyarrow") def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion(): - # GH#53030 + # GH#59601 import pyarrow as pa df = DataFrame( From b2e34fb3c3440c33f377fd28991230f22988d235 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Mar 2025 17:44:24 -0400 Subject: [PATCH 32/33] Fixup --- pandas/core/groupby/ops.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ac3c4a6bf4f94..d680e26d7386e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -961,7 +961,11 @@ def agg_series( if isinstance(obj._values, ArrowExtensionArray): from pandas.core.dtypes.common import is_string_dtype - if not is_string_dtype(obj.dtype) or is_string_dtype(npvalues): + # When obj.dtype is a string, any object can be cast. Only do so if the + # UDF returned strings or NA values. 
+ if not is_string_dtype(obj.dtype) or is_string_dtype( + npvalues[~isna(npvalues)] + ): out = maybe_cast_pointwise_result( npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype ) From 9cbf3390867dd8396dda66b179b7d71f0e363f9f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Mar 2025 17:56:09 -0400 Subject: [PATCH 33/33] More skips --- pandas/tests/groupby/aggregate/test_aggregate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b27785115b0a3..ec1755cb98c25 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1811,6 +1811,7 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") @pytest.mark.parametrize( "input_dtype, output_dtype", [ @@ -1842,6 +1843,7 @@ def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") def test_agg_lambda_complex128_dtype_conversion(): # GH#59601 df = DataFrame( @@ -1861,6 +1863,7 @@ def test_agg_lambda_complex128_dtype_conversion(): tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): # GH#59601 df = DataFrame(