From a15bc15057fdc5d38a57003f8d5c46c64fa1bc54 Mon Sep 17 00:00:00 2001 From: YankoFelipe <28831006+YankoFelipe@users.noreply.github.com> Date: Tue, 11 Mar 2025 15:45:56 +0100 Subject: [PATCH 1/2] BUG: Passing original properties of and to subclasses constructors --- doc/source/development/extending.rst | 45 +++++++++++++++---- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_testing/__init__.py | 23 +++++----- pandas/core/frame.py | 6 ++- pandas/core/series.py | 13 ++++-- pandas/tests/frame/test_arithmetic.py | 21 ++++++--- pandas/tests/frame/test_subclass.py | 64 ++++++++++++++++++++++++--- 7 files changed, 137 insertions(+), 36 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e67829b8805eb..58d39bd7d4037 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -402,23 +402,50 @@ To let original data structures have additional properties, you should let ``pan 1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. 2. Define ``_metadata`` for normal properties which will be passed to manipulation results. +If used, a ``Series`` subclass must also be defined with the same ``_metadata`` property and the first parameter of the constructors must be the data. +Avoid the following names for your normal properties: ``data``, ``index``, ``columns``, ``dtype``, ``copy``, ``name``, ``_name`` and ``fastpath``. Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property .. code-block:: python - class SubclassedDataFrame2(pd.DataFrame): + class SubclassedDataFrame(pd.DataFrame): - # temporary properties - _internal_names = pd.DataFrame._internal_names + ["internal_cache"] - _internal_names_set = set(_internal_names) + # temporary properties + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] + _internal_names_set = set(_internal_names) - # normal properties - _metadata = ["added_property"] + # normal properties + _metadata = ["added_property"] - @property - def _constructor(self): - return SubclassedDataFrame2 + def __init__(self, data=None, added_property=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.added_property = added_property + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + class SubclassedSeries(pd.Series): + + # normal properties + _metadata = ["original_property"] + + def __init__(self, data=None, original_property=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.original_property = original_property + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame .. code-block:: python diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 25c26253e687b..93a5709bd8388 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -843,6 +843,7 @@ Other - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) - Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) +- Bug in original properties ``_metadata`` of :class:`Dataframe` and :class:`Series` subclasses'. For some operations (i.e. ``concat``) the new object wasn't receiving the original properties (:issue:`34177`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index ec9b5098c97c9..9201ce8b16533 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -325,32 +325,35 @@ def to_array(obj): class SubclassedSeries(Series): - _metadata = ["testattr", "name"] + _metadata = ["testattr"] + + def __init__(self, data=None, testattr=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.testattr = testattr @property def _constructor(self): - # For testing, those properties return a generic callable, and not - # the actual class. In this case that is equivalent, but it is to - # ensure we don't rely on the property returning a class - # See https://github.com/pandas-dev/pandas/pull/46018 and - # https://github.com/pandas-dev/pandas/issues/32638 and linked issues - return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs) + return SubclassedSeries @property def _constructor_expanddim(self): - return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs) + return SubclassedDataFrame class SubclassedDataFrame(DataFrame): _metadata = ["testattr"] + def __init__(self, data=None, testattr=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.testattr = testattr + @property def _constructor(self): - return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs) + return SubclassedDataFrame @property def _constructor_sliced(self): - return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs) + return SubclassedSeries def convert_rows_list_to_csv_str(rows_list: list[str]) -> str: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c862b7dbaf973..629b5c102c05b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -675,7 +675,8 @@ def _constructor_from_mgr(self, mgr, axes) -> DataFrame: # We assume that the subclass __init__ knows how to handle a # pd.DataFrame object. - return self._constructor(df) + metadata = {k: getattr(self, k) for k in self._metadata} + return self._constructor(df, **metadata) _constructor_sliced: Callable[..., Series] = Series @@ -690,7 +691,8 @@ def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: # We assume that the subclass __init__ knows how to handle a # pd.Series object. - return self._constructor_sliced(ser) + metadata = {k: getattr(self, k) for k in self._metadata} + return self._constructor_sliced(ser, **metadata) # ---------------------------------------------------------------------- # Constructors diff --git a/pandas/core/series.py b/pandas/core/series.py index da46f8ede3409..d1b17e9901ff5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -339,7 +339,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) _name: Hashable - _metadata: list[str] = ["_name"] + _metadata: list[str] = [] _internal_names_set = {"index", "name"} | NDFrame._internal_names_set _accessors = {"dt", "cat", "str", "sparse"} _hidden_attrs = ( @@ -372,6 +372,9 @@ def __init__( copy: bool | None = None, ) -> None: allow_mgr = False + if "_name" not in self._metadata: + # Subclass overrides _metadata, see @540db96b + self._metadata.append("_name") if ( isinstance(data, SingleBlockManager) and index is None @@ -610,7 +613,9 @@ def _constructor_from_mgr(self, mgr, axes): # We assume that the subclass __init__ knows how to handle a # pd.Series object. - return self._constructor(ser) + self._metadata.remove("_name") + metadata = {k: getattr(self, k) for k in self._metadata} + return self._constructor(ser, **metadata) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -634,7 +639,9 @@ def _constructor_expanddim_from_mgr(self, mgr, axes): # We assume that the subclass __init__ knows how to handle a # pd.DataFrame object. - return self._constructor_expanddim(df) + self._metadata.remove("_name") + metadata = {k: getattr(self, k) for k in self._metadata} + return self._constructor_expanddim(df, **metadata) # types @property diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bc69ec388bf0c..d2b2e46c9573a 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -4,7 +4,6 @@ timezone, ) from enum import Enum -import functools import operator import re @@ -2139,6 +2138,12 @@ def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself class SubclassedSeries(Series): + _metadata = ["my_extra_data"] + + def __init__(self, data=None, my_extra_data=None, *args, **kwargs) -> None: + self.my_extra_data = my_extra_data + super().__init__(data, *args, **kwargs) + @property def _constructor(self): return SubclassedSeries @@ -2150,21 +2155,25 @@ def _constructor_expanddim(self): class SubclassedDataFrame(DataFrame): _metadata = ["my_extra_data"] - def __init__(self, my_extra_data, *args, **kwargs) -> None: + def __init__(self, data=None, my_extra_data=None, *args, **kwargs) -> None: self.my_extra_data = my_extra_data - super().__init__(*args, **kwargs) + super().__init__(data, *args, **kwargs) @property def _constructor(self): - return functools.partial(type(self), self.my_extra_data) + return SubclassedDataFrame @property def _constructor_sliced(self): return SubclassedSeries - sdf = SubclassedDataFrame("some_data", {"A": [1, 2, 3], "B": [4, 5, 6]}) + sdf = SubclassedDataFrame( + my_extra_data="some_data", data={"A": [1, 2, 3], "B": [4, 5, 6]} + ) result = sdf * 2 - expected = SubclassedDataFrame("some_data", {"A": [2, 4, 6], "B": [8, 10, 12]}) + expected = SubclassedDataFrame( + my_extra_data="some_data", data={"A": [2, 4, 6], "B": [8, 10, 12]} + ) tm.assert_frame_equal(result, expected) result = sdf + sdf diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index cbd563a03b908..4a76b892981c7 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -7,6 +7,7 @@ Index, MultiIndex, Series, + concat, ) import pandas._testing as tm @@ -742,16 +743,67 @@ def test_equals_subclass(self): assert df1.equals(df2) assert df2.equals(df1) + def test_original_property_is_preserved_when_subclassing(self): + original_property = "original_property" + + class SubclassedSeries(Series): + _metadata = [original_property] + + def __init__(self, data=None, original_property=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.original_property = original_property + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + _metadata = ["original_property"] + + def __init__(self, data=None, original_property=None, *args, **kwargs): + super().__init__(data, *args, **kwargs) + self.original_property = original_property + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + data = {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} + df = SubclassedDataFrame(data, original_property="original_property") + tm.assert_equal(df.original_property, original_property) + tm.assert_equal(df[df["value"] == 1].original_property, original_property) + tm.assert_equal(df.loc[df["key"] == "foo"].original_property, original_property) + tm.assert_equal(df["value"].original_property, original_property) + + tm.assert_equal(concat([df, df]).original_property, original_property) + + df1 = SubclassedDataFrame( + {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, + original_property="original_property", + ) + df2 = SubclassedDataFrame( + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + original_property="original_property", + ) + merged_df = df1.merge(df2, left_on="lkey", right_on="rkey") + tm.assert_equal(merged_df.original_property, original_property) + plus = df1 + df2 + tm.assert_equal(plus.original_property, original_property) + class MySubclassWithMetadata(DataFrame): _metadata = ["my_metadata"] - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - my_metadata = kwargs.pop("my_metadata", None) - if args and isinstance(args[0], MySubclassWithMetadata): - my_metadata = args[0].my_metadata # type: ignore[has-type] + def __init__(self, data=None, my_metadata=None, *args, **kwargs) -> None: + super().__init__(data, *args, **kwargs) self.my_metadata = my_metadata @property From b04035c17156f403125cbb742dcd4cf49a8399d2 Mon Sep 17 00:00:00 2001 From: YankoFelipe <28831006+YankoFelipe@users.noreply.github.com> Date: Wed, 12 Mar 2025 00:00:32 +0100 Subject: [PATCH 2/2] Fixing doc/source/development/extending.rst format --- doc/source/development/extending.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 58d39bd7d4037..751d52f6cea15 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -402,10 +402,11 @@ To let original data structures have additional properties, you should let ``pan 1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. 2. Define ``_metadata`` for normal properties which will be passed to manipulation results. -If used, a ``Series`` subclass must also be defined with the same ``_metadata`` property and the first parameter of the constructors must be the data. + +If ``_metadata`` is used, a ``Series`` subclass must also be defined with the same ``_metadata`` property and the first parameter of the constructors must be the data. Avoid the following names for your normal properties: ``data``, ``index``, ``columns``, ``dtype``, ``copy``, ``name``, ``_name`` and ``fastpath``. -Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property +Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property. .. code-block:: python