Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Passing original properties of DataFrame and Series subclasses to their constructors #61101

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions doc/source/development/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -403,22 +403,50 @@ To let original data structures have additional properties, you should let ``pan
1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
2. Define ``_metadata`` for normal properties which will be passed to manipulation results.

Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property
If ``_metadata`` is used, a ``Series`` subclass must also be defined with the same ``_metadata`` property and the first parameter of the constructors must be the data.
Avoid the following names for your normal properties: ``data``, ``index``, ``columns``, ``dtype``, ``copy``, ``name``, ``_name`` and ``fastpath``.

Below is an example to define two original properties, "internal_cache" as a temporary property and "added_property" as a normal property.

.. code-block:: python

class SubclassedDataFrame2(pd.DataFrame):
class SubclassedDataFrame(pd.DataFrame):

# temporary properties
_internal_names = pd.DataFrame._internal_names + ["internal_cache"]
_internal_names_set = set(_internal_names)
# temporary properties
_internal_names = pd.DataFrame._internal_names + ["internal_cache"]
_internal_names_set = set(_internal_names)

# normal properties
_metadata = ["added_property"]
# normal properties
_metadata = ["added_property"]

@property
def _constructor(self):
return SubclassedDataFrame2
def __init__(self, data=None, added_property=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.added_property = added_property

@property
def _constructor(self):
return SubclassedDataFrame

@property
def _constructor_sliced(self):
return SubclassedSeries

class SubclassedSeries(pd.Series):

# normal properties
_metadata = ["original_property"]

def __init__(self, data=None, original_property=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.original_property = original_property

@property
def _constructor(self):
return SubclassedSeries

@property
def _constructor_expanddim(self):
return SubclassedDataFrame

.. code-block:: python

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,7 @@ Other
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
- Bug in original properties ``_metadata`` of :class:`Dataframe` and :class:`Series` subclasses'. For some operations (i.e. ``concat``) the new object wasn't receiving the original properties (:issue:`34177`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
Expand Down
23 changes: 13 additions & 10 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,32 +325,35 @@ def to_array(obj):


class SubclassedSeries(Series):
_metadata = ["testattr", "name"]
_metadata = ["testattr"]

def __init__(self, data=None, testattr=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.testattr = testattr

@property
def _constructor(self):
# For testing, those properties return a generic callable, and not
# the actual class. In this case that is equivalent, but it is to
# ensure we don't rely on the property returning a class
# See https://github.com/pandas-dev/pandas/pull/46018 and
# https://github.com/pandas-dev/pandas/issues/32638 and linked issues
return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
return SubclassedSeries

@property
def _constructor_expanddim(self):
return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
return SubclassedDataFrame


class SubclassedDataFrame(DataFrame):
_metadata = ["testattr"]

def __init__(self, data=None, testattr=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.testattr = testattr

@property
def _constructor(self):
return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
return SubclassedDataFrame

@property
def _constructor_sliced(self):
return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
return SubclassedSeries


def convert_rows_list_to_csv_str(rows_list: list[str]) -> str:
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,8 @@ def _constructor_from_mgr(self, mgr, axes) -> DataFrame:

# We assume that the subclass __init__ knows how to handle a
# pd.DataFrame object.
return self._constructor(df)
metadata = {k: getattr(self, k) for k in self._metadata}
return self._constructor(df, **metadata)

_constructor_sliced: Callable[..., Series] = Series

Expand All @@ -690,7 +691,8 @@ def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:

# We assume that the subclass __init__ knows how to handle a
# pd.Series object.
return self._constructor_sliced(ser)
metadata = {k: getattr(self, k) for k in self._metadata}
return self._constructor_sliced(ser, **metadata)

# ----------------------------------------------------------------------
# Constructors
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc]
_HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)

_name: Hashable
_metadata: list[str] = ["_name"]
_metadata: list[str] = []
_internal_names_set = {"index", "name"} | NDFrame._internal_names_set
_accessors = {"dt", "cat", "str", "sparse"}
_hidden_attrs = (
Expand Down Expand Up @@ -372,6 +372,9 @@ def __init__(
copy: bool | None = None,
) -> None:
allow_mgr = False
if "_name" not in self._metadata:
# Subclass overrides _metadata, see @540db96b
self._metadata.append("_name")
if (
isinstance(data, SingleBlockManager)
and index is None
Expand Down Expand Up @@ -610,7 +613,9 @@ def _constructor_from_mgr(self, mgr, axes):

# We assume that the subclass __init__ knows how to handle a
# pd.Series object.
return self._constructor(ser)
self._metadata.remove("_name")
metadata = {k: getattr(self, k) for k in self._metadata}
return self._constructor(ser, **metadata)

@property
def _constructor_expanddim(self) -> Callable[..., DataFrame]:
Expand All @@ -634,7 +639,9 @@ def _constructor_expanddim_from_mgr(self, mgr, axes):

# We assume that the subclass __init__ knows how to handle a
# pd.DataFrame object.
return self._constructor_expanddim(df)
self._metadata.remove("_name")
metadata = {k: getattr(self, k) for k in self._metadata}
return self._constructor_expanddim(df, **metadata)

# types
@property
Expand Down
21 changes: 15 additions & 6 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
timezone,
)
from enum import Enum
import functools
import operator
import re

Expand Down Expand Up @@ -2139,6 +2138,12 @@ def test_frame_op_subclass_nonclass_constructor():
# GH#43201 subclass._constructor is a function, not the subclass itself

class SubclassedSeries(Series):
_metadata = ["my_extra_data"]

def __init__(self, data=None, my_extra_data=None, *args, **kwargs) -> None:
self.my_extra_data = my_extra_data
super().__init__(data, *args, **kwargs)

@property
def _constructor(self):
return SubclassedSeries
Expand All @@ -2150,21 +2155,25 @@ def _constructor_expanddim(self):
class SubclassedDataFrame(DataFrame):
_metadata = ["my_extra_data"]

def __init__(self, my_extra_data, *args, **kwargs) -> None:
def __init__(self, data=None, my_extra_data=None, *args, **kwargs) -> None:
self.my_extra_data = my_extra_data
super().__init__(*args, **kwargs)
super().__init__(data, *args, **kwargs)

@property
def _constructor(self):
return functools.partial(type(self), self.my_extra_data)
return SubclassedDataFrame

@property
def _constructor_sliced(self):
return SubclassedSeries

sdf = SubclassedDataFrame("some_data", {"A": [1, 2, 3], "B": [4, 5, 6]})
sdf = SubclassedDataFrame(
my_extra_data="some_data", data={"A": [1, 2, 3], "B": [4, 5, 6]}
)
result = sdf * 2
expected = SubclassedDataFrame("some_data", {"A": [2, 4, 6], "B": [8, 10, 12]})
expected = SubclassedDataFrame(
my_extra_data="some_data", data={"A": [2, 4, 6], "B": [8, 10, 12]}
)
tm.assert_frame_equal(result, expected)

result = sdf + sdf
Expand Down
64 changes: 58 additions & 6 deletions pandas/tests/frame/test_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm

Expand Down Expand Up @@ -742,16 +743,67 @@ def test_equals_subclass(self):
assert df1.equals(df2)
assert df2.equals(df1)

def test_original_property_is_preserved_when_subclassing(self):
original_property = "original_property"

class SubclassedSeries(Series):
_metadata = [original_property]

def __init__(self, data=None, original_property=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.original_property = original_property

@property
def _constructor(self):
return SubclassedSeries

@property
def _constructor_expanddim(self):
return SubclassedDataFrame

class SubclassedDataFrame(DataFrame):
_metadata = ["original_property"]

def __init__(self, data=None, original_property=None, *args, **kwargs):
super().__init__(data, *args, **kwargs)
self.original_property = original_property

@property
def _constructor(self):
return SubclassedDataFrame

@property
def _constructor_sliced(self):
return SubclassedSeries

data = {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}
df = SubclassedDataFrame(data, original_property="original_property")
tm.assert_equal(df.original_property, original_property)
tm.assert_equal(df[df["value"] == 1].original_property, original_property)
tm.assert_equal(df.loc[df["key"] == "foo"].original_property, original_property)
tm.assert_equal(df["value"].original_property, original_property)

tm.assert_equal(concat([df, df]).original_property, original_property)

df1 = SubclassedDataFrame(
{"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]},
original_property="original_property",
)
df2 = SubclassedDataFrame(
{"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]},
original_property="original_property",
)
merged_df = df1.merge(df2, left_on="lkey", right_on="rkey")
tm.assert_equal(merged_df.original_property, original_property)
plus = df1 + df2
tm.assert_equal(plus.original_property, original_property)


class MySubclassWithMetadata(DataFrame):
_metadata = ["my_metadata"]

def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

my_metadata = kwargs.pop("my_metadata", None)
if args and isinstance(args[0], MySubclassWithMetadata):
my_metadata = args[0].my_metadata # type: ignore[has-type]
def __init__(self, data=None, my_metadata=None, *args, **kwargs) -> None:
super().__init__(data, *args, **kwargs)
self.my_metadata = my_metadata

@property
Expand Down
Loading