Skip to content

Commit f1bbb21

Browse files
david-cortes authored and jreback committed
ENH: show percentiles in timestamp describe (#30164) (#30209)
1 parent 9a01577 commit f1bbb21

File tree

4 files changed

+67
-64
lines changed

4 files changed

+67
-64
lines changed

doc/source/whatsnew/v1.1.0.rst

+11
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,17 @@ Other enhancements
2222
-
2323
-
2424

25+
.. ---------------------------------------------------------------------------
26+
27+
.. _whatsnew_110.api.other:
28+
29+
Other API changes
30+
^^^^^^^^^^^^^^^^^
31+
32+
- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; statistics ``first`` and ``last``
33+
will now be ``min`` and ``max`` to match numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
34+
-
35+
-
2536

2637
.. ---------------------------------------------------------------------------
2738

pandas/core/generic.py

+14-20
Original file line numberDiff line numberDiff line change
@@ -9567,26 +9567,8 @@ def describe_categorical_1d(data):
95679567
dtype = None
95689568
if result[1] > 0:
95699569
top, freq = objcounts.index[0], objcounts.iloc[0]
9570-
9571-
if is_datetime64_any_dtype(data):
9572-
tz = data.dt.tz
9573-
asint = data.dropna().values.view("i8")
9574-
top = Timestamp(top)
9575-
if top.tzinfo is not None and tz is not None:
9576-
# Don't tz_localize(None) if key is already tz-aware
9577-
top = top.tz_convert(tz)
9578-
else:
9579-
top = top.tz_localize(tz)
9580-
names += ["top", "freq", "first", "last"]
9581-
result += [
9582-
top,
9583-
freq,
9584-
Timestamp(asint.min(), tz=tz),
9585-
Timestamp(asint.max(), tz=tz),
9586-
]
9587-
else:
9588-
names += ["top", "freq"]
9589-
result += [top, freq]
9570+
names += ["top", "freq"]
9571+
result += [top, freq]
95909572

95919573
# If the DataFrame is empty, set 'top' and 'freq' to None
95929574
# to maintain output shape consistency
@@ -9597,11 +9579,23 @@ def describe_categorical_1d(data):
95979579

95989580
return pd.Series(result, index=names, name=data.name, dtype=dtype)
95999581

9582+
def describe_timestamp_1d(data):
9583+
# GH-30164
9584+
stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
9585+
d = (
9586+
[data.count(), data.mean(), data.min()]
9587+
+ data.quantile(percentiles).tolist()
9588+
+ [data.max()]
9589+
)
9590+
return pd.Series(d, index=stat_index, name=data.name)
9591+
96009592
def describe_1d(data):
96019593
if is_bool_dtype(data):
96029594
return describe_categorical_1d(data)
96039595
elif is_numeric_dtype(data):
96049596
return describe_numeric_1d(data)
9597+
elif is_datetime64_any_dtype(data):
9598+
return describe_timestamp_1d(data)
96059599
elif is_timedelta64_dtype(data):
96069600
return describe_numeric_1d(data)
96079601
else:

pandas/tests/frame/methods/test_describe.py

+6-39
Original file line numberDiff line numberDiff line change
@@ -253,52 +253,19 @@ def test_describe_tz_values(self, tz_naive_fixture):
253253

254254
expected = DataFrame(
255255
{
256-
"s1": [
257-
5,
258-
np.nan,
259-
np.nan,
260-
np.nan,
261-
np.nan,
262-
np.nan,
263-
2,
264-
1.581139,
265-
0,
266-
1,
267-
2,
268-
3,
269-
4,
270-
],
256+
"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
271257
"s2": [
272258
5,
273-
5,
274-
s2.value_counts().index[0],
275-
1,
259+
Timestamp(2018, 1, 3).tz_localize(tz),
276260
start.tz_localize(tz),
261+
s2[1],
262+
s2[2],
263+
s2[3],
277264
end.tz_localize(tz),
278265
np.nan,
279-
np.nan,
280-
np.nan,
281-
np.nan,
282-
np.nan,
283-
np.nan,
284-
np.nan,
285266
],
286267
},
287-
index=[
288-
"count",
289-
"unique",
290-
"top",
291-
"freq",
292-
"first",
293-
"last",
294-
"mean",
295-
"std",
296-
"min",
297-
"25%",
298-
"50%",
299-
"75%",
300-
"max",
301-
],
268+
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
302269
)
303270
result = df.describe(include="all")
304271
tm.assert_frame_equal(result, expected)

pandas/tests/series/methods/test_describe.py

+36-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
22

3-
from pandas import Series, Timestamp, date_range
3+
from pandas import Period, Series, Timedelta, Timestamp, date_range
44
import pandas._testing as tm
55

66

@@ -29,6 +29,36 @@ def test_describe(self):
2929
)
3030
tm.assert_series_equal(result, expected)
3131

32+
s = Series(
33+
[
34+
Timedelta("1 days"),
35+
Timedelta("2 days"),
36+
Timedelta("3 days"),
37+
Timedelta("4 days"),
38+
Timedelta("5 days"),
39+
],
40+
name="timedelta_data",
41+
)
42+
result = s.describe()
43+
expected = Series(
44+
[5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]],
45+
name="timedelta_data",
46+
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
47+
)
48+
tm.assert_series_equal(result, expected)
49+
50+
s = Series(
51+
[Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")],
52+
name="period_data",
53+
)
54+
result = s.describe()
55+
expected = Series(
56+
[3, 2, s[0], 2],
57+
name="period_data",
58+
index=["count", "unique", "top", "freq"],
59+
)
60+
tm.assert_series_equal(result, expected)
61+
3262
def test_describe_empty_object(self):
3363
# https://github.com/pandas-dev/pandas/issues/27183
3464
s = Series([None, None], dtype=object)
@@ -57,13 +87,14 @@ def test_describe_with_tz(self, tz_naive_fixture):
5787
expected = Series(
5888
[
5989
5,
60-
5,
61-
s.value_counts().index[0],
62-
1,
90+
Timestamp(2018, 1, 3).tz_localize(tz),
6391
start.tz_localize(tz),
92+
s[1],
93+
s[2],
94+
s[3],
6495
end.tz_localize(tz),
6596
],
6697
name=name,
67-
index=["count", "unique", "top", "freq", "first", "last"],
98+
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
6899
)
69100
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)