ENH: show percentiles in timestamp describe (#30164) (#30209)

david-cortes · jreback · commit f1bbb2107db1 · 2020-01-20T18:37:55.000-05:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -22,6 +22,17 @@ Other enhancements
 -
 -
 
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.api.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+
+- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; the statistics ``first`` and ``last``
+  are now reported as ``min`` and ``max``, matching numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
+-
+-
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9567,26 +9567,8 @@ def describe_categorical_1d(data):
             dtype = None
             if result[1] > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
-
-                if is_datetime64_any_dtype(data):
-                    tz = data.dt.tz
-                    asint = data.dropna().values.view("i8")
-                    top = Timestamp(top)
-                    if top.tzinfo is not None and tz is not None:
-                        # Don't tz_localize(None) if key is already tz-aware
-                        top = top.tz_convert(tz)
-                    else:
-                        top = top.tz_localize(tz)
-                    names += ["top", "freq", "first", "last"]
-                    result += [
-                        top,
-                        freq,
-                        Timestamp(asint.min(), tz=tz),
-                        Timestamp(asint.max(), tz=tz),
-                    ]
-                else:
-                    names += ["top", "freq"]
-                    result += [top, freq]
+                names += ["top", "freq"]
+                result += [top, freq]
 
             # If the DataFrame is empty, set 'top' and 'freq' to None
             # to maintain output shape consistency
@@ -9597,11 +9579,23 @@ def describe_categorical_1d(data):
 
             return pd.Series(result, index=names, name=data.name, dtype=dtype)
 
+        def describe_timestamp_1d(data):
+            # GH-30164
+            stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
+            d = (
+                [data.count(), data.mean(), data.min()]
+                + data.quantile(percentiles).tolist()
+                + [data.max()]
+            )
+            return pd.Series(d, index=stat_index, name=data.name)
+
         def describe_1d(data):
             if is_bool_dtype(data):
                 return describe_categorical_1d(data)
             elif is_numeric_dtype(data):
                 return describe_numeric_1d(data)
+            elif is_datetime64_any_dtype(data):
+                return describe_timestamp_1d(data)
             elif is_timedelta64_dtype(data):
                 return describe_numeric_1d(data)
             else:
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
@@ -253,52 +253,19 @@ def test_describe_tz_values(self, tz_naive_fixture):
 
         expected = DataFrame(
             {
-                "s1": [
-                    5,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    2,
-                    1.581139,
-                    0,
-                    1,
-                    2,
-                    3,
-                    4,
-                ],
+                "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
                 "s2": [
                     5,
-                    5,
-                    s2.value_counts().index[0],
-                    1,
+                    Timestamp(2018, 1, 3).tz_localize(tz),
                     start.tz_localize(tz),
+                    s2[1],
+                    s2[2],
+                    s2[3],
                     end.tz_localize(tz),
                     np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
                 ],
             },
-            index=[
-                "count",
-                "unique",
-                "top",
-                "freq",
-                "first",
-                "last",
-                "mean",
-                "std",
-                "min",
-                "25%",
-                "50%",
-                "75%",
-                "max",
-            ],
+            index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
         )
         result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from pandas import Series, Timestamp, date_range
+from pandas import Period, Series, Timedelta, Timestamp, date_range
 import pandas._testing as tm
 
 
@@ -29,6 +29,36 @@ def test_describe(self):
         )
         tm.assert_series_equal(result, expected)
 
+        s = Series(
+            [
+                Timedelta("1 days"),
+                Timedelta("2 days"),
+                Timedelta("3 days"),
+                Timedelta("4 days"),
+                Timedelta("5 days"),
+            ],
+            name="timedelta_data",
+        )
+        result = s.describe()
+        expected = Series(
+            [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]],
+            name="timedelta_data",
+            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+        )
+        tm.assert_series_equal(result, expected)
+
+        s = Series(
+            [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")],
+            name="period_data",
+        )
+        result = s.describe()
+        expected = Series(
+            [3, 2, s[0], 2],
+            name="period_data",
+            index=["count", "unique", "top", "freq"],
+        )
+        tm.assert_series_equal(result, expected)
+
     def test_describe_empty_object(self):
         # https://github.com/pandas-dev/pandas/issues/27183
         s = Series([None, None], dtype=object)
@@ -57,13 +87,14 @@ def test_describe_with_tz(self, tz_naive_fixture):
         expected = Series(
             [
                 5,
-                5,
-                s.value_counts().index[0],
-                1,
+                Timestamp(2018, 1, 3).tz_localize(tz),
                 start.tz_localize(tz),
+                s[1],
+                s[2],
+                s[3],
                 end.tz_localize(tz),
             ],
             name=name,
-            index=["count", "unique", "top", "freq", "first", "last"],
+            index=["count", "mean", "min", "25%", "50%", "75%", "max"],
         )
         tm.assert_series_equal(result, expected)