From 0fdbfd34add4fe9940cd1991122e851d62824724 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 09:26:57 -0600 Subject: [PATCH 01/39] wip --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/arrays/base.py | 40 ++++++++++++++++++- pandas/core/arrays/categorical.py | 2 + pandas/core/arrays/integer.py | 24 +---------- pandas/core/arrays/interval.py | 10 ----- pandas/core/arrays/period.py | 8 ---- pandas/io/formats/printing.py | 37 +++++++++++++---- pandas/tests/arrays/interval/test_interval.py | 32 +++++++++++---- pandas/tests/arrays/test_integer.py | 29 +++++++------- pandas/tests/arrays/test_period.py | 33 ++++++++++++++- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/interface.py | 25 ------------ pandas/tests/extension/base/printing.py | 38 ++++++++++++++++++ pandas/tests/extension/decimal/array.py | 3 -- .../tests/extension/decimal/test_decimal.py | 6 ++- pandas/tests/extension/json/array.py | 3 -- pandas/tests/extension/json/test_json.py | 7 +++- pandas/tests/extension/test_integer.py | 6 ++- pandas/tests/extension/test_interval.py | 6 ++- pandas/tests/extension/test_period.py | 6 ++- pandas/tests/extension/test_sparse.py | 6 +++ 21 files changed, 213 insertions(+), 110 deletions(-) create mode 100644 pandas/tests/extension/base/printing.py diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5fefb9e3e405c..1378caeac5edb 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -857,6 +857,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. 
Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). +- A default repr is now provided. .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f842d1237cb14..41ec0971bef7c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -49,6 +49,13 @@ class ExtensionArray(object): * _formatting_values + A default repr displaying the type, (truncated) data, length, + and dtype is provided. It can be customized or replaced by + by overriding: + + * _formatter + * __repr__ + Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When performance is a concern, we highly recommend overriding the following @@ -653,8 +660,35 @@ def copy(self, deep=False): raise AbstractMethodError(self) # ------------------------------------------------------------------------ - # Block-related methods + # Printing # ------------------------------------------------------------------------ + def __repr__(self): + from pandas.io.formats.printing import format_object_summary + + template = ( + '<{class_name}>\n' + '{data}\n' + 'Length: {length}, dtype: {dtype}' + ) + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = format_object_summary(self, self._formatter, name=False, + trailing_comma=False).rstrip('\n') + name = self.__class__.__name__ + return template.format(class_name=name, data=data, + length=len(self), + dtype=self.dtype) + + @property + def _formatter(self): + # type: () -> Callable[Any] + """Formatting function for scalar values. + + This is used in the default '__repr__'. 
The formatting function + receives instances of your scalar type. + """ + return str def _formatting_values(self): # type: () -> np.ndarray @@ -662,6 +696,10 @@ def _formatting_values(self): """An array of values to be printed in, e.g. the Series repr""" return np.array(self) + # ------------------------------------------------------------------------ + # Reshaping + # ------------------------------------------------------------------------ + @classmethod def _concat_same_type(cls, to_concat): # type: (Sequence[ExtensionArray]) -> ExtensionArray diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4363f3ccb14e2..b74938a9c7b18 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1986,6 +1986,8 @@ def __unicode__(self): return result + __repr__ = __unicode__ + def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 9e045a7785660..27c845d666e60 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -6,7 +6,7 @@ from pandas._libs import lib from pandas.util._decorators import cache_readonly -from pandas.compat import u, range, string_types +from pandas.compat import range, string_types from pandas.compat import set_function_name from pandas.core import nanops @@ -24,9 +24,6 @@ from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna, notna -from pandas.io.formats.printing import ( - format_object_summary, format_object_attrs, default_pprint) - class _IntegerDtype(ExtensionDtype): """ @@ -353,25 +350,6 @@ def __setitem__(self, key, value): def __len__(self): return len(self._data) - def __repr__(self): - """ - Return a string representation for this object. - - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. 
- """ - klass = self.__class__.__name__ - data = format_object_summary(self, default_pprint, False) - attrs = format_object_attrs(self) - space = " " - - prepr = (u(",%s") % - space).join(u("%s=%s") % (k, v) for k, v in attrs) - - res = u("%s(%s%s)") % (klass, data, prepr) - - return res - @property def nbytes(self): return self._data.nbytes + self._mask.nbytes diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 94be29893d2b9..1b3687e555748 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -835,16 +835,6 @@ def _format_data(self): return summary - def __repr__(self): - tpl = textwrap.dedent("""\ - {cls}({data}, - {lead}closed='{closed}', - {lead}dtype='{dtype}')""") - return tpl.format(cls=self.__class__.__name__, - data=self._format_data(), - lead=' ' * len(self.__class__.__name__) + ' ', - closed=self.closed, dtype=self.dtype) - def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5a75f2706b218..f6996f8e62d4d 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -330,14 +330,6 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') - def __repr__(self): - return '<{}>\n{}\nLength: {}, dtype: {}'.format( - self.__class__.__name__, - [str(s) for s in self], - len(self), - self.dtype - ) - def __setitem__( self, key, # type: Union[int, Sequence[int], Sequence[bool]] diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index f814bf965a1e9..3d56b4b222ba6 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -271,7 +271,9 @@ class TableSchemaFormatter(BaseFormatter): max_seq_items=max_seq_items) -def format_object_summary(obj, formatter, is_justify=True, name=None): +def format_object_summary(obj, formatter, is_justify=True, name=None, + trailing_comma=True, + 
truncated_trailing_newline=True): """ Return the formatted obj as a unicode string @@ -283,9 +285,14 @@ def format_object_summary(obj, formatter, is_justify=True, name=None): string formatter for an element is_justify : boolean should justify the display - name : name, optiona + name : name, optional defaults to the class name of the obj + Pass ``False`` to indicate that subsequent lines should + not be indented to align with the name. + trailing_comma : bool, default True + Whether to include a comma after the closing ']' + Returns ------- summary string @@ -300,8 +307,13 @@ def format_object_summary(obj, formatter, is_justify=True, name=None): if name is None: name = obj.__class__.__name__ - space1 = "\n%s" % (' ' * (len(name) + 1)) - space2 = "\n%s" % (' ' * (len(name) + 2)) + if name is False: + space1 = "\n" + space2 = "\n " # space for the opening '[' + else: + name_len = len(name) + space1 = "\n%s" % (' ' * (name_len + 1)) + space2 = "\n%s" % (' ' * (name_len + 2)) n = len(obj) sep = ',' @@ -328,15 +340,20 @@ def best_len(values): else: return 0 + if trailing_comma: + close = ', ' + else: + close = '' + if n == 0: - summary = '[], ' + summary = '[]{}'.format(close) elif n == 1: first = formatter(obj[0]) - summary = '[%s], ' % first + summary = '[{}]{}'.format(first, close) elif n == 2: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = '[%s, %s], ' % (first, last) + summary = '[{}, {}]{}'.format(first, last, close) else: if n > max_seq_items: @@ -381,7 +398,11 @@ def best_len(values): summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) summary += line - summary += '],' + + # right now close is either '' or ', ' + # Now we want to include the ']', but not the maybe space. 
+ close = ']' + close.rstrip(' ') + summary += close if len(summary) > (display_width): summary += space1 diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 9a191dda3a73a..1138f64e2009a 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- import numpy as np -import pytest -from pandas import Index, IntervalIndex, date_range, timedelta_range +from pandas import Index, date_range, option_context, timedelta_range from pandas.core.arrays import IntervalArray import pandas.util.testing as tm +import pytest @pytest.fixture(params=[ @@ -65,8 +65,26 @@ def test_set_na(self, left_right_dtypes): tm.assert_extension_array_equal(result, expected) -def test_repr_matches(): - idx = IntervalIndex.from_breaks([1, 2, 3]) - a = repr(idx) - b = repr(idx.values) - assert a.replace("Index", "Array") == b +def test_repr_small(): + arr = IntervalArray.from_breaks([1, 2, 3]) + result = repr(arr) + expected = ( + '<IntervalArray>\n' + '[(1, 2], (2, 3]]\n' + 'Length: 2, dtype: interval[int64]' + ) + assert result == expected + + +def test_repr_large(): + arr = IntervalArray.from_breaks([1, 2, 3, 4, 5, 6]) + with option_context('display.max_seq_items', 2): + result = repr(arr) + expected = ( + '<IntervalArray>\n' + '[(1, 2],\n' + ' ...\n' + ' (5, 6]] \n' + 'Length: 5, dtype: interval[int64]' + ) + assert result == expected diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 0fe07caed5b85..6b99ede3436ce 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- import numpy as np -import pytest from pandas.core.dtypes.generic import ABCIndexClass @@ -12,6 +11,7 @@ UInt32Dtype, UInt64Dtype) from pandas.tests.extension.base import BaseOpsUtil import pandas.util.testing as tm +import pytest def make_data(): @@ -57,24 +57,23 
@@ def test_dtypes(dtype): assert dtype.name is not None -class TestInterface(object): +def test_repr_array(data): + result = repr(data) + assert '<IntegerArray>' in result - def test_repr_array(self, data): - result = repr(data) + # not long + assert '...' not in result + assert 'Length: ' in result + assert 'dtype: ' in result - # not long - assert '...' not in result - assert 'dtype=' in result - assert 'IntegerArray' in result - - def test_repr_array_long(self, data): - # some arrays may be able to assert a ... in the repr - with pd.option_context('display.max_seq_items', 1): - result = repr(data) +def test_repr_array_long(data): + # some arrays may be able to assert a ... in the repr + with pd.option_context('display.max_seq_items', 1): + result = repr(data) - assert '...' in result - assert 'length' in result + assert '...' in result + assert 'Length' in result class TestConstructors(object): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 0125729048cdd..553f187e2e8fe 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency @@ -10,6 +9,7 @@ import pandas as pd from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm +import pytest # ---------------------------------------------------------------------------- # Constructors @@ -195,3 +195,34 @@ def tet_sub_period(): other = pd.Period("2000", freq="M") with tm.assert_raises_regex(IncompatibleFrequency, "freq"): arr - other + + +# ---------------------------------------------------------------------------- +# Printing + +def test_repr_small(): + arr = period_array(['2000', '2001'], freq='D') + result = str(arr) + expected = ( + '<PeriodArray>\n' + '[2000-01-01, 2001-01-01]\n' + 'Length: 2, dtype: period[D]' + ) + assert result == expected + + +def 
test_repr_large(): + arr = period_array(['2000', '2001'] * 500, freq='D') + result = str(arr) + expected = ( + '<PeriodArray>\n' + '[2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01, 2000-01-01, ' + '2001-01-01,\n' # continuation + ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01,\n' + ' ...\n' + ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01, 2000-01-01, ' + '2001-01-01,\n' # continuation + ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01]\n' + 'Length: 1000, dtype: period[D]' + ) + assert result == expected diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index d11bb8b6beb77..57704b77bb233 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests): from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa +from .printing import BasePrintingTests # noqa from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 00a480d311b58..f8464dbac8053 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,7 +1,5 @@ import numpy as np -from pandas.compat import StringIO - from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -35,29 +33,6 @@ def test_array_interface(self, data): result = np.array(data) assert result[0] == data[0] - def test_repr(self, data): - ser = pd.Series(data) - assert data.dtype.name in repr(ser) - - df = pd.DataFrame({"A": data}) - repr(df) - - def test_repr_array(self, data): - # some arrays may be able to assert - # attributes in the repr - repr(data) - - 
def test_repr_array_long(self, data): - # some arrays may be able to assert a ... in the repr - with pd.option_context('display.max_seq_items', 1): - repr(data) - - def test_dtype_name_in_info(self, data): - buf = StringIO() - pd.DataFrame({"A": data}).info(buf=buf) - result = buf.getvalue() - assert data.dtype.name in result - def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py new file mode 100644 index 0000000000000..3d224c2b06c0b --- /dev/null +++ b/pandas/tests/extension/base/printing.py @@ -0,0 +1,38 @@ +import io + +import pandas as pd +import pytest + +from .base import BaseExtensionTests + + +class BasePrintingTests(BaseExtensionTests): + """Tests checking the formatting of your EA when printed.""" + + @pytest.mark.parametrize("size", ["big", "small"]) + def test_array_repr(self, data, size): + if size == "small": + data = data[:5] + else: + data = type(data)._concat_same_type([data] * 5) + + result = repr(data) + assert data.__class__.__name__ in result + assert 'Length: {}'.format(len(data)) in result + assert str(data.dtype) in result + if size == 'big': + assert '...' 
in result + + def test_series_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + def test_dataframe_repr(self, data): + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = io.StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 3c8905c578c4f..79e81f1034c6d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -114,9 +114,6 @@ def __setitem__(self, key, value): def __len__(self): return len(self._data) - def __repr__(self): - return 'DecimalArray({!r})'.format(self._data) - @property def nbytes(self): n = len(self) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index af5f6bf0a2f65..1a37b781dfcda 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -3,12 +3,12 @@ import operator import numpy as np -import pytest import pandas as pd from pandas import compat from pandas.tests.extension import base import pandas.util.testing as tm +import pytest from .array import DecimalArray, DecimalDtype, make_data, to_decimal @@ -200,6 +200,10 @@ class TestSetitem(BaseDecimal, base.BaseSetitemTests): pass +class TestPrinting(BaseDecimal, base.BasePrintingTests): + pass + + # TODO(extension) @pytest.mark.xfail(reason=( "raising AssertionError as this is not implemented, " diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 2c6e74fda8a0e..d58b7ddf29123 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -115,9 +115,6 @@ def __setitem__(self, key, value): def __len__(self): return len(self.data) - def __repr__(self): - return 'JSONArary({!r})'.format(self.data) - @property def nbytes(self): 
return sys.getsizeof(self.data) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index b7c61496f0bf0..6934b9affb39a 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -1,13 +1,12 @@ import collections import operator -import pytest - from pandas.compat import PY2, PY36 import pandas as pd from pandas.tests.extension import base import pandas.util.testing as tm +import pytest from .array import JSONArray, JSONDtype, make_data @@ -283,3 +282,7 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests): pass + + +class TestPrinting(BaseJSON, base.BasePrintingTests): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index efee647389884..4d5aa54b65320 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -14,7 +14,6 @@ """ import numpy as np -import pytest from pandas.core.dtypes.common import is_extension_array_dtype @@ -24,6 +23,7 @@ Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) from pandas.tests.extension import base +import pytest def make_data(): @@ -210,3 +210,7 @@ class TestNumericReduce(base.BaseNumericReduceTests): class TestBooleanReduce(base.BaseBooleanReduceTests): pass + + +class TestPrinting(base.BasePrintingTests): + pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 2c7bc79c324b4..8924c87200cae 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -14,7 +14,6 @@ """ import numpy as np -import pytest from pandas.core.dtypes.dtypes import IntervalDtype @@ -22,6 +21,7 @@ from pandas.core.arrays import IntervalArray from pandas.tests.extension import base import pandas.util.testing as tm +import pytest def make_data(): @@ -147,3 +147,7 @@ 
class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): pass + + +class TestPrinting(BaseInterval, base.BasePrintingTests): + pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 83f30aed88e65..d6d6a6366f49f 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas._libs.tslib import iNaT @@ -9,6 +8,7 @@ from pandas.core.arrays import PeriodArray from pandas.tests.extension import base import pandas.util.testing as tm +import pytest @pytest.fixture @@ -155,3 +155,7 @@ class TestSetitem(BasePeriodTests, base.BaseSetitemTests): class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): pass + + +class TestPrinting(BasePeriodTests, base.BasePrintingTests): + pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4f67a13215cfd..891e5f4dd9a95 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -316,3 +316,9 @@ def _compare_other(self, s, data, op_name, other): s = pd.Series(data) result = op(s, other) tm.assert_series_equal(result, expected) + + +class TestPrinting(BaseSparseTests, base.BasePrintingTests): + @pytest.mark.xfail(reason='Different repr', strict=True) + def test_array_repr(self, data, size): + super(TestPrinting, self).test_array_repr(data, size) From ace62aaedf9e877de8bd937eda37d04398a196d7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 12:07:47 -0600 Subject: [PATCH 02/39] Deprecate formatting_values --- doc/source/whatsnew/v0.24.0.txt | 3 ++- pandas/core/arrays/base.py | 13 +++++++++---- pandas/core/arrays/categorical.py | 3 --- pandas/core/arrays/integer.py | 4 ---- pandas/core/arrays/interval.py | 3 --- pandas/core/arrays/period.py | 5 +++++ pandas/core/internals/blocks.py | 18 ++++++++++++++++-- 
pandas/io/formats/printing.py | 10 +++++----- pandas/tests/arrays/test_period.py | 6 +++--- pandas/tests/extension/decimal/test_decimal.py | 3 ++- 10 files changed, 42 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 1378caeac5edb..ac985e11c1203 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -857,7 +857,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). -- A default repr is now provided. +- A default repr for ExtensionArrays is now provided (:issue:`23601`). .. _whatsnew_0240.api.incompatibilities: @@ -967,6 +967,7 @@ Deprecations - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) +- :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`) .. 
_whatsnew_0240.deprecations.datetimelike_int_ops: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 41ec0971bef7c..238dbdd5576b2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -673,20 +673,25 @@ def __repr__(self): # the short repr has no trailing newline, while the truncated # repr does. So we include a newline in our template, and strip # any trailing newlines from format_object_summary - data = format_object_summary(self, self._formatter, name=False, + data = format_object_summary(self, self._formatter(), name=False, trailing_comma=False).rstrip('\n') name = self.__class__.__name__ return template.format(class_name=name, data=data, length=len(self), dtype=self.dtype) - @property - def _formatter(self): - # type: () -> Callable[Any] + def _formatter(self, boxed=False): + # type: (bool) -> Callable[Any] """Formatting function for scalar values. This is used in the default '__repr__'. The formatting function receives instances of your scalar type. + + Parameters + ---------- + boxed: bool, default False + Whether the formatter is to be used by pandas inside a Series + or DataFrame repr. """ return str diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b74938a9c7b18..f25009ba72cc9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2344,9 +2344,6 @@ def _concat_same_type(self, to_concat): return _concat_categorical(to_concat) - def _formatting_values(self): - return self - def isin(self, values): """ Check whether `values` are contained in Categorical. 
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 27c845d666e60..3224284788dc1 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -297,10 +297,6 @@ def __iter__(self): else: yield self._data[i] - def _formatting_values(self): - # type: () -> np.ndarray - return self._coerce_to_ndarray() - def take(self, indexer, allow_fill=False, fill_value=None): from pandas.api.extensions import take diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1b3687e555748..51b406214ed85 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -689,9 +689,6 @@ def copy(self, deep=False): # TODO: Could skip verify_integrity here. return type(self).from_arrays(left, right, closed=closed) - def _formatting_values(self): - return np.asarray(self) - def isna(self): return isna(self.left) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f6996f8e62d4d..b3cfe8860fccb 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -330,6 +330,11 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + def __setitem__( self, key, # type: Union[int, Sequence[int], Sequence[bool]] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1f2a1ee52159e..600b2d18a7554 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -66,7 +66,7 @@ import pandas.core.missing as missing from pandas.core.base import PandasObject -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -1951,7 +1951,21 @@ def _slice(self, slicer): return self.values[slicer] def formatting_values(self): - return self.values._formatting_values() 
+ # Deprecating the ability to override _formatting_values. + # Do the warning here, it's only user in pandas, since we + # have to check if the subclass overrode it. + fv = getattr(type(self.values), '_formatting_values', None) + if fv is not ExtensionArray._formatting_values: + msg = ( + "'ExtensionArray._formatting_values' is deprecated. " + "Specify 'ExtensionArray._formatter' instead." + ) + warnings.warn(msg, FutureWarning) + return self.values._formatting_values() + + # the future implementation (and current, if not overrode) + formatter = self.values._formatter(boxed=True) + return np.array([formatter(x) for x in self.values], dtype=object) def concat_same_type(self, to_concat, placement=None): """ diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 3d56b4b222ba6..baeab662db9f1 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -341,19 +341,19 @@ def best_len(values): return 0 if trailing_comma: - close = ', ' + close = u', ' else: - close = '' + close = u'' if n == 0: - summary = '[]{}'.format(close) + summary = u'[]{}'.format(close) elif n == 1: first = formatter(obj[0]) - summary = '[{}]{}'.format(first, close) + summary = u'[{}]{}'.format(first, close) elif n == 2: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = '[{}, {}]{}'.format(first, last, close) + summary = u'[{}, {}]{}'.format(first, last, close) else: if n > max_seq_items: diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 553f187e2e8fe..cc9fef90d959f 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -204,9 +204,9 @@ def test_repr_small(): arr = period_array(['2000', '2001'], freq='D') result = str(arr) expected = ( - '<PeriodArray>\n' - '[2000-01-01, 2001-01-01]\n' - 'Length: 2, dtype: period[D]' + "<PeriodArray>\n" + "['2000-01-01', '2001-01-01']\n" + "Length: 2, dtype: period[D]" ) assert result == expected diff --git 
a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 1a37b781dfcda..2e7afc378fa44 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -188,7 +188,8 @@ def test_value_counts(self, all_data, dropna): class TestCasting(BaseDecimal, base.BaseCastingTests): - pass + pytestmark = pytest.mark.skipif(compat.PY2, + reason="Unhashble dtype in Py2.") class TestGroupby(BaseDecimal, base.BaseGroupbyTests): From 6e76b51dced8477d6e63c175854180838aa6283e Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 12:16:03 -0600 Subject: [PATCH 03/39] test for warning --- pandas/core/internals/blocks.py | 2 +- pandas/tests/extension/decimal/test_decimal.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 600b2d18a7554..b9c13196aea1d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1960,7 +1960,7 @@ def formatting_values(self): "'ExtensionArray._formatting_values' is deprecated. " "Specify 'ExtensionArray._formatter' instead." 
) - warnings.warn(msg, FutureWarning) + warnings.warn(msg, FutureWarning, stacklevel=10) return self.values._formatting_values() # the future implementation (and current, if not overrode) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 2e7afc378fa44..5dd53439df47f 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -202,7 +202,8 @@ class TestSetitem(BaseDecimal, base.BaseSetitemTests): class TestPrinting(BaseDecimal, base.BasePrintingTests): - pass + pytestmark = pytest.mark.skipif(compat.PY2, + reason="Unhashble dtype in Py2.") # TODO(extension) @@ -384,3 +385,14 @@ def test_divmod_array(reverse, expected_div, expected_mod): tm.assert_extension_array_equal(div, expected_div) tm.assert_extension_array_equal(mod, expected_mod) + + +def test_formatting_values_deprecated(): + class DecimalArray2(DecimalArray): + def _formatting_values(self): + return np.array(self) + + ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')])) + + with tm.assert_produces_warning(FutureWarning): + repr(ser) From fef04e64927e749c4788a3a2df71b666aecbca5a Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 12:19:58 -0600 Subject: [PATCH 04/39] compat --- pandas/tests/extension/decimal/test_decimal.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 5dd53439df47f..3f7581484ee7e 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -393,6 +393,9 @@ def _formatting_values(self): return np.array(self) ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')])) + # different levels for 2 vs. 
3 + check_stacklevel = compat.PY3 - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=check_stacklevel): repr(ser) From 1885a97203ecba615657b197d9a2f178aa105065 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 13:22:29 -0600 Subject: [PATCH 05/39] na formatter --- pandas/core/arrays/integer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3224284788dc1..984ef4c8c673c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -264,6 +264,13 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) + def _formatter(self, boxed=False): + def fmt(x): + if isna(x): + return 'NaN' + return str(x) + return fmt + def __getitem__(self, item): if is_integer(item): if self._mask[item]: From ecfcd72043f1d5e2e742f652f991f42006a4cf19 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 13:33:25 -0600 Subject: [PATCH 06/39] clean --- pandas/io/formats/printing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index baeab662db9f1..61c62d49cc6cc 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -272,8 +272,7 @@ class TableSchemaFormatter(BaseFormatter): def format_object_summary(obj, formatter, is_justify=True, name=None, - trailing_comma=True, - truncated_trailing_newline=True): + trailing_comma=True): """ Return the formatted obj as a unicode string From 37638cc9a820ce7981350e2ceefcea9317a9e2f8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 14:29:25 -0600 Subject: [PATCH 07/39] wip --- pandas/core/arrays/base.py | 6 +-- pandas/core/internals/blocks.py | 4 +- 
pandas/io/formats/format.py | 60 +++++++++++++----------------- pandas/tests/arrays/test_period.py | 20 +++++----- 4 files changed, 40 insertions(+), 50 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 238dbdd5576b2..e15eb74119578 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -680,8 +680,8 @@ def __repr__(self): length=len(self), dtype=self.dtype) - def _formatter(self, boxed=False): - # type: (bool) -> Callable[Any] + def _formatter(self, formatter): + # type: (ExtensionArrayFormatter) -> Callable[Any] """Formatting function for scalar values. This is used in the default '__repr__'. The formatting function @@ -693,7 +693,7 @@ def _formatter(self, boxed=False): Whether the formatter is to be used by pandas inside a Series or DataFrame repr. """ - return str + return formatter.formatter or str def _formatting_values(self): # type: () -> np.ndarray diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b9c13196aea1d..267b769ce7a65 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1963,9 +1963,7 @@ def formatting_values(self): warnings.warn(msg, FutureWarning, stacklevel=10) return self.values._formatting_values() - # the future implementation (and current, if not overrode) - formatter = self.values._formatter(boxed=True) - return np.array([formatter(x) for x in self.values], dtype=object) + return self.values def concat_same_type(self, to_concat, placement=None): """ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6f64605bcf175..22d91a47c4082 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -16,11 +16,12 @@ from pandas.compat import StringIO, lzip, map, u, zip from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_dtype, is_datetimetz, is_float, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, 
is_period_arraylike, is_scalar, - is_timedelta64_dtype) -from pandas.core.dtypes.generic import ABCMultiIndex, ABCSparseArray + is_categorical_dtype, is_datetime64_dtype, is_datetimetz, + is_extension_array_dtype, is_float, is_float_dtype, is_integer, + is_integer_dtype, is_list_like, is_numeric_dtype, is_period_arraylike, + is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.generic import ( + ABCIndex, ABCMultiIndex, ABCSeries, ABCSparseArray) from pandas.core.dtypes.missing import isna, notna from pandas import compat @@ -849,22 +850,20 @@ def _get_column_name_list(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if is_categorical_dtype(values): - fmt_klass = CategoricalArrayFormatter - elif is_interval_dtype(values): - fmt_klass = IntervalArrayFormatter + if is_period_arraylike(values): + fmt_klass = PeriodArrayFormatter + elif is_datetime64_dtype(values.dtype): + fmt_klass = Datetime64Formatter + elif is_timedelta64_dtype(values.dtype): + fmt_klass = Timedelta64Formatter + elif is_extension_array_dtype(values.dtype): + fmt_klass = ExtensionArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter - elif is_period_arraylike(values): - fmt_klass = PeriodArrayFormatter elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter elif is_datetimetz(values): fmt_klass = Datetime64TZFormatter - elif is_datetime64_dtype(values.dtype): - fmt_klass = Datetime64Formatter - elif is_timedelta64_dtype(values.dtype): - fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter @@ -1126,14 +1125,18 @@ def _format_strings(self): return fmt_values.tolist() -class IntervalArrayFormatter(GenericArrayFormatter): - - def __init__(self, values, *args, **kwargs): - GenericArrayFormatter.__init__(self, values, *args, **kwargs) - +class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or str - 
fmt_values = np.array([formatter(x) for x in self.values]) + values = self.values + if isinstance(values, (ABCIndex, ABCSeries)): + values = values._values + + formatter = self.values._formatter(self) + fmt_values = format_array(np.asarray(self.values), + formatter, + float_format=self.float_format, + na_rep=self.na_rep, digits=self.digits, + space=self.space, justify=self.justify) return fmt_values @@ -1152,19 +1155,6 @@ def _format_strings(self): return fmt_values -class CategoricalArrayFormatter(GenericArrayFormatter): - - def __init__(self, values, *args, **kwargs): - GenericArrayFormatter.__init__(self, values, *args, **kwargs) - - def _format_strings(self): - fmt_values = format_array(self.values.get_values(), self.formatter, - float_format=self.float_format, - na_rep=self.na_rep, digits=self.digits, - space=self.space, justify=self.justify) - return fmt_values - - def format_percentiles(percentiles): """ Outputs rounded and formatted percentiles. diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index cc9fef90d959f..245d932bb139e 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -215,14 +215,16 @@ def test_repr_large(): arr = period_array(['2000', '2001'] * 500, freq='D') result = str(arr) expected = ( - '<PeriodArray>\n' - '[2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01, 2000-01-01, ' - '2001-01-01,\n' # continuation - ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01,\n' - ' ...\n' - ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01, 2000-01-01, ' - '2001-01-01,\n' # continuation - ' 2000-01-01, 2001-01-01, 2000-01-01, 2001-01-01]\n' - 'Length: 1000, dtype: period[D]' + "<PeriodArray>\n" + "['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', " + "'2000-01-01',\n" + " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', " + "'2001-01-01',\n" + " ...\n" + " '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', " + "'2000-01-01',\n" + " '2001-01-01', '2000-01-01', 
'2001-01-01', '2000-01-01', " + "'2001-01-01']\n" + "Length: 1000, dtype: period[D]" ) assert result == expected From 6e64b7bd87d8fb6e79151863e367e55466cffda4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 14:47:45 -0600 Subject: [PATCH 08/39] more cleanup --- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/categorical.py | 4 ++++ pandas/io/formats/format.py | 32 +++++++------------------------ 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e15eb74119578..d4a8da03913c3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -680,7 +680,7 @@ def __repr__(self): length=len(self), dtype=self.dtype) - def _formatter(self, formatter): + def _formatter(self, formatter=None): # type: (ExtensionArrayFormatter) -> Callable[Any] """Formatting function for scalar values. @@ -693,7 +693,7 @@ def _formatter(self, formatter): Whether the formatter is to be used by pandas inside a Series or DataFrame repr. """ - return formatter.formatter or str + return getattr(formatter, 'formatter', None) or str def _formatting_values(self): # type: () -> np.ndarray diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f25009ba72cc9..82edddb4c2414 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -499,6 +499,10 @@ def _constructor(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) + def _formatter(self, formatter): + # backwards compat with old printing. + return None + def copy(self): """ Copy constructor. 
""" return self._constructor(values=self._codes.copy(), diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 22d91a47c4082..42e3a633b2f42 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -18,10 +18,10 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, is_datetimetz, is_extension_array_dtype, is_float, is_float_dtype, is_integer, - is_integer_dtype, is_list_like, is_numeric_dtype, is_period_arraylike, - is_scalar, is_timedelta64_dtype) + is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar, + is_timedelta64_dtype) from pandas.core.dtypes.generic import ( - ABCIndex, ABCMultiIndex, ABCSeries, ABCSparseArray) + ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray) from pandas.core.dtypes.missing import isna, notna from pandas import compat @@ -30,7 +30,6 @@ from pandas.core.config import get_option, set_option from pandas.core.index import Index, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex from pandas.io.common import _expand_user, _stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing @@ -850,9 +849,7 @@ def _get_column_name_list(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if is_period_arraylike(values): - fmt_klass = PeriodArrayFormatter - elif is_datetime64_dtype(values.dtype): + if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter @@ -1128,11 +1125,11 @@ def _format_strings(self): class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self): values = self.values - if isinstance(values, (ABCIndex, ABCSeries)): + if isinstance(values, (ABCIndexClass, ABCSeries)): values = values._values - formatter = self.values._formatter(self) - fmt_values = 
format_array(np.asarray(self.values), + formatter = values._formatter(self) + fmt_values = format_array(np.asarray(values), formatter, float_format=self.float_format, na_rep=self.na_rep, digits=self.digits, @@ -1140,21 +1137,6 @@ def _format_strings(self): return fmt_values -class PeriodArrayFormatter(IntArrayFormatter): - - def _format_strings(self): - from pandas.core.indexes.period import IncompatibleFrequency - try: - values = PeriodIndex(self.values).to_native_types() - except IncompatibleFrequency: - # periods may contains different freq - values = Index(self.values, dtype='object').to_native_types() - - formatter = self.formatter or (lambda x: '{x}'.format(x=x)) - fmt_values = [formatter(x) for x in values] - return fmt_values - - def format_percentiles(percentiles): """ Outputs rounded and formatted percentiles. From 193747e9c3b5d09315e8848495e9e6f91437c733 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 14:56:09 -0600 Subject: [PATCH 09/39] update docs, type --- pandas/core/arrays/base.py | 16 ++++++++++++---- pandas/core/arrays/integer.py | 14 ++++++++------ pandas/core/arrays/period.py | 4 ++-- pandas/tests/arrays/test_integer.py | 5 +++++ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d4a8da03913c3..be0b272f2816f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -681,7 +681,7 @@ def __repr__(self): dtype=self.dtype) def _formatter(self, formatter=None): - # type: (ExtensionArrayFormatter) -> Callable[Any] + # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] """Formatting function for scalar values. This is used in the default '__repr__'. The formatting function @@ -689,9 +689,17 @@ def _formatter(self, formatter=None): Parameters ---------- - boxed: bool, default False - Whether the formatter is to be used by pandas inside a Series - or DataFrame repr. 
+ formatter: GenericArrayFormatter, optional + The formatter this array is being rendered with. The formatter + may have a `.formatter` method already defined. By default, this + will be used if a `formatter` is passed, otherwise the formatter + is ``str``. + + Returns + ------- + Callable[[Any], str] + A callable that gets instances of the scalar type and + returns a string. """ return getattr(formatter, 'formatter', None) or str diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 984ef4c8c673c..98eb9df98328f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -264,12 +264,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return 'NaN' - return str(x) - return fmt + def _formatter(self, formatter=None): + if formatter is None: + def fmt(x): + if isna(x): + return 'NaN' + return str(x) + return fmt + return formatter.formatter def __getitem__(self, item): if is_integer(item): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4f2848b3f5591..36a00db7625ce 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -330,8 +330,8 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') - def _formatter(self, boxed=False): - if boxed: + def _formatter(self, formatter=None): + if formatter: return str return "'{}'".format diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 6b99ede3436ce..94bf3dfd938a1 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -67,6 +67,11 @@ def test_repr_array(data): assert 'dtype: ' in result +def test_na_repr(data): + result = repr(integer_array([1, None])) + assert 'NaN' in result + + def test_repr_array_long(data): # some arrays may be able to assert a ... 
in the repr with pd.option_context('display.max_seq_items', 1): From 5a2e1e4bc5f03f84499f143861c547e9e248e692 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 15:02:39 -0600 Subject: [PATCH 10/39] format --- pandas/core/arrays/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index be0b272f2816f..7bdb54bec02fb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -662,6 +662,7 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ + def __repr__(self): from pandas.io.formats.printing import format_object_summary From 1635b7319264829a2b1136220f4780e2965ad907 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 15:35:53 -0600 Subject: [PATCH 11/39] try this --- pandas/io/formats/format.py | 9 ++++++++- pandas/tests/frame/test_repr_info.py | 12 ++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 42e3a633b2f42..a7567e52f3077 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1129,7 +1129,14 @@ def _format_strings(self): values = values._values formatter = values._formatter(self) - fmt_values = format_array(np.asarray(values), + + if is_categorical_dtype(values.dtype): + # Categorical is special for now, so that we can preserve tzinfo + array = values.get_values() + else: + array = np.asarray(values) + + fmt_values = format_array(array, formatter, float_format=self.float_format, na_rep=self.na_rep, digits=self.digits, diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 668613c494a47..96c523b10197f 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -514,12 
+514,12 @@ def test_repr_categorical_dates_periods(self): tz='US/Eastern') p = period_range('2011-01', freq='M', periods=5) df = DataFrame({'dt': dt, 'p': p}) - exp = """ dt p -0 2011-01-01 09:00:00-05:00 2011-01 -1 2011-01-01 10:00:00-05:00 2011-02 -2 2011-01-01 11:00:00-05:00 2011-03 -3 2011-01-01 12:00:00-05:00 2011-04 -4 2011-01-01 13:00:00-05:00 2011-05""" + exp = """ dt p +0 2011-01-01 09:00:00-05:00 2011-01 +1 2011-01-01 10:00:00-05:00 2011-02 +2 2011-01-01 11:00:00-05:00 2011-03 +3 2011-01-01 12:00:00-05:00 2011-04 +4 2011-01-01 13:00:00-05:00 2011-05""" df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)}) assert repr(df) == exp From e2b1941f72ace652af8d125e6ef4aa85d18f6112 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 16:05:33 -0600 Subject: [PATCH 12/39] updates --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/arrays/period.py | 2 +- pandas/core/arrays/sparse.py | 10 ++++++++ pandas/tests/series/test_repr.py | 40 ++++++++++++++++---------------- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ac985e11c1203..1b839866ba190 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1120,6 +1120,7 @@ Datetimelike - Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) - Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) - Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) +- Bug in the :class:`Series` repr with Period data missing a space before the data (:issue:`23601`) - Bug in 
:func:`date_range` when decrementing a start date to a past end date by a negative frequency (:issue:`23270`) - Bug in :meth:`Series.min` which would return ``NaN`` instead of ``NaT`` when called on a series of ``NaT`` (:issue:`23282`) - Bug in :func:`DataFrame.combine` with datetimelike values raising a TypeError (:issue:`23079`) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 36a00db7625ce..6d7ab6e6176d1 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -332,7 +332,7 @@ def end_time(self): def _formatter(self, formatter=None): if formatter: - return str + return formatter.formatter or str return "'{}'".format def __setitem__( diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index a63b3fb53625f..dd0d4ba587c6c 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1661,6 +1661,16 @@ def __unicode__(self): fill=printing.pprint_thing(self.fill_value), index=printing.pprint_thing(self.sp_index)) + def _formatter(self, formatter=None): + if formatter is None: + def fmt(x): + if isna(x) and isinstance(x, float): + return 'NaN' + return str(x) + + return fmt + return formatter.formatter + SparseArray._add_arithmetic_ops() SparseArray._add_comparison_ops() diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index ef96274746655..c4a0496f7fb27 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -364,11 +364,11 @@ def test_categorical_series_repr_datetime_ordered(self): def test_categorical_series_repr_period(self): idx = period_range('2011-01-01 09:00', freq='H', periods=5) s = Series(Categorical(idx)) - exp = """0 2011-01-01 09:00 -1 2011-01-01 10:00 -2 2011-01-01 11:00 -3 2011-01-01 12:00 -4 2011-01-01 13:00 + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 dtype: category Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 
2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa @@ -377,11 +377,11 @@ def test_categorical_series_repr_period(self): idx = period_range('2011-01', freq='M', periods=5) s = Series(Categorical(idx)) - exp = """0 2011-01 -1 2011-02 -2 2011-03 -3 2011-04 -4 2011-05 + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 dtype: category Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" @@ -390,11 +390,11 @@ def test_categorical_series_repr_period(self): def test_categorical_series_repr_period_ordered(self): idx = period_range('2011-01-01 09:00', freq='H', periods=5) s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01-01 09:00 -1 2011-01-01 10:00 -2 2011-01-01 11:00 -3 2011-01-01 12:00 -4 2011-01-01 13:00 + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 dtype: category Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa @@ -403,11 +403,11 @@ def test_categorical_series_repr_period_ordered(self): idx = period_range('2011-01', freq='M', periods=5) s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01 -1 2011-02 -2 2011-03 -3 2011-04 -4 2011-05 + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 dtype: category Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" From 48e55ccaa06908a6dab1b27cfef63f00549771ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 20:52:28 -0600 Subject: [PATCH 13/39] fixup interval --- pandas/core/arrays/interval.py | 10 ++++++++++ pandas/tests/extension/test_interval.py | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 51b406214ed85..244952e8e884c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -832,6 +832,16 @@ def 
_format_data(self): return summary + def __repr__(self): + tpl = textwrap.dedent("""\ + {cls}({data}, + {lead}closed='{closed}', + {lead}dtype='{dtype}')""") + return tpl.format(cls=self.__class__.__name__, + data=self._format_data(), + lead=' ' * len(self.__class__.__name__) + ' ', + closed=self.closed, dtype=self.dtype) + def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 8924c87200cae..281a61cb55463 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -150,4 +150,6 @@ class TestSetitem(BaseInterval, base.BaseSetitemTests): class TestPrinting(BaseInterval, base.BasePrintingTests): - pass + @pytest.mark.skip(reason="custom repr") + def test_array_repr(self, data, size): + pass From d8e7ba475b358877d3fcd5c93a7ba607de4cc2af Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri, 9 Nov 2018 21:08:41 -0600 Subject: [PATCH 14/39] py2 compat --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 267b769ce7a65..2a8d49feb0bad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1955,7 +1955,7 @@ def formatting_values(self): # Do the warning here, it's only user in pandas, since we # have to check if the subclass overrode it. fv = getattr(type(self.values), '_formatting_values', None) - if fv is not ExtensionArray._formatting_values: + if fv and fv != ExtensionArray._formatting_values: msg = ( "'ExtensionArray._formatting_values' is deprecated. " "Specify 'ExtensionArray._formatter' instead." 
From b312fe410bbcc67fd9db0ca3218a85df2af22f80 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 06:22:33 -0600 Subject: [PATCH 15/39] revert interval --- pandas/tests/arrays/interval/test_interval.py | 32 ++++--------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 1138f64e2009a..9a191dda3a73a 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- import numpy as np +import pytest -from pandas import Index, date_range, option_context, timedelta_range +from pandas import Index, IntervalIndex, date_range, timedelta_range from pandas.core.arrays import IntervalArray import pandas.util.testing as tm -import pytest @pytest.fixture(params=[ @@ -65,26 +65,8 @@ def test_set_na(self, left_right_dtypes): tm.assert_extension_array_equal(result, expected) -def test_repr_small(): - arr = IntervalArray.from_breaks([1, 2, 3]) - result = repr(arr) - expected = ( - '<IntervalArray>\n' - '[(1, 2], (2, 3]]\n' - 'Length: 2, dtype: interval[int64]' - ) - assert result == expected - - -def test_repr_large(): - arr = IntervalArray.from_breaks([1, 2, 3, 4, 5, 6]) - with option_context('display.max_seq_items', 2): - result = repr(arr) - expected = ( - '<IntervalArray>\n' - '[(1, 2],\n' - ' ...\n' - ' (5, 6]] \n' - 'Length: 5, dtype: interval[int64]' - ) - assert result == expected +def test_repr_matches(): + idx = IntervalIndex.from_breaks([1, 2, 3]) + a = repr(idx) + b = repr(idx.values) + assert a.replace("Index", "Array") == b From 445736d939dc1ce8615767ccb79f84c8bfdad947 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 06:30:57 -0600 Subject: [PATCH 16/39] unicode, bytes --- pandas/core/arrays/base.py | 15 +++++++++++++++ pandas/tests/extension/base/printing.py | 12 
++++++++++++ 2 files changed, 27 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7bdb54bec02fb..cfcf2e98c2bc6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -9,6 +9,8 @@ import operator +from pandas import compat +from pandas.core.config import get_option from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv @@ -662,6 +664,19 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ + def __unicode__(self): + result = str(self) + if compat.PY2: + encoding = get_option("display.encoding") + result = result.decode(encoding) + return result + + def __bytes__(self): + result = str(self) + if compat.PY3: + encoding = get_option("display.encoding") + result = result.encode(encoding) + return result def __repr__(self): from pandas.io.formats.printing import format_object_summary diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 3d224c2b06c0b..512ef23f4c247 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -1,6 +1,7 @@ import io import pandas as pd +from pandas import compat import pytest from .base import BaseExtensionTests @@ -23,6 +24,17 @@ def test_array_repr(self, data, size): if size == 'big': assert '...' 
in result + def test_array_repr_bytes(self, data): + result = bytes(data) + if compat.PY2: + assert isinstance(result, str) + else: + assert isinstance(result, bytes) + + def test_array_repr_unicode(self, data): + result = compat.u(data) + assert isinstance(result, compat.text_type) + def test_series_repr(self, data): ser = pd.Series(data) assert data.dtype.name in repr(ser) From 60e0d028b53d99a0ea52ed0943e70971081e342f Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 06:36:39 -0600 Subject: [PATCH 17/39] isort --- pandas/tests/extension/base/printing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 512ef23f4c247..f13a3c244ccb8 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -1,8 +1,9 @@ import io +import pytest + import pandas as pd from pandas import compat -import pytest from .base import BaseExtensionTests From 5b07906cf28879ba174d3f356eab10df1fd8a858 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 06:52:16 -0600 Subject: [PATCH 18/39] py3 fixup --- pandas/tests/extension/base/printing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index f13a3c244ccb8..a04863b7a63b1 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -33,7 +33,10 @@ def test_array_repr_bytes(self, data): assert isinstance(result, bytes) def test_array_repr_unicode(self, data): - result = compat.u(data) + if compat.PY2: + result = compat.u(data) + else: + result = str(data) assert isinstance(result, compat.text_type) def test_series_repr(self, data): From ff0c9981986cad9877ddcdf593e0c424b0b3f25d Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 
Nov 2018 14:41:43 -0600 Subject: [PATCH 19/39] fixup --- pandas/core/arrays/base.py | 32 +++++++++++++++---------------- pandas/core/arrays/categorical.py | 2 -- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cfcf2e98c2bc6..054ff1b360e8f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -665,26 +665,12 @@ def copy(self, deep=False): # Printing # ------------------------------------------------------------------------ def __unicode__(self): - result = str(self) - if compat.PY2: - encoding = get_option("display.encoding") - result = result.decode(encoding) - return result - - def __bytes__(self): - result = str(self) - if compat.PY3: - encoding = get_option("display.encoding") - result = result.encode(encoding) - return result - - def __repr__(self): from pandas.io.formats.printing import format_object_summary template = ( - '<{class_name}>\n' - '{data}\n' - 'Length: {length}, dtype: {dtype}' + u'<{class_name}>\n' + u'{data}\n' + u'Length: {length}, dtype: {dtype}' ) # the short repr has no trailing newline, while the truncated # repr does. So we include a newline in our template, and strip @@ -696,6 +682,18 @@ def __repr__(self): length=len(self), dtype=self.dtype) + def __str__(self): + if compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + encoding = get_option("display.encoding") + return self.__unicode__().encode(encoding, 'replace') + + def __repr__(self): + return str(self) + def _formatter(self, formatter=None): # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] """Formatting function for scalar values. 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 82edddb4c2414..3593f83770114 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1990,8 +1990,6 @@ def __unicode__(self): return result - __repr__ = __unicode__ - def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': From 2fd3d5d0dba6369503ad0d71fda3155b1ffb8bcf Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 14:45:22 -0600 Subject: [PATCH 20/39] unicode --- pandas/core/arrays/base.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 054ff1b360e8f..3e529cc0a4c11 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -9,8 +9,7 @@ import operator -from pandas import compat -from pandas.core.config import get_option +from pandas.core.base import StringMixin from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv @@ -21,7 +20,7 @@ _not_implemented_message = "{} does not implement {}." -class ExtensionArray(object): +class ExtensionArray(StringMixin): """Abstract base class for custom 1-D array types. pandas will recognize instances of this class as proper arrays @@ -682,18 +681,6 @@ def __unicode__(self): length=len(self), dtype=self.dtype) - def __str__(self): - if compat.PY3: - return self.__unicode__() - return self.__bytes__() - - def __bytes__(self): - encoding = get_option("display.encoding") - return self.__unicode__().encode(encoding, 'replace') - - def __repr__(self): - return str(self) - def _formatter(self, formatter=None): # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] """Formatting function for scalar values. 
From 5d8d2fc9c7e7d0541d991e2f3fb107e57bd23a09 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 14:45:55 -0600 Subject: [PATCH 21/39] unicode --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3e529cc0a4c11..b6eba6a17359a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -55,7 +55,7 @@ class ExtensionArray(StringMixin): by overriding: * _formatter - * __repr__ + * __unicode__ Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When From 4d343eafead092d5d8a6895f4df9c7206b5510fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 16:30:55 -0600 Subject: [PATCH 22/39] unicode --- pandas/tests/extension/base/printing.py | 5 +---- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index a04863b7a63b1..7a41e7689f4a6 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -33,10 +33,7 @@ def test_array_repr_bytes(self, data): assert isinstance(result, bytes) def test_array_repr_unicode(self, data): - if compat.PY2: - result = compat.u(data) - else: - result = str(data) + result = compat.text_type(data) assert isinstance(result, compat.text_type) def test_series_repr(self, data): diff --git a/setup.cfg b/setup.cfg index 4726a0ddb2fb2..2e07182196d5b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -90,7 +90,7 @@ known_post_core=pandas.tseries,pandas.io,pandas.plotting sections=FUTURE,STDLIB,THIRDPARTY,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party=pandas -known_third_party=Cython,numpy,python-dateutil,pytz,pyarrow +known_third_party=Cython,numpy,python-dateutil,pytz,pyarrow,pytest multi_line_output=4 
force_grid_wrap=0 combine_as_imports=True From 5b291d50e8722a9ef72ffa90f0476ff85a7edb66 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sat, 10 Nov 2018 16:33:20 -0600 Subject: [PATCH 23/39] lint --- pandas/tests/arrays/test_integer.py | 2 +- pandas/tests/arrays/test_period.py | 2 +- pandas/tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/json/test_json.py | 3 ++- pandas/tests/extension/test_integer.py | 2 +- pandas/tests/extension/test_interval.py | 2 +- pandas/tests/extension/test_period.py | 2 +- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 206cfab3ab6a4..ebae1a2797bdc 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import numpy as np +import pytest from pandas.core.dtypes.generic import ABCIndexClass @@ -11,7 +12,6 @@ UInt32Dtype, UInt64Dtype) from pandas.tests.extension.base import BaseOpsUtil import pandas.util.testing as tm -import pytest def make_data(): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 95c80ba61cc9a..f4fb8f2814a70 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency @@ -9,7 +10,6 @@ import pandas as pd from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm -import pytest # ---------------------------------------------------------------------------- # Constructors diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index ed9f1acc10a6a..0ddb09c9b5617 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -3,12 +3,12 @@ import operator import 
numpy as np +import pytest import pandas as pd from pandas import compat from pandas.tests.extension import base import pandas.util.testing as tm -import pytest from .array import DecimalArray, DecimalDtype, make_data, to_decimal diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index a9244a1ab8436..29e4289226c68 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -1,12 +1,13 @@ import collections import operator +import pytest + from pandas.compat import PY2, PY36 import pandas as pd from pandas.tests.extension import base import pandas.util.testing as tm -import pytest from .array import JSONArray, JSONDtype, make_data diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 4d5aa54b65320..0abae56ef8723 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -14,6 +14,7 @@ """ import numpy as np +import pytest from pandas.core.dtypes.common import is_extension_array_dtype @@ -23,7 +24,6 @@ Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) from pandas.tests.extension import base -import pytest def make_data(): diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 9f49c2c920a95..644f3ef94f40b 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -14,13 +14,13 @@ """ import numpy as np +import pytest from pandas.core.dtypes.dtypes import IntervalDtype from pandas import Interval from pandas.core.arrays import IntervalArray from pandas.tests.extension import base -import pytest def make_data(): diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index be74e7b330934..08e21fc30ad10 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -1,4 +1,5 @@ import numpy as np 
+import pytest from pandas._libs.tslib import iNaT @@ -7,7 +8,6 @@ import pandas as pd from pandas.core.arrays import PeriodArray from pandas.tests.extension import base -import pytest @pytest.fixture From 1b93bf0cc06a5d5fc5b54e9f8f295391997f7bbf Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sun, 11 Nov 2018 14:52:27 -0600 Subject: [PATCH 24/39] update repr tests --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/arrays/base.py | 2 +- pandas/tests/arrays/test_integer.py | 39 ++++++++++++++--------------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 4d19a2eeb09f6..c6e23ff6e15a4 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -858,7 +858,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). -- A default repr for ExtensionArrays is now provided (:issue:`23601`). +- A default repr for :class:`ExtensionArray` is now provided (:issue:`23601`). .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b6eba6a17359a..bcfd7f2d1c281 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -675,7 +675,7 @@ def __unicode__(self): # repr does. 
So we include a newline in our template, and strip # any trailing newlines from format_object_summary data = format_object_summary(self, self._formatter(), name=False, - trailing_comma=False).rstrip('\n') + trailing_comma=False).rstrip() name = self.__class__.__name__ return template.format(class_name=name, data=data, length=len(self), diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index ebae1a2797bdc..c0b3ab3592188 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -57,28 +57,27 @@ def test_dtypes(dtype): assert dtype.name is not None -def test_repr_array(data): - result = repr(data) - assert '<IntegerArray>' in result - - # not long - assert '...' not in result - assert 'Length: ' in result - assert 'dtype: ' in result - - -def test_na_repr(data): - result = repr(integer_array([1, None])) - assert 'NaN' in result - +def test_repr_array(): + result = repr(integer_array([1, None, 3])) + expected = ( + '<IntegerArray>\n' + '[1, NaN, 3]\n' + 'Length: 3, dtype: Int64' + ) + assert result == expected -def test_repr_array_long(data): - # some arrays may be able to assert a ... in the repr - with pd.option_context('display.max_seq_items', 1): - result = repr(data) - assert '...' 
in result - assert 'Length' in result +def test_repr_array_long(): + data = integer_array([1, 2, None] * 1000) + expected = ( + "<IntegerArray>\n" + "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + " ...\n" + " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + "Length: 3000, dtype: Int64" + ) + result = repr(data) + assert result == expected class TestConstructors(object): From 0f4083e2329ce89e34ec94b16bc11c0afd7f5811 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 06:59:20 -0600 Subject: [PATCH 25/39] remove periodarray --- pandas/core/arrays/period.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 1a6c10ee05261..09bfed93612aa 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -366,14 +366,6 @@ def to_timestamp(self, freq=None, how='start'): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def __repr__(self): - return '<{}>\n{}\nLength: {}, dtype: {}'.format( - self.__class__.__name__, - [str(s) for s in self], - len(self), - self.dtype - ) - def _formatter(self, formatter=None): if formatter: return formatter.formatter or str From ebadf6f211aa0cfcfbe083cb3505a14c63718a1b Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 08:11:12 -0600 Subject: [PATCH 26/39] FutureWarning -> DeprecationWarning --- pandas/core/arrays/base.py | 5 ----- pandas/core/internals/blocks.py | 2 +- pandas/tests/extension/decimal/test_decimal.py | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bcfd7f2d1c281..a5b351ee4fa3e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -45,11 +45,6 @@ class ExtensionArray(StringMixin): * copy * _concat_same_type - An additional method is available to satisfy pandas' internal, - private block API. 
- - * _formatting_values - A default repr displaying the type, (truncated) data, length, and dtype is provided. It can be customized or replaced by by overriding: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2a8d49feb0bad..835039e86d60f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1960,7 +1960,7 @@ def formatting_values(self): "'ExtensionArray._formatting_values' is deprecated. " "Specify 'ExtensionArray._formatter' instead." ) - warnings.warn(msg, FutureWarning, stacklevel=10) + warnings.warn(msg, DeprecationWarning, stacklevel=10) return self.values._formatting_values() return self.values diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 0ddb09c9b5617..6281c5360cd03 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -396,6 +396,6 @@ def _formatting_values(self): # different levels for 2 vs. 3 check_stacklevel = compat.PY3 - with tm.assert_produces_warning(FutureWarning, + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=check_stacklevel): repr(ser) From e5f6976bfd3077299149febe80a3a76e4fee1090 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 08:19:01 -0600 Subject: [PATCH 27/39] wip --- pandas/core/arrays/base.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a5b351ee4fa3e..26f676186b919 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -9,7 +9,6 @@ import operator -from pandas.core.base import StringMixin from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv @@ -20,7 +19,7 @@ _not_implemented_message = "{} does not implement {}." 
-class ExtensionArray(StringMixin): +class ExtensionArray(object): """Abstract base class for custom 1-D array types. pandas will recognize instances of this class as proper arrays @@ -50,7 +49,7 @@ class ExtensionArray(StringMixin): by overriding: * _formatter - * __unicode__ + * __repr__ Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When @@ -658,7 +657,7 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ - def __unicode__(self): + def __repr__(self): from pandas.io.formats.printing import format_object_summary template = ( @@ -686,10 +685,14 @@ def _formatter(self, formatter=None): Parameters ---------- formatter: GenericArrayFormatter, optional - The formatter this array is being rendered with. The formatter - may have a `.formatter` method already defined. By default, this - will be used if a `formatter` is passed, otherwise the formatter - is ``str``. + The formatter this array is being rendered with. This will be + passed when the ExtensionArray is being rendered inside of a + Series, Index, or DataFrame. This will be ``None`` when called + from a top-level ``repr(array)``. + + By default, when ``formatter`` is passed, the return value + is ``formatter.formatter``. Otherwise, the default formatter + is ``repr``. Returns ------- @@ -697,7 +700,7 @@ def _formatter(self, formatter=None): A callable that gets instances of the scalar type and returns a string. 
""" - return getattr(formatter, 'formatter', None) or str + return getattr(formatter, 'formatter', None) or repr def _formatting_values(self): # type: () -> np.ndarray From 221cee9799be6f2608fcbf962b58d6b56aa0ff4f Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 09:04:31 -0600 Subject: [PATCH 28/39] use repr --- pandas/core/arrays/base.py | 19 +++++++++++++++---- pandas/core/arrays/categorical.py | 5 ++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 26f676186b919..f2af678d74d0d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -661,7 +661,7 @@ def __repr__(self): from pandas.io.formats.printing import format_object_summary template = ( - u'<{class_name}>\n' + u'{class_name}' u'{data}\n' u'Length: {length}, dtype: {dtype}' ) @@ -670,11 +670,17 @@ def __repr__(self): # any trailing newlines from format_object_summary data = format_object_summary(self, self._formatter(), name=False, trailing_comma=False).rstrip() - name = self.__class__.__name__ - return template.format(class_name=name, data=data, + class_name = u'<{}>\n'.format(self.__class__.__name__) + return template.format(class_name=class_name, data=data, length=len(self), dtype=self.dtype) + def __bytes__(self): + from pandas.core.config import get_option + + encoding = get_option("display.encoding") + return str(self).encode(encoding, 'replace') + def _formatter(self, formatter=None): # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] """Formatting function for scalar values. @@ -705,7 +711,12 @@ def _formatter(self, formatter=None): def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype - """An array of values to be printed in, e.g. the Series repr""" + """An array of values to be printed in, e.g. the Series repr + + .. 
deprecated:: 0.24.0 + + Use :meth:`ExtensionArray._formatter` instead. + """ return np.array(self) # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3593f83770114..02ab3471c6be1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -499,7 +499,7 @@ def _constructor(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) - def _formatter(self, formatter): + def _formatter(self, formatter=None): # backwards compat with old printing. return None @@ -1990,6 +1990,9 @@ def __unicode__(self): return result + def __repr__(self): + return super(ExtensionArray, self).__repr__() + def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': From 439f2f896cd16ff7d80d708b51ffe265a4d3bbfc Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 09:05:50 -0600 Subject: [PATCH 29/39] fixup! use repr --- pandas/core/arrays/categorical.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 02ab3471c6be1..70c1a8b878f03 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1993,6 +1993,9 @@ def __unicode__(self): def __repr__(self): return super(ExtensionArray, self).__repr__() + def __bytes__(self): + return super(ExtensionArray, self).__bytes__() + def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': From 23645464899846d730ab5f13ecb5dbee5488b37c Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 09:11:27 -0600 Subject: [PATCH 30/39] fixup! fixup! 
use repr --- pandas/core/arrays/base.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f2af678d74d0d..8481502af4e2f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -691,14 +691,19 @@ def _formatter(self, formatter=None): Parameters ---------- formatter: GenericArrayFormatter, optional - The formatter this array is being rendered with. This will be - passed when the ExtensionArray is being rendered inside of a - Series, Index, or DataFrame. This will be ``None`` when called - from a top-level ``repr(array)``. - - By default, when ``formatter`` is passed, the return value - is ``formatter.formatter``. Otherwise, the default formatter - is ``repr``. + The formatter this array is being rendered with. When the array + is being rendered inside an Index, Series, or DataFrame, a + formatter will be provided. So if you want your objects to + render differently inside a Series from on its own, checking + with ``formatter is None`` is an option. + + The default behavior depends on whether `formatter` is passed. + + * When `formatter` is None, :func:`repr` is returned. + * When `formatter` is passed, ``formatter.formatter`` is used, + which falls back to :func:`repr` if that isn't specified. + + In general, just returning :func:`repr` should be fine. 
Returns ------- From 62b1e2f0325ee6d83a96f4d4ca853ac7923da01f Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 12 Nov 2018 11:19:07 -0600 Subject: [PATCH 31/39] remove bytes --- pandas/core/arrays/base.py | 6 ------ pandas/core/arrays/categorical.py | 4 +--- pandas/tests/extension/base/printing.py | 7 ------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8481502af4e2f..ee9e47baddcf3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -675,12 +675,6 @@ def __repr__(self): length=len(self), dtype=self.dtype) - def __bytes__(self): - from pandas.core.config import get_option - - encoding = get_option("display.encoding") - return str(self).encode(encoding, 'replace') - def _formatter(self, formatter=None): # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] """Formatting function for scalar values. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 70c1a8b878f03..e3f46e8a9226b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1991,11 +1991,9 @@ def __unicode__(self): return result def __repr__(self): + # We want PandasObject.__repr__, which dispatches to __unicode__ return super(ExtensionArray, self).__repr__() - def __bytes__(self): - return super(ExtensionArray, self).__bytes__() - def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 7a41e7689f4a6..b2ba1d95cf33e 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -25,13 +25,6 @@ def test_array_repr(self, data, size): if size == 'big': assert '...' 
in result - def test_array_repr_bytes(self, data): - result = bytes(data) - if compat.PY2: - assert isinstance(result, str) - else: - assert isinstance(result, bytes) - def test_array_repr_unicode(self, data): result = compat.text_type(data) assert isinstance(result, compat.text_type) From 27db397efca6840f285778b92aeefe5327a0b231 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu, 15 Nov 2018 09:05:47 -0600 Subject: [PATCH 32/39] simplify formatter --- pandas/core/arrays/base.py | 32 ++++++++++++------------------- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/integer.py | 14 ++++++-------- pandas/core/arrays/period.py | 6 +++--- pandas/core/arrays/sparse.py | 16 +++++++--------- 5 files changed, 29 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ee9e47baddcf3..edc3ccc934690 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -675,37 +675,29 @@ def __repr__(self): length=len(self), dtype=self.dtype) - def _formatter(self, formatter=None): - # type: (Optional[ExtensionArrayFormatter]) -> Callable[[Any], str] + def _formatter(self, boxed=False): + # type: (bool) -> Callable[[Any], str] """Formatting function for scalar values. - This is used in the default '__repr__'. The formatting function - receives instances of your scalar type. + This is used in the default '__repr__'. The returned formatting + function receives instances of your scalar type. Parameters ---------- - formatter: GenericArrayFormatter, optional - The formatter this array is being rendered with. When the array - is being rendered inside an Index, Series, or DataFrame, a - formatter will be provided. So if you want your objects to - render differently inside a Series from on its own, checking - with ``formatter is None`` is an option. - - The default behavior depends on whether `formatter` is passed. - - * When `formatter` is None, :func:`repr` is returned. 
- * When `formatter` is passed, ``formatter.formatter`` is used, - which falls back to :func:`repr` if that isn't specified. - - In general, just returning :func:`repr` should be fine. + boxed: bool, default False + An indicated for whether or not your array is being printed + within a Series, DataFrame, or Index (True), or just by + itself (False). This may be useful if you want scalar values + to appear differently within a Series versus on its own (e.g. + quoted or not). Returns ------- Callable[[Any], str] A callable that gets instances of the scalar type and - returns a string. + returns a string. By defult, :func:`repr` is used. """ - return getattr(formatter, 'formatter', None) or repr + return repr def _formatting_values(self): # type: () -> np.ndarray diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cdd925a8568c7..09eb1b0231c31 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -499,7 +499,7 @@ def _constructor(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) - def _formatter(self, formatter=None): + def _formatter(self, boxed=False): # backwards compat with old printing. 
return None diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 98eb9df98328f..984ef4c8c673c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -264,14 +264,12 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, formatter=None): - if formatter is None: - def fmt(x): - if isna(x): - return 'NaN' - return str(x) - return fmt - return formatter.formatter + def _formatter(self, boxed=False): + def fmt(x): + if isna(x): + return 'NaN' + return str(x) + return fmt def __getitem__(self, item): if is_integer(item): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 64779a9644515..d1728e7b43cb7 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -350,9 +350,9 @@ def to_timestamp(self, freq=None, how='start'): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def _formatter(self, formatter=None): - if formatter: - return formatter.formatter or str + def _formatter(self, boxed=False): + if boxed: + return str return "'{}'".format def __setitem__( diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index b4b33b71b2383..68ae39f54ea74 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1738,15 +1738,13 @@ def __unicode__(self): fill=printing.pprint_thing(self.fill_value), index=printing.pprint_thing(self.sp_index)) - def _formatter(self, formatter=None): - if formatter is None: - def fmt(x): - if isna(x) and isinstance(x, float): - return 'NaN' - return str(x) - - return fmt - return formatter.formatter + def _formatter(self, boxed=False): + def fmt(x): + if isna(x) and isinstance(x, float): + return 'NaN' + return str(x) + + return fmt SparseArray._add_arithmetic_ops() From ef390fcc03e8b924d6da0b6e540bc124638047cf Mon Sep 17 
00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 19 Nov 2018 06:29:42 -0600 Subject: [PATCH 33/39] Updates: misc * whatsnew * docstrings --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/base.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b21c492cd554b..57754e9ebaf1b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1197,7 +1197,7 @@ Datetimelike - Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) - Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) - Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) -- Bug in the :class:`Series` repr with Period data missing a space before the data (:issue:`23601`) +- Bug in the :class:`Series` repr with period-dtype data missing a space before the data (:issue:`23601`) - Bug in :func:`date_range` when decrementing a start date to a past end date by a negative frequency (:issue:`23270`) - Bug in :meth:`Series.min` which would return ``NaN`` instead of ``NaT`` when called on a series of ``NaT`` (:issue:`23282`) - Bug in :func:`DataFrame.combine` with datetimelike values raising a TypeError (:issue:`23079`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index edc3ccc934690..c22660a784dee 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -48,8 +48,8 @@ class ExtensionArray(object): and dtype is provided. 
It can be customized or replaced by by overriding: - * _formatter - * __repr__ + * __repr__ : A default repr for the ExtensionArray. + * _formatter : Print scalars inside a Series or DataFrame. Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When @@ -695,7 +695,7 @@ def _formatter(self, boxed=False): ------- Callable[[Any], str] A callable that gets instances of the scalar type and - returns a string. By defult, :func:`repr` is used. + returns a string. By default, :func:`repr` is used. """ return repr From 2b5fe251b67deb45eaf5fd28d7b13beda11c267c Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 19 Nov 2018 06:38:03 -0600 Subject: [PATCH 34/39] BUG: Fixed SparseArray formatter We want to fall back to the implementation in formats. --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/sparse.py | 7 +------ pandas/io/formats/format.py | 5 ++++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c22660a784dee..0303dea84248b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -676,7 +676,7 @@ def __repr__(self): dtype=self.dtype) def _formatter(self, boxed=False): - # type: (bool) -> Callable[[Any], str] + # type: (bool) -> Callable[[Any], Optional[str]] """Formatting function for scalar values. This is used in the default '__repr__'. 
The returned formatting diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 68ae39f54ea74..a951c8a5469f6 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1739,12 +1739,7 @@ def __unicode__(self): index=printing.pprint_thing(self.sp_index)) def _formatter(self, boxed=False): - def fmt(x): - if isna(x) and isinstance(x, float): - return 'NaN' - return str(x) - - return fmt + return None SparseArray._add_arithmetic_ops() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 406be5cc1b8bc..8ada7b92b2268 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1123,9 +1123,12 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self): values = self.values if isinstance(values, (ABCIndexClass, ABCSeries)): + boxed = True values = values._values + else: + boxed = False - formatter = values._formatter(self) + formatter = values._formatter(boxed=boxed) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo From d9df6bf6c743f50ec68e4895893f82f34c4c7ed0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 19 Nov 2018 19:04:56 -0600 Subject: [PATCH 35/39] correct boxing --- pandas/io/formats/format.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8ada7b92b2268..eda7167dfcbe1 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1123,12 +1123,9 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self): values = self.values if isinstance(values, (ABCIndexClass, ABCSeries)): - boxed = True values = values._values - else: - boxed = False - formatter = values._formatter(boxed=boxed) + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo From 
3825aeb7f61b4d350888882689bce521b4a603ce Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sun, 2 Dec 2018 07:13:14 -0600 Subject: [PATCH 36/39] Use Array formatter in PeriodIndex --- pandas/core/indexes/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 56df454bddf1c..d5e5cba7a59dc 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -529,7 +529,7 @@ def __array_wrap__(self, result, context=None): @property def _formatter_func(self): - return lambda x: "'%s'" % x + return self.array._formatter(boxed=False) def asof_locs(self, where, mask): """ From 2a60c15e755bb64b582875c4548cd38b6a92cf70 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Sun, 2 Dec 2018 07:15:55 -0600 Subject: [PATCH 37/39] Use repr / str --- pandas/core/arrays/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3a8ed4eb97ccc..717178410c1ed 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -697,8 +697,12 @@ def _formatter(self, boxed=False): ------- Callable[[Any], str] A callable that gets instances of the scalar type and - returns a string. By default, :func:`repr` is used. + returns a string. By default, :func:`repr` is used + when ``boxed=False`` and :func:`str` is used when + ``boxed=True``. 
""" + if boxed: + return str return repr def _formatting_values(self): From a7ef10406be10e21605e32b05fe196711f46bed4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 3 Dec 2018 08:37:01 -0600 Subject: [PATCH 38/39] Update for review * docs * removed overloading of name=False * added indent_for_name --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/base.py | 3 ++- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/sparse.py | 2 ++ pandas/io/formats/printing.py | 15 ++++++++------- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index e35a2c1118204..02ffd07e81ff9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1117,7 +1117,7 @@ Deprecations - The methods :meth:`Series.str.partition` and :meth:`Series.str.rpartition` have deprecated the ``pat`` keyword in favor of ``sep`` (:issue:`22676`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) -- :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`) +- :meth:`ExtensionArray._formatting_values` is deprecated. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. 
Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) - Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`) - Constructing a :class:`DatetimeIndex` from data with ``timedelta64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23675`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 717178410c1ed..9c5696499817c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -670,7 +670,8 @@ def __repr__(self): # the short repr has no trailing newline, while the truncated # repr does. So we include a newline in our template, and strip # any trailing newlines from format_object_summary - data = format_object_summary(self, self._formatter(), name=False, + data = format_object_summary(self, self._formatter(), + indent_for_name=False, trailing_comma=False).rstrip() class_name = u'<{}>\n'.format(self.__class__.__name__) return template.format(class_name=class_name, data=data, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 94f18db642176..5ddfdd094e216 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -495,7 +495,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) def _formatter(self, boxed=False): - # backwards compat with old printing. + # Defer to CategoricalFormatter's formatter. return None def copy(self): diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 7ef47b73b67de..6285fd47dd41f 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1736,6 +1736,8 @@ def __unicode__(self): index=printing.pprint_thing(self.sp_index)) def _formatter(self, boxed=False): + # Defer to the formatter from the GenericArrayFormatter calling us. 
+ # This will infer the correct formatter from the dtype of the values. return None diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 842985aec6145..2e61c7a591280 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -272,6 +272,7 @@ class TableSchemaFormatter(BaseFormatter): def format_object_summary(obj, formatter, is_justify=True, name=None, + indent_for_name=True, trailing_comma=True): """ Return the formatted obj as a unicode string @@ -286,9 +287,9 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, should justify the display name : name, optional defaults to the class name of the obj - - Pass ``False`` to indicate that subsequent lines should - not be indented to align with the name. + indent_for_name : bool, default True + Whether subsequent lines should be indented to + align with the name. trailing_comma : bool, default True Whether to include a comma after the closing ']' @@ -306,13 +307,13 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, if name is None: name = obj.__class__.__name__ - if name is False: - space1 = "\n" - space2 = "\n " # space for the opening '[' - else: + if indent_for_name: name_len = len(name) space1 = "\n%s" % (' ' * (name_len + 1)) space2 = "\n%s" % (' ' * (name_len + 2)) + else: + space1 = "\n" + space2 = "\n " # space for the opening '[' n = len(obj) sep = ',' From a3b1c92347b469740a7d7ad20f9cb97b357086dd Mon Sep 17 00:00:00 2001 From: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon, 3 Dec 2018 08:38:49 -0600 Subject: [PATCH 39/39] REF: removed trailing_comma argument --- pandas/core/arrays/base.py | 3 +-- pandas/io/formats/printing.py | 10 ++-------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9c5696499817c..9a8b1de95218a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -671,8 +671,7 @@ def __repr__(self): # 
repr does. So we include a newline in our template, and strip # any trailing newlines from format_object_summary data = format_object_summary(self, self._formatter(), - indent_for_name=False, - trailing_comma=False).rstrip() + indent_for_name=False).rstrip(', \n') class_name = u'<{}>\n'.format(self.__class__.__name__) return template.format(class_name=class_name, data=data, length=len(self), diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 2e61c7a591280..6d45d1e5dfcee 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -272,8 +272,7 @@ class TableSchemaFormatter(BaseFormatter): def format_object_summary(obj, formatter, is_justify=True, name=None, - indent_for_name=True, - trailing_comma=True): + indent_for_name=True): """ Return the formatted obj as a unicode string @@ -290,8 +289,6 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, indent_for_name : bool, default True Whether subsequent lines should be indented to align with the name. - trailing_comma : bool, default True - Whether to include a comma after the closing ']' Returns ------- @@ -340,10 +337,7 @@ def best_len(values): else: return 0 - if trailing_comma: - close = u', ' - else: - close = u'' + close = u', ' if n == 0: summary = u'[]{}'.format(close)