Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: tolerance now takes list-like argument for reindex and get_indexer. #17367

Merged
merged 6 commits into from
Oct 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ Other Enhancements
- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`)
- Improved the import time of pandas by about 2.25x. (:issue:`16764`)
- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
- :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support list-like argument for ``tolerance``. (:issue:`17367`)

.. _whatsnew_0210.api_breaking:

Expand Down
17 changes: 16 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2470,9 +2470,10 @@ def reindex_like(self, other, method=None, copy=True, limit=None,
Maximum number of consecutive labels to fill for inexact matches.
tolerance : optional
Maximum distance between labels of the other object and this
object for inexact matches.
object for inexact matches. Can be list-like.

.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)

Notes
-----
Expand Down Expand Up @@ -2860,7 +2861,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the index and its dtype must exactly match the
index's type.

.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)

Examples
--------
Expand Down Expand Up @@ -3120,7 +3128,14 @@ def _reindex_multi(self, axes, copy, fill_value):
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the index and its dtype must exactly match the
index's type.

.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)

Examples
--------
Expand Down
23 changes: 21 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2484,7 +2484,14 @@ def _get_unique_index(self, dropna=False):
the index at the matching location must satisfy the equation
``abs(index[loc] - key) <= tolerance``.

Tolerance may be a scalar
value, which applies the same tolerance to all values, or
list-like, which applies variable tolerance per element. List-like
includes list, tuple, array, Series, and must be the same size as
the index and its dtype must exactly match the index's type.

.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)

Returns
-------
Expand Down Expand Up @@ -2627,7 +2634,14 @@ def _get_level_values(self, level):
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the target and its dtype must exactly match the
index's type.

.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)

Examples
--------
Expand All @@ -2647,7 +2661,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
method = missing.clean_reindex_fill_method(method)
target = _ensure_index(target)
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, target)

# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
Expand Down Expand Up @@ -2683,10 +2697,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
'backfill or nearest reindexing')

indexer = self._engine.get_indexer(target._values)

return _ensure_platform_int(indexer)

def _convert_tolerance(self, tolerance):
def _convert_tolerance(self, tolerance, target):
    """
    Coerce ``tolerance`` to an ndarray and validate its size.

    Subclasses override this method to add dtype-specific conversion.

    Parameters
    ----------
    tolerance : scalar or list-like
        Maximum distance allowed for an inexact match.
    target : object exposing ``size``
        The labels being looked up; a list-like ``tolerance`` must have
        exactly this many elements.

    Returns
    -------
    numpy.ndarray
        ``tolerance`` converted with ``np.asarray`` (0-d for a scalar).

    Raises
    ------
    ValueError
        If ``tolerance`` does not hold exactly one value and its size
        differs from ``target.size``.
    """
    # override this method on subclasses
    tolerance = np.asarray(tolerance)
    # A single value is accepted for any target size. Everything else,
    # including an empty list-like, has to line up with the target
    # element for element.
    if tolerance.size != 1 and target.size != tolerance.size:
        raise ValueError('list-like tolerance size must match '
                         'target index size')
    return tolerance

def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas import compat
from pandas.compat.numpy import function as nv
from pandas.core.tools.timedeltas import to_timedelta

import numpy as np
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -431,13 +432,12 @@ def asobject(self):
from pandas.core.index import Index
return Index(self._box_values(self.asi8), name=self.name, dtype=object)

def _convert_tolerance(self, tolerance):
try:
return Timedelta(tolerance).to_timedelta64()
except ValueError:
raise ValueError('tolerance argument for %s must be convertible '
'to Timedelta: %r'
% (type(self).__name__, tolerance))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use

tolerance = super(Datetimelike, self)._convert_tolerance

def _convert_tolerance(self, tolerance, target):
    """
    Convert ``tolerance`` to a timedelta ndarray and validate its size.

    Parameters
    ----------
    tolerance : scalar or list-like
        Value(s) passed to ``to_timedelta`` for conversion.
    target : object exposing ``size``
        The labels being looked up; a list-like ``tolerance`` must have
        exactly this many elements.

    Returns
    -------
    numpy.ndarray
        Result of ``to_timedelta(tolerance, box=False)`` wrapped with
        ``np.asarray``.

    Raises
    ------
    ValueError
        If ``tolerance`` does not hold exactly one value and its size
        differs from ``target.size``. Conversion errors raised by
        ``to_timedelta`` propagate unchanged.
    """
    tolerance = np.asarray(to_timedelta(tolerance, box=False))
    # A single value is accepted for any target size. Everything else,
    # including an empty list-like, has to line up with the target
    # element for element.
    if tolerance.size != 1 and target.size != tolerance.size:
        raise ValueError('list-like tolerance size must match '
                         'target index size')
    return tolerance

def _maybe_mask_results(self, result, fill_value=None, convert=None):
"""
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,7 @@ def get_loc(self, key, method=None, tolerance=None):
if tolerance is not None:
# try converting tolerance now, so errors don't get swallowed by
# the try/except clauses below
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, np.asarray(key))

if isinstance(key, datetime):
# needed to localize naive datetimes
Expand All @@ -1447,7 +1447,12 @@ def get_loc(self, key, method=None, tolerance=None):
try:
stamp = Timestamp(key, tz=self.tz)
return Index.get_loc(self, stamp, method, tolerance)
except (KeyError, ValueError):
except KeyError:
raise KeyError(key)
except ValueError as e:
# list-like tolerance size must match target index size
if 'list-like' in str(e):
raise e
raise KeyError(key)

def _maybe_cast_slice_bound(self, label, side, kind):
Expand Down
21 changes: 15 additions & 6 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,21 @@ def _convert_for_op(self, value):

return value

def _convert_tolerance(self, tolerance):
try:
return float(tolerance)
except ValueError:
raise ValueError('tolerance argument for %s must be numeric: %r' %
(type(self).__name__, tolerance))
def _convert_tolerance(self, tolerance, target):
    """
    Coerce ``tolerance`` to a numeric ndarray and validate it.

    Parameters
    ----------
    tolerance : scalar or list-like
        Maximum distance allowed for an inexact match; must be numeric.
    target : object exposing ``size``
        The labels being looked up; a list-like ``tolerance`` must have
        exactly this many elements.

    Returns
    -------
    numpy.ndarray
        ``tolerance`` converted with ``np.asarray`` (0-d for a scalar).

    Raises
    ------
    ValueError
        If ``tolerance`` does not hold exactly one value and its size
        differs from ``target.size``, or if its dtype is not a numpy
        number dtype.
    """
    converted = np.asarray(tolerance)
    # A single value is accepted for any target size. Everything else,
    # including an empty list-like, has to line up with the target
    # element for element.
    if converted.size != 1 and target.size != converted.size:
        raise ValueError('list-like tolerance size must match '
                         'target index size')
    if not np.issubdtype(converted.dtype, np.number):
        if converted.ndim > 0:
            raise ValueError(('tolerance argument for %s must contain '
                              'numeric elements if it is list type') %
                             (type(self).__name__,))
        else:
            # Report the value exactly as the caller passed it, not the
            # 0-d array wrapper produced by np.asarray above.
            raise ValueError(('tolerance argument for %s must be numeric '
                              'if it is a scalar: %r') %
                             (type(self).__name__, tolerance))
    return converted

@classmethod
def _assert_safe_casting(cls, data, subarr):
Expand Down
24 changes: 17 additions & 7 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,12 +641,17 @@ def to_timestamp(self, freq=None, how='start'):
return DatetimeIndex(new_data, freq='infer', name=self.name)

def _maybe_convert_timedelta(self, other):
if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
if isinstance(
other, (timedelta, np.timedelta64, offsets.Tick, np.ndarray)):
offset = frequencies.to_offset(self.freq.rule_code)
if isinstance(offset, offsets.Tick):
nanos = tslib._delta_to_nanoseconds(other)
if isinstance(other, np.ndarray):
nanos = np.vectorize(tslib._delta_to_nanoseconds)(other)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The array version of this function is almost trivial.
If you can, add it alongside the other one and call it here
(you just need to type the input as ndarray, I think).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually ignore the above this is ok here

else:
nanos = tslib._delta_to_nanoseconds(other)
offset_nanos = tslib._delta_to_nanoseconds(offset)
if nanos % offset_nanos == 0:
check = np.all(nanos % offset_nanos == 0)
if check:
return nanos // offset_nanos
elif isinstance(other, offsets.DateOffset):
freqstr = other.rule_code
Expand Down Expand Up @@ -782,7 +787,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = target.asi8

if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, target)
return Index.get_indexer(self._int64index, target, method,
limit, tolerance)

Expand Down Expand Up @@ -825,7 +830,8 @@ def get_loc(self, key, method=None, tolerance=None):
try:
ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance,
np.asarray(key))
return self._int64index.get_loc(ordinal, method, tolerance)

except KeyError:
Expand Down Expand Up @@ -908,8 +914,12 @@ def _get_string_slice(self, key):
return slice(self.searchsorted(t1.ordinal, side='left'),
self.searchsorted(t2.ordinal, side='right'))

def _convert_tolerance(self, tolerance):
tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance)
def _convert_tolerance(self, tolerance, target):
    """
    Convert ``tolerance`` for use against this index's ordinals.

    Parameters
    ----------
    tolerance : scalar or list-like
        Timedelta-like value(s) accepted by
        ``DatetimeIndexOpsMixin._convert_tolerance``.
    target : object exposing ``size``
        The labels being looked up; forwarded to the mixin, which
        validates the size of a list-like ``tolerance`` against it.

    Returns
    -------
    The result of ``self._maybe_convert_timedelta`` applied to the
    converted tolerance.

    Raises
    ------
    ValueError
        Propagated from the mixin when a list-like ``tolerance`` does
        not match ``target`` in size. Errors raised by
        ``_maybe_convert_timedelta`` also propagate unchanged.
    """
    # The mixin already rejects a list-like tolerance whose size differs
    # from the target's, so the check is not repeated here.
    tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance,
                                                         target)
    return self._maybe_convert_timedelta(tolerance)

def insert(self, loc, item):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def get_loc(self, key, method=None, tolerance=None):
if tolerance is not None:
# try converting tolerance now, so errors don't get swallowed by
# the try/except clauses below
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, np.asarray(key))

if _is_convertible_to_td(key):
key = Timedelta(key)
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/tools/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'):
elif isinstance(arg, ABCIndexClass):
return _convert_listlike(arg, unit=unit, box=box,
errors=errors, name=arg.name)
elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 0:
# extract array scalar and process below
arg = arg.item()
elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:
return _convert_listlike(arg, unit=unit, box=box, errors=errors)
elif getattr(arg, 'ndim', 1) > 1:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1935,9 +1935,13 @@ def test_reindex_methods(self):

actual = df.reindex_like(df, method=method, tolerance=0)
assert_frame_equal(df, actual)
actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0])
assert_frame_equal(df, actual)

actual = df.reindex(target, method=method, tolerance=1)
assert_frame_equal(expected, actual)
actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
assert_frame_equal(expected, actual)

e2 = expected[::-1]
actual = df.reindex(target[::-1], method=method)
Expand All @@ -1958,6 +1962,11 @@ def test_reindex_methods(self):
actual = df.reindex(target, method='nearest', tolerance=0.2)
assert_frame_equal(expected, actual)

expected = pd.DataFrame({'x': [0, np.nan, 1, np.nan]}, index=target)
actual = df.reindex(target, method='nearest',
tolerance=[0.5, 0.01, 0.4, 0.1])
assert_frame_equal(expected, actual)

def test_reindex_frame_add_nat(self):
rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,17 @@ def test_get_loc(self):
tolerance=np.timedelta64(1, 'D')) == 1
assert idx.get_loc('2000-01-01T12', method='nearest',
tolerance=timedelta(1)) == 1
with tm.assert_raises_regex(ValueError, 'must be convertible'):
with tm.assert_raises_regex(ValueError,
'unit abbreviation w/o a number'):
idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
with pytest.raises(KeyError):
idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')
with pytest.raises(
ValueError,
match='tolerance size must match target index size'):
idx.get_loc('2000-01-01', method='nearest',
tolerance=[pd.Timedelta('1day').to_timedelta64(),
pd.Timedelta('1day').to_timedelta64()])

assert idx.get_loc('2000', method='nearest') == slice(0, 3)
assert idx.get_loc('2000-01', method='nearest') == slice(0, 3)
Expand Down Expand Up @@ -93,6 +100,19 @@ def test_get_indexer(self):
idx.get_indexer(target, 'nearest',
tolerance=pd.Timedelta('1 hour')),
np.array([0, -1, 1], dtype=np.intp))
tol_raw = [pd.Timedelta('1 hour'),
pd.Timedelta('1 hour'),
pd.Timedelta('1 hour').to_timedelta64(), ]
tm.assert_numpy_array_equal(
idx.get_indexer(target, 'nearest',
tolerance=[np.timedelta64(x) for x in tol_raw]),
np.array([0, -1, 1], dtype=np.intp))
tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
pd.Timedelta('1 hour').to_timedelta64(),
'foo', ]
with pytest.raises(
ValueError, match='abbreviation w/o a number'):
idx.get_indexer(target, 'nearest', tolerance=tol_bad)
with pytest.raises(ValueError):
idx.get_indexer(idx[[0]], method='nearest', tolerance='foo')

Expand Down
24 changes: 23 additions & 1 deletion pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas import (PeriodIndex, period_range, notna, DatetimeIndex, NaT,
Index, Period, Int64Index, Series, DataFrame, date_range,
offsets, compat)
from pandas.core.indexes.period import IncompatibleFrequency

from ..datetimelike import DatetimeLike

Expand Down Expand Up @@ -83,14 +84,21 @@ def test_get_loc(self):
tolerance=np.timedelta64(1, 'D')) == 1
assert idx.get_loc('2000-01-02T12', method='nearest',
tolerance=timedelta(1)) == 1
with tm.assert_raises_regex(ValueError, 'must be convertible'):
with tm.assert_raises_regex(ValueError,
'unit abbreviation w/o a number'):
idx.get_loc('2000-01-10', method='nearest', tolerance='foo')

msg = 'Input has different freq from PeriodIndex\\(freq=D\\)'
with tm.assert_raises_regex(ValueError, msg):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
with pytest.raises(KeyError):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
with pytest.raises(
ValueError,
match='list-like tolerance size must match target index size'):
idx.get_loc('2000-01-10', method='nearest',
tolerance=[pd.Timedelta('1 day').to_timedelta64(),
pd.Timedelta('1 day').to_timedelta64()])

def test_where(self):
i = self.create_index()
Expand Down Expand Up @@ -158,6 +166,20 @@ def test_get_indexer(self):
tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
tolerance='1 day'),
np.array([0, 1, 1], dtype=np.intp))
tol_raw = [pd.Timedelta('1 hour'),
pd.Timedelta('1 hour'),
np.timedelta64(1, 'D'), ]
tm.assert_numpy_array_equal(
idx.get_indexer(target, 'nearest',
tolerance=[np.timedelta64(x) for x in tol_raw]),
np.array([0, -1, 1], dtype=np.intp))
tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
pd.Timedelta('1 hour').to_timedelta64(),
np.timedelta64(1, 'M'), ]
with pytest.raises(
IncompatibleFrequency,
match='Input has different freq from'):
idx.get_indexer(target, 'nearest', tolerance=tol_bad)

def test_repeat(self):
# GH10183
Expand Down
Loading