Skip to content

Commit dfe264a

Browse files
committed
Make common impl. with Index.searchsorted
1 parent 117da18 commit dfe264a

File tree

4 files changed

+68
-14
lines changed

4 files changed

+68
-14
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,7 @@ Performance Improvements
496496
both when indexing by label (using .loc) and by position (using .iloc).
497497
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
498498
- Improved performance of :func:`Series.searchsorted` (:issue:`22034`)
499+
- Improved performance of :func:`Index.searchsorted` when dtype is uint64, float64 or object (:issue:`22034`)
499500
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
500501
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
501502
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)

pandas/core/base.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
is_list_like,
1616
is_scalar,
1717
is_extension_type,
18-
is_extension_array_dtype)
18+
is_extension_array_dtype,
19+
ensure_platform_int)
1920

2021
from pandas.util._validators import validate_bool_kwarg
2122
from pandas.errors import AbstractMethodError
@@ -1230,8 +1231,8 @@ def factorize(self, sort=False, na_sentinel=-1):
12301231
@Appender(_shared_docs['searchsorted'])
12311232
@deprecate_kwarg(old_arg_name='key', new_arg_name='value')
12321233
def searchsorted(self, value, side='left', sorter=None):
1233-
# needs coercion on the key (DatetimeIndex does already)
1234-
return self.values.searchsorted(value, side=side, sorter=sorter)
1234+
return com.searchsorted(self._values, value,
1235+
side=side, sorter=sorter)
12351236

12361237
def drop_duplicates(self, keep='first', inplace=False):
12371238
inplace = validate_bool_kwarg(inplace, 'inplace')

pandas/core/common.py

+45-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas import compat
1616
from pandas.compat import iteritems, PY36, OrderedDict
1717
from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
18-
from pandas.core.dtypes.common import is_integer
18+
from pandas.core.dtypes.common import (is_integer, is_integer_dtype,
19+
is_scalar, ensure_platform_int)
1920
from pandas.core.dtypes.inference import _iterable_not_string
2021
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
2122
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -430,3 +431,46 @@ def _pipe(obj, func, *args, **kwargs):
430431
return func(*args, **kwargs)
431432
else:
432433
return func(obj, *args, **kwargs)
434+
435+
436+
def searchsorted_integer(arr, value, side="left", sorter=None):
    """
    Run ``arr.searchsorted(value)`` for an integer-dtype ``arr``.

    When ``value`` is integral and fits into ``arr.dtype`` it is first
    converted to that dtype, so that numpy does not have to recast ``arr``
    to a wider dtype before searching. Any other value (a non-integral
    float, or an integer outside the range of ``arr.dtype``) is a perfectly
    valid search key; it is passed to numpy unchanged instead of being
    rejected or cast with overflow.

    Parameters
    ----------
    arr : numpy.ndarray
        Array with an integer dtype (``np.iinfo(arr.dtype)`` must work).
    value : scalar or array-like
        Value(s) to locate in ``arr``.
    side : {'left', 'right'}, default 'left'
        Forwarded to ``arr.searchsorted``.
    sorter : array-like, optional
        Forwarded to ``arr.searchsorted`` after conversion to platform int.

    Returns
    -------
    Whatever ``arr.searchsorted`` returns for the (possibly converted)
    ``value``.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    # float 2.0 may be converted to int 2,
    # but float 2.2 must *not* be converted to int 2
    is_integral = (is_integer(value) or is_integer_dtype(value) or
                   (hasattr(value, 'is_integer') and value.is_integer()))

    if is_integral:
        dtype = arr.dtype
        iinfo = np.iinfo(dtype)
        if is_scalar(value):
            value_arr = np.array([value])
        else:
            value_arr = np.array(value)

        # Only cast when every value fits: casting an out-of-range value
        # would overflow, and such a value is still a legitimate key whose
        # insertion point numpy can compute by upcasting.
        in_bounds = ((value_arr >= iinfo.min).all() and
                     (value_arr <= iinfo.max).all())
        if in_bounds:
            value = np.asarray(value, dtype=dtype)

    return arr.searchsorted(value, side=side, sorter=sorter)
457+
458+
459+
def searchsorted(arr, value, side="left", sorter=None):
    """
    Call ``arr.searchsorted(value)``, with special handling for int dtypes.

    :func:`numpy.searchsorted` is only fast when ``value`` has the same
    dtype as the searched array; otherwise numpy recasts ``arr`` to a
    higher dtype, which slows the search down. Arrays with an integer dtype
    are therefore routed through :func:`searchsorted_integer`, which takes
    care of the dtype of ``value``. Every other array is searched directly
    with its own ``searchsorted`` method.

    See :meth:`Index.searchsorted` for details on parameters and return value.
    """
    sorter = None if sorter is None else ensure_platform_int(sorter)

    if not is_integer_dtype(arr):
        return arr.searchsorted(value, side=side, sorter=sorter)

    return searchsorted_integer(arr, value, side=side, sorter=sorter)

pandas/core/series.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -2080,16 +2080,24 @@ def __rmatmul__(self, other):
20802080
@Appender(base._shared_docs['searchsorted'])
20812081
@deprecate_kwarg(old_arg_name='v', new_arg_name='value')
20822082
def searchsorted(self, value, side='left', sorter=None):
2083-
if sorter is not None:
2084-
sorter = ensure_platform_int(sorter)
2085-
if not is_extension_type(self._values):
2086-
# numpy searchsorted is only fast if value is of same dtype as the
2087-
# searched array. Below we ensure that value has the right dtype,
2088-
# and is not 0-dimensional.
2089-
value = np.asarray(value, dtype=self._values.dtype)
2090-
value = value[..., np.newaxis] if value.ndim == 0 else value
2091-
2092-
return self._values.searchsorted(value, side=side, sorter=sorter)
2083+
simple_types = (is_integer_dtype, is_float_dtype, is_object_dtype,
2084+
is_categorical_dtype)
2085+
2086+
if any(is_dtype(self) for is_dtype in simple_types):
2087+
result = com.searchsorted(self._values, value,
2088+
side=side, sorter=sorter)
2089+
else:
2090+
# e.g. self is datetimelike and value is a pd.Timestamp
2091+
if sorter is not None:
2092+
sorter = ensure_platform_int(sorter)
2093+
value = Series(value)._values
2094+
result = self._values.searchsorted(value, side=side, sorter=sorter)
2095+
2096+
if is_scalar(result):
2097+
# ensure that a 1-dim array is returned
2098+
result = np.array([result])
2099+
2100+
return result
20932101

20942102
# -------------------------------------------------------------------
20952103
# Combination

0 commit comments

Comments
 (0)