Skip to content

Commit eb119d8

Browse files
fix conflict
2 parents a1a8891 + 85572de commit eb119d8

File tree

23 files changed

+601
-189
lines changed

23 files changed

+601
-189
lines changed

asv_bench/benchmarks/series_methods.py

+19
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,25 @@ def time_dropna(self, dtype):
124124
self.s.dropna()
125125

126126

127+
class SearchSorted(object):
128+
129+
goal_time = 0.2
130+
params = ['int8', 'int16', 'int32', 'int64',
131+
'uint8', 'uint16', 'uint32', 'uint64',
132+
'float16', 'float32', 'float64',
133+
'str']
134+
param_names = ['dtype']
135+
136+
def setup(self, dtype):
137+
N = 10**5
138+
data = np.array([1] * N + [2] * N + [3] * N).astype(dtype)
139+
self.s = Series(data)
140+
141+
def time_searchsorted(self, dtype):
142+
key = '2' if dtype == 'str' else 2
143+
self.s.searchsorted(key)
144+
145+
127146
class Map(object):
128147

129148
params = ['dict', 'Series']

doc/source/styled.xlsx

5.55 KB
Binary file not shown.

doc/source/user_guide/timeseries.rst

+10
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,16 @@ We are stopping on the included end-point as it is part of the index:
633633
dft2 = dft2.swaplevel(0, 1).sort_index()
634634
dft2.loc[idx[:, '2013-01-05'], :]
635635
636+
.. versionadded:: 0.25.0
637+
638+
Slicing with string indexing also honors UTC offset.
639+
640+
.. ipython:: python
641+
642+
df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
643+
df
644+
df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
645+
636646
.. _timeseries.slice_vs_exact_match:
637647

638648
Slice vs. Exact Match

doc/source/whatsnew/v0.24.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ Bug Fixes
9696
**Other**
9797

9898
- Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`)
99-
-
99+
- Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`)
100100
-
101101

102102
.. _whatsnew_0.242.contributors:

doc/source/whatsnew/v0.25.0.rst

+37-3
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,46 @@ Other Enhancements
2323
- :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
2424
- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
2525
- ``Series.str`` has gained :meth:`Series.str.casefold` method to remove all case distinctions present in a string (:issue:`25405`)
26+
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
27+
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
28+
-
2629

2730
.. _whatsnew_0250.api_breaking:
2831

2932
Backwards incompatible API changes
3033
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3134

32-
- :meth:`Timestamp.strptime` will now raise a NotImplementedError (:issue:`25016`)
35+
.. _whatsnew_0250.api_breaking.utc_offset_indexing:
36+
37+
Indexing with date strings with UTC offsets
38+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
39+
40+
Indexing a :class:`DataFrame` or :class:`Series` with a :class:`DatetimeIndex` with a
41+
date string with a UTC offset would previously ignore the UTC offset. Now, the UTC offset
42+
is respected in indexing. (:issue:`24076`, :issue:`16785`)
43+
44+
*Previous Behavior*:
45+
46+
.. code-block:: ipython
47+
48+
In [1]: df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
49+
50+
In [2]: df
51+
Out[2]:
52+
0
53+
2019-01-01 00:00:00-08:00 0
54+
55+
In [3]: df['2019-01-01 00:00:00+04:00':'2019-01-01 01:00:00+04:00']
56+
Out[3]:
57+
0
58+
2019-01-01 00:00:00-08:00 0
59+
60+
*New Behavior*:
61+
62+
.. ipython:: python
63+
64+
df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
65+
df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
3366

3467
.. _whatsnew_0250.api.other:
3568

@@ -38,7 +71,7 @@ Other API Changes
3871

3972
- :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`)
4073
- ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`)
41-
-
74+
- :meth:`Timestamp.strptime` will now raise a ``NotImplementedError`` (:issue:`25016`)
4275
-
4376

4477
.. _whatsnew_0250.deprecations:
@@ -64,7 +97,8 @@ Performance Improvements
6497

6598
- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
6699
- `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`)
67-
-
100+
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
101+
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
68102

69103

70104
.. _whatsnew_0250.bug_fixes:

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def lfilter(*args, **kwargs):
137137
reload = reload
138138
Hashable = collections.abc.Hashable
139139
Iterable = collections.abc.Iterable
140+
Iterator = collections.abc.Iterator
140141
Mapping = collections.abc.Mapping
141142
MutableMapping = collections.abc.MutableMapping
142143
Sequence = collections.abc.Sequence
@@ -199,6 +200,7 @@ def get_range_parameters(data):
199200

200201
Hashable = collections.Hashable
201202
Iterable = collections.Iterable
203+
Iterator = collections.Iterator
202204
Mapping = collections.Mapping
203205
MutableMapping = collections.MutableMapping
204206
Sequence = collections.Sequence

pandas/core/algorithms.py

+84-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
2020
ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
2121
is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
22-
is_datetimelike, is_extension_array_dtype, is_float_dtype,
22+
is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer,
2323
is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
2424
is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
2525
is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
@@ -1724,6 +1724,89 @@ def func(arr, indexer, out, fill_value=np.nan):
17241724
return out
17251725

17261726

1727+
# ------------ #
1728+
# searchsorted #
1729+
# ------------ #
1730+
1731+
def searchsorted(arr, value, side="left", sorter=None):
1732+
"""
1733+
Find indices where elements should be inserted to maintain order.
1734+
1735+
.. versionadded:: 0.25.0
1736+
1737+
Find the indices into a sorted array `arr` such that, if the
1738+
corresponding elements in `value` were inserted before the indices,
1739+
the order of `arr` would be preserved.
1740+
1741+
Assuming that `arr` is sorted:
1742+
1743+
====== ================================
1744+
`side` returned index `i` satisfies
1745+
====== ================================
1746+
left ``arr[i-1] < value <= arr[i]``
1747+
right ``arr[i-1] <= value < arr[i]``
1748+
====== ================================
1749+
1750+
Parameters
1751+
----------
1752+
arr : array-like
1753+
Input array. If `sorter` is None, then it must be sorted in
1754+
ascending order, otherwise `sorter` must be an array of indices
1755+
that sort it.
1756+
value : array_like
1757+
Values to insert into `arr`.
1758+
side : {'left', 'right'}, optional
1759+
If 'left', the index of the first suitable location found is given.
1760+
If 'right', return the last such index. If there is no suitable
1761+
index, return either 0 or N (where N is the length of `arr`).
1762+
sorter : 1-D array_like, optional
1763+
Optional array of integer indices that sort `arr` into ascending
1764+
order. They are typically the result of argsort.
1765+
1766+
Returns
1767+
-------
1768+
array of ints
1769+
Array of insertion points with the same shape as `value`.
1770+
1771+
See Also
1772+
--------
1773+
numpy.searchsorted : Similar method from NumPy.
1774+
"""
1775+
if sorter is not None:
1776+
sorter = ensure_platform_int(sorter)
1777+
1778+
if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and (
1779+
is_integer(value) or is_integer_dtype(value)):
1780+
from .arrays.array_ import array
1781+
# if `arr` and `value` have different dtypes, `arr` would be
1782+
# recast by numpy, causing a slow search.
1783+
# Before searching below, we therefore try to give `value` the
1784+
# same dtype as `arr`, while guarding against integer overflows.
1785+
iinfo = np.iinfo(arr.dtype.type)
1786+
value_arr = np.array([value]) if is_scalar(value) else np.array(value)
1787+
if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
1788+
# value within bounds, so no overflow, so can convert value dtype
1789+
# to dtype of arr
1790+
dtype = arr.dtype
1791+
else:
1792+
dtype = value_arr.dtype
1793+
1794+
if is_scalar(value):
1795+
value = dtype.type(value)
1796+
else:
1797+
value = array(value, dtype=dtype)
1798+
elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or
1799+
is_categorical_dtype(arr)):
1800+
from pandas.core.series import Series
1801+
# E.g. if `arr` is an array with dtype='datetime64[ns]'
1802+
# and `value` is a pd.Timestamp, we may need to convert value
1803+
value_ser = Series(value)._values
1804+
value = value_ser[0] if is_scalar(value) else value_ser
1805+
1806+
result = arr.searchsorted(value, side=side, sorter=sorter)
1807+
return result
1808+
1809+
17271810
# ---- #
17281811
# diff #
17291812
# ---- #

pandas/core/arrays/base.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None):
555555
.. versionadded:: 0.24.0
556556
557557
Find the indices into a sorted array `self` (a) such that, if the
558-
corresponding elements in `v` were inserted before the indices, the
559-
order of `self` would be preserved.
558+
corresponding elements in `value` were inserted before the indices,
559+
the order of `self` would be preserved.
560560
561-
Assuming that `a` is sorted:
561+
Assuming that `self` is sorted:
562562
563-
====== ============================
563+
====== ================================
564564
`side` returned index `i` satisfies
565-
====== ============================
566-
left ``self[i-1] < v <= self[i]``
567-
right ``self[i-1] <= v < self[i]``
568-
====== ============================
565+
====== ================================
566+
left ``self[i-1] < value <= self[i]``
567+
right ``self[i-1] <= value < self[i]``
568+
====== ================================
569569
570570
Parameters
571571
----------
@@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None):
581581
582582
Returns
583583
-------
584-
indices : array of ints
584+
array of ints
585585
Array of insertion points with the same shape as `value`.
586586
587587
See Also

pandas/core/arrays/numpy_.py

+7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from pandas._libs import lib
66
from pandas.compat.numpy import function as nv
7+
from pandas.util._decorators import Appender
78
from pandas.util._validators import validate_fillna_kwargs
89

910
from pandas.core.dtypes.dtypes import ExtensionDtype
@@ -12,6 +13,7 @@
1213

1314
from pandas import compat
1415
from pandas.core import nanops
16+
from pandas.core.algorithms import searchsorted
1517
from pandas.core.missing import backfill_1d, pad_1d
1618

1719
from .base import ExtensionArray, ExtensionOpsMixin
@@ -423,6 +425,11 @@ def to_numpy(self, dtype=None, copy=False):
423425

424426
return result
425427

428+
@Appender(ExtensionArray.searchsorted.__doc__)
429+
def searchsorted(self, value, side='left', sorter=None):
430+
return searchsorted(self.to_numpy(), value,
431+
side=side, sorter=sorter)
432+
426433
# ------------------------------------------------------------------------
427434
# Ops
428435

pandas/core/base.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1522,11 +1522,11 @@ def factorize(self, sort=False, na_sentinel=-1):
15221522
array([3])
15231523
""")
15241524

1525-
@Substitution(klass='IndexOpsMixin')
1525+
@Substitution(klass='Index')
15261526
@Appender(_shared_docs['searchsorted'])
15271527
def searchsorted(self, value, side='left', sorter=None):
1528-
# needs coercion on the key (DatetimeIndex does already)
1529-
return self._values.searchsorted(value, side=side, sorter=sorter)
1528+
return algorithms.searchsorted(self._values, value,
1529+
side=side, sorter=sorter)
15301530

15311531
def drop_duplicates(self, keep='first', inplace=False):
15321532
inplace = validate_bool_kwarg(inplace, 'inplace')

pandas/core/frame.py

+41-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
from pandas import compat
3535
from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
36-
PY36, raise_with_traceback,
36+
PY36, raise_with_traceback, Iterator,
3737
string_and_binary_types)
3838
from pandas.compat.numpy import function as nv
3939
from pandas.core.dtypes.cast import (
@@ -4025,7 +4025,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
40254025
This parameter can be either a single column key, a single array of
40264026
the same length as the calling DataFrame, or a list containing an
40274027
arbitrary combination of column keys and arrays. Here, "array"
4028-
encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
4028+
encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
4029+
instances of :class:`abc.Iterator`.
40294030
drop : bool, default True
40304031
Delete columns to be used as the new index.
40314032
append : bool, default False
@@ -4104,6 +4105,32 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
41044105
if not isinstance(keys, list):
41054106
keys = [keys]
41064107

4108+
err_msg = ('The parameter "keys" may be a column key, one-dimensional '
4109+
'array, or a list containing only valid column keys and '
4110+
'one-dimensional arrays.')
4111+
4112+
missing = []
4113+
for col in keys:
4114+
if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray,
4115+
list, Iterator)):
4116+
# arrays are fine as long as they are one-dimensional
4117+
# iterators get converted to list below
4118+
if getattr(col, 'ndim', 1) != 1:
4119+
raise ValueError(err_msg)
4120+
else:
4121+
# everything else gets tried as a key; see GH 24969
4122+
try:
4123+
found = col in self.columns
4124+
except TypeError:
4125+
raise TypeError(err_msg + ' Received column of '
4126+
'type {}'.format(type(col)))
4127+
else:
4128+
if not found:
4129+
missing.append(col)
4130+
4131+
if missing:
4132+
raise KeyError('None of {} are in the columns'.format(missing))
4133+
41074134
if inplace:
41084135
frame = self
41094136
else:
@@ -4132,13 +4159,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
41324159
elif isinstance(col, (list, np.ndarray)):
41334160
arrays.append(col)
41344161
names.append(None)
4162+
elif isinstance(col, Iterator):
4163+
arrays.append(list(col))
4164+
names.append(None)
41354165
# from here, col can only be a column label
41364166
else:
41374167
arrays.append(frame[col]._values)
41384168
names.append(col)
41394169
if drop:
41404170
to_remove.append(col)
41414171

4172+
if len(arrays[-1]) != len(self):
4173+
# check newest element against length of calling frame, since
4174+
# ensure_index_from_sequences would not raise for append=False.
4175+
raise ValueError('Length mismatch: Expected {len_self} rows, '
4176+
'received array of length {len_col}'.format(
4177+
len_self=len(self),
4178+
len_col=len(arrays[-1])
4179+
))
4180+
41424181
index = ensure_index_from_sequences(arrays, names)
41434182

41444183
if verify_integrity and not index.is_unique:

0 commit comments

Comments
 (0)