
Commit 2166ac1

sinhrks authored and jreback committed
PERF: Improve duplicated perf
closes pandas-dev#10235

Author: sinhrks <sinhrks@gmail.com>

Closes pandas-dev#13751 from sinhrks/perf_duplicated and squashes the following commits:

12fb5ac [sinhrks] PERF: Improve duplicated perf
1 parent 5f524d6 commit 2166ac1

File tree

9 files changed: +314 -88 lines changed


asv_bench/benchmarks/algorithms.py (+14)

```diff
@@ -7,6 +7,11 @@ class algorithm(object):
 
     def setup(self):
         N = 100000
+
+        self.int_unique = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.int_unique.is_unique
+
         self.int = pd.Int64Index(np.arange(N).repeat(5))
         self.float = pd.Float64Index(np.random.randn(N).repeat(5))
 
@@ -15,3 +20,12 @@ def time_int_factorize(self):
 
     def time_float_factorize(self):
         self.int.factorize()
+
+    def time_int_unique_duplicated(self):
+        self.int_unique.duplicated()
+
+    def time_int_duplicated(self):
+        self.int.duplicated()
+
+    def time_float_duplicated(self):
+        self.float.duplicated()
```
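For orientation, a minimal sketch of the API these new benchmarks exercise, with the expected outputs shown as comments:

```python
import pandas as pd

idx = pd.Index([1, 2, 2, 3, 3, 3])

# Default: mark duplicates as True except the first occurrence.
idx.duplicated()             # array([False, False,  True, False,  True,  True])

# Keep the last occurrence unmarked instead.
idx.duplicated(keep='last')  # array([False,  True, False,  True,  True, False])

# Mark every member of a duplicated group.
idx.duplicated(keep=False)   # array([False,  True,  True,  True,  True,  True])
```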

doc/source/whatsnew/v0.19.0.txt (+1)

```diff
@@ -656,6 +656,7 @@ Performance Improvements
 
 - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
+- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`)
 - Improved performance of ``Index.difference`` (:issue:`12044`)
 - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
 - Improved performance of hashing ``Period`` (:issue:`12817`)
```

pandas/core/algorithms.py (+51 -1)

```diff
@@ -8,7 +8,8 @@
 
 from pandas import compat, lib, tslib, _np_version_under1p8
 from pandas.types.cast import _maybe_promote
-from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex
+from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex,
+                                  ABCDatetimeIndex)
 from pandas.types.common import (is_integer_dtype,
                                  is_int64_dtype,
                                  is_categorical_dtype,
@@ -448,6 +449,55 @@ def _value_counts_arraylike(values, dropna=True):
     return keys, counts
 
 
+def duplicated(values, keep='first'):
+    """
+    Return boolean ndarray denoting duplicate values
+
+    .. versionadded:: 0.19.0
+
+    Parameters
+    ----------
+    keep : {'first', 'last', False}, default 'first'
+        - ``first`` : Mark duplicates as ``True`` except for the first
+          occurrence.
+        - ``last`` : Mark duplicates as ``True`` except for the last
+          occurrence.
+        - False : Mark all duplicates as ``True``.
+
+    Returns
+    -------
+    duplicated : ndarray
+    """
+
+    dtype = values.dtype
+
+    # no need to revert to original type
+    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
+        if isinstance(values, (ABCSeries, ABCIndex)):
+            values = values.values.view(np.int64)
+        else:
+            values = values.view(np.int64)
+    elif is_period_arraylike(values):
+        from pandas.tseries.period import PeriodIndex
+        values = PeriodIndex(values).asi8
+    elif is_categorical_dtype(dtype):
+        values = values.values.codes
+    elif isinstance(values, (ABCSeries, ABCIndex)):
+        values = values.values
+
+    if is_integer_dtype(dtype):
+        values = _ensure_int64(values)
+        duplicated = htable.duplicated_int64(values, keep=keep)
+    elif is_float_dtype(dtype):
+        values = _ensure_float64(values)
+        duplicated = htable.duplicated_float64(values, keep=keep)
+    else:
+        values = _ensure_object(values)
+        duplicated = htable.duplicated_object(values, keep=keep)
+
+    return duplicated
+
+
 def mode(values):
     """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
     # must sort because hash order isn't necessarily defined.
```
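The new function normalizes the input to an int64, float64, or object array and dispatches to the matching Cython hash-table routine, rather than boxing everything to object as before. A minimal sketch of exercising it directly (an internal helper, not public API; outputs shown as comments):

```python
import numpy as np
from pandas.core.algorithms import duplicated

# integer ndarray -> htable.duplicated_int64
duplicated(np.array([1, 1, 2, 3, 3]))
# array([False,  True, False, False,  True])

# float ndarray -> htable.duplicated_float64
duplicated(np.array([0.5, 0.5, 1.5]), keep='last')
# array([ True, False, False])

# everything else falls back to htable.duplicated_object
duplicated(np.array(['a', 'b', 'a'], dtype=object), keep=False)
# array([ True, False,  True])
```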

pandas/core/base.py (+13 -7)

```diff
@@ -7,7 +7,7 @@
 
 from pandas.types.missing import isnull
 from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.types.common import (_ensure_object, is_object_dtype,
+from pandas.types.common import (is_object_dtype,
                                  is_list_like, is_scalar)
 
 from pandas.core import common as com
@@ -1014,6 +1014,7 @@ def is_monotonic(self):
         """
         from pandas import Index
         return Index(self).is_monotonic
+
     is_monotonic_increasing = is_monotonic
 
     @property
@@ -1171,6 +1172,10 @@ def searchsorted(self, key, side='left', sorter=None):
                                           False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
     def drop_duplicates(self, keep='first', inplace=False):
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return self._shallow_copy()
+
         duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
@@ -1200,13 +1205,14 @@ def drop_duplicates(self, keep='first', inplace=False):
                                           False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
     def duplicated(self, keep='first'):
-        keys = com._values_from_object(_ensure_object(self.values))
-        duplicated = lib.duplicated(keys, keep=keep)
-        try:
-            return self._constructor(duplicated,
+        from pandas.core.algorithms import duplicated
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return np.zeros(len(self), dtype=np.bool)
+            return duplicated(self, keep=keep)
+        else:
+            return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
-        except AttributeError:
-            return np.array(duplicated, dtype=bool)
 
     # ----------------------------------------------------------------------
     # abstracts
```
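The net behavioral contract of the base.py change, illustrated as a minimal sketch: ``Index.duplicated`` now short-circuits to all-False when the index is known unique and returns a plain NumPy boolean array, while ``Series.duplicated`` still returns a ``Series`` aligned to the original index.

```python
import pandas as pd

idx = pd.Index([1, 2, 3])
idx.is_unique          # True, so duplicated() takes the fast path
idx.duplicated()       # array([False, False, False])

s = pd.Series([1, 1, 2], index=['a', 'b', 'c'])
s.duplicated()
# a    False
# b     True
# c    False
# dtype: bool
```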

pandas/hashtable.pyx (+84)

```diff
@@ -1073,6 +1073,90 @@ def mode_int64(int64_t[:] values):
 
     return modes[:j+1]
 
+
+def duplicated_object(ndarray[object] values, object keep='first'):
+    cdef:
+        Py_ssize_t i, n
+        dict seen = dict()
+        object row
+
+    n = len(values)
+    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
+
+    if keep == 'last':
+        for i from n > i >= 0:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep == 'first':
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
+
+    return result.view(np.bool_)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_float64(ndarray[float64_t, ndim=1] values,
+                       object keep='first'):
+    cdef:
+        int ret = 0, k
+        float64_t value
+        Py_ssize_t i, n = len(values)
+        kh_float64_t * table = kh_init_float64()
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT))
+
+    if keep not in ('last', 'first', False):
+        raise ValueError('keep must be either "first", "last" or False')
+
+    if keep == 'last':
+        with nogil:
+            for i from n > i >= 0:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    elif keep == 'first':
+        with nogil:
+            for i from 0 <= i < n:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    else:
+        with nogil:
+            for i from 0 <= i < n:
+                value = values[i]
+                k = kh_get_float64(table, value)
+                if k != table.n_buckets:
+                    out[table.vals[k]] = 1
+                    out[i] = 1
+                else:
+                    k = kh_put_float64(table, value, &ret)
+                    table.keys[k] = value
+                    table.vals[k] = i
+                    out[i] = 0
+    kh_destroy_float64(table)
+    return out
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def duplicated_int64(ndarray[int64_t, ndim=1] values,
```
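The khash-based float64 routine works in a single pass: for ``keep='first'`` and ``keep='last'`` the return flag of ``kh_put`` (0 means the key was already present) directly yields the answer, while ``keep=False`` also records each key's position so the first occurrence can be retro-marked. A dict-based Python sketch of the same logic (``duplicated_sketch`` is a hypothetical name; this is illustrative only and ignores the NaN-equality handling the real khash table provides):

```python
import numpy as np

def duplicated_sketch(values, keep='first'):
    # Model of hashtable.duplicated_float64, with a dict standing in
    # for the khash table.
    if keep not in ('first', 'last', False):
        raise ValueError('keep must be either "first", "last" or False')

    n = len(values)
    out = np.zeros(n, dtype=bool)
    seen = {}  # value -> position of its first occurrence in scan order

    # keep='last' is just keep='first' scanned from the right.
    order = range(n - 1, -1, -1) if keep == 'last' else range(n)
    for i in order:
        v = values[i]
        if v in seen:
            out[i] = True
            if keep is False:
                out[seen[v]] = True  # retro-mark the first occurrence too
        else:
            seen[v] = i
    return out

# duplicated_sketch(np.array([1.0, 2.0, 1.0]))              -> [False, False,  True]
# duplicated_sketch(np.array([1.0, 2.0, 1.0]), keep=False)  -> [ True, False,  True]
```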

pandas/lib.pyx (-40)

```diff
@@ -1394,46 +1394,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
     return result
 
 
-def duplicated(ndarray[object] values, object keep='first'):
-    cdef:
-        Py_ssize_t i, n
-        dict seen = dict()
-        object row
-
-    n = len(values)
-    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
-
-    if keep == 'last':
-        for i from n > i >= 0:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep == 'first':
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep is False:
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-                result[seen[row]] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    else:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return result.view(np.bool_)
-
-
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
```

pandas/tests/indexes/test_multi.py (+1 -1)

```diff
@@ -1860,7 +1860,7 @@ def check(nlevels, with_nulls):
 
         for keep in ['first', 'last', False]:
             left = mi.duplicated(keep=keep)
-            right = pd.lib.duplicated(mi.values, keep=keep)
+            right = pd.hashtable.duplicated_object(mi.values, keep=keep)
            tm.assert_numpy_array_equal(left, right)
 
         # GH5873
```
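For context, a minimal illustration of the ``MultiIndex.duplicated`` behavior this test pins down (duplicates are determined by the full tuple of level values, hashed through the object routine):

```python
import pandas as pd

mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 1), ('b', 2)])
mi.duplicated()            # array([False,  True, False])
mi.duplicated(keep=False)  # array([ True,  True, False])
```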
