Skip to content

Commit 6e47287

Browse files
committed
ENH: duplicated and drop_duplicates now accept take=all kw
1 parent b144cc1 commit 6e47287

14 files changed

+404
-66
lines changed

doc/source/indexing.rst

+6-4
Original file line numberDiff line numberDiff line change
@@ -1209,17 +1209,19 @@ takes as an argument the columns to use to identify duplicated rows.
12091209
- ``drop_duplicates`` removes duplicate rows.
12101210

12111211
By default, the first observed row of a duplicate set is considered unique, but
1212-
each method has a ``take_last`` parameter that indicates the last observed row
1213-
should be taken instead.
1212+
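each method has a ``take`` parameter that controls this: ``'last'`` treats the last observed row as unique instead, and ``'all'`` treats every row of a duplicate set as a duplicate.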
12141213

12151214
.. ipython:: python
12161215
12171216
df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
12181217
'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
12191218
'c' : np.random.randn(7)})
12201219
df2.duplicated(['a','b'])
1220+
df2.duplicated(['a','b'], take='last')
1221+
df2.duplicated(['a','b'], take='all')
12211222
df2.drop_duplicates(['a','b'])
1222-
df2.drop_duplicates(['a','b'], take_last=True)
1223+
df2.drop_duplicates(['a','b'], take='last')
1224+
df2.drop_duplicates(['a','b'], take='all')
12231225
12241226
An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.
12251227

@@ -1230,7 +1232,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
12301232
df3.groupby(level=0).first()
12311233
12321234
# a bit more verbose
1233-
df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
1235+
df3.reset_index().drop_duplicates(subset='b', take='first').set_index('b')
12341236
12351237
.. _indexing.dictionarylike:
12361238

doc/source/whatsnew/v0.17.0.txt

+12
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,16 @@ New features
2626
Other enhancements
2727
^^^^^^^^^^^^^^^^^^
2828

29+
- ``drop_duplicates`` and ``duplicated`` now accept a ``take`` keyword to target the first, the last, or all duplicates. (:issue:`6511`, :issue:`8505`)
30+
31+
.. ipython :: python
32+
33+
s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
34+
s.drop_duplicates()
35+
s.drop_duplicates(take='last')
36+
s.drop_duplicates(take='all')
37+
38+
2939
.. _whatsnew_0170.api:
3040

3141
Backwards incompatible API changes
@@ -43,6 +53,8 @@ Other API Changes
4353
Deprecations
4454
^^^^^^^^^^^^
4555

56+
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword is deprecated in favor of ``take``; ``take_last=True`` maps to ``take='last'`` and ``take_last=False`` maps to ``take='first'``. (:issue:`6511`, :issue:`8505`)
57+
4658
.. _whatsnew_0170.prior_deprecations:
4759

4860
Removal of prior version deprecations/changes

pandas/core/base.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pandas.core import common as com
77
import pandas.core.nanops as nanops
88
import pandas.lib as lib
9-
from pandas.util.decorators import Appender, cache_readonly
9+
from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
1010
from pandas.core.strings import StringMethods
1111
from pandas.core.common import AbstractMethodError
1212

@@ -543,18 +543,23 @@ def _dir_deletions(self):
543543
544544
Parameters
545545
----------
546-
take_last : boolean, default False
547-
Take the last observed index in a group. Default first
546+
547+
take : {'first', 'last', 'all'}, default 'first'
548+
- ``first`` : Take the first observed index in a group.
549+
- ``last`` : Take the last observed index in a group.
550+
- ``all`` : Remove all duplicates in a group.
551+
take_last : deprecated
548552
%(inplace)s
549553
550554
Returns
551555
-------
552556
deduplicated : %(klass)s
553557
""")
554558

559+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
555560
@Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
556-
def drop_duplicates(self, take_last=False, inplace=False):
557-
duplicated = self.duplicated(take_last=take_last)
561+
def drop_duplicates(self, take='first', inplace=False):
562+
duplicated = self.duplicated(take=take)
558563
result = self[np.logical_not(duplicated)]
559564
if inplace:
560565
return self._update_inplace(result)
@@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
566571
567572
Parameters
568573
----------
569-
take_last : boolean, default False
570-
Take the last observed index in a group. Default first
574+
take : {'first', 'last', 'all'}, default 'first'
575+
- ``first`` : Mark duplicates as ``True`` except for the first occurrence.
576+
- ``last`` : Mark duplicates as ``True`` except for the last occurrence.
577+
- ``all`` : Mark all duplicates as ``True``, including the first and last occurrences.
578+
take_last : deprecated
571579
572580
Returns
573581
-------
574582
duplicated : %(duplicated)s
575583
""")
576584

585+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
577586
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
578-
def duplicated(self, take_last=False):
587+
def duplicated(self, take='first'):
579588
keys = com._ensure_object(self.values)
580-
duplicated = lib.duplicated(keys, take_last=take_last)
589+
duplicated = lib.duplicated(keys, take=take)
581590
try:
582591
return self._constructor(duplicated,
583592
index=self.index).__finalize__(self)

pandas/core/frame.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -2801,8 +2801,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
28012801
else:
28022802
return result
28032803

2804+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
28042805
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
2805-
def drop_duplicates(self, subset=None, take_last=False, inplace=False):
2806+
def drop_duplicates(self, subset=None, take='first', inplace=False):
28062807
"""
28072808
Return DataFrame with duplicate rows removed, optionally only
28082809
considering certain columns
@@ -2812,8 +2813,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
28122813
subset : column label or sequence of labels, optional
28132814
Only consider certain columns for identifying duplicates, by
28142815
default use all of the columns
2815-
take_last : boolean, default False
2816-
Take the last observed row in a row. Defaults to the first row
2816+
take : {'first', 'last', 'all'}, default 'first'
2817+
- ``first`` : Drop duplicates except for the first observed row.
2818+
- ``last`` : Drop duplicates except for the last observed row.
2819+
- ``all`` : Drop all duplicates, keeping no row of a duplicate set.
2820+
take_last : deprecated
28172821
inplace : boolean, default False
28182822
Whether to drop duplicates in place or to return a copy
28192823
cols : kwargs only argument of subset [deprecated]
@@ -2822,7 +2826,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
28222826
-------
28232827
deduplicated : DataFrame
28242828
"""
2825-
duplicated = self.duplicated(subset, take_last=take_last)
2829+
duplicated = self.duplicated(subset, take=take)
28262830

28272831
if inplace:
28282832
inds, = (-duplicated).nonzero()
@@ -2831,8 +2835,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
28312835
else:
28322836
return self[-duplicated]
28332837

2838+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
28342839
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
2835-
def duplicated(self, subset=None, take_last=False):
2840+
def duplicated(self, subset=None, take='first'):
28362841
"""
28372842
Return boolean Series denoting duplicate rows, optionally only
28382843
considering certain columns
@@ -2842,9 +2847,11 @@ def duplicated(self, subset=None, take_last=False):
28422847
subset : column label or sequence of labels, optional
28432848
Only consider certain columns for identifying duplicates, by
28442849
default use all of the columns
2845-
take_last : boolean, default False
2846-
For a set of distinct duplicate rows, flag all but the last row as
2847-
duplicated. Default is for all but the first row to be flagged
2850+
take : {'first', 'last', 'all'}, default 'first'
2851+
- ``first`` : Mark duplicates as ``True`` except for the first observed row.
2852+
- ``last`` : Mark duplicates as ``True`` except for the last observed row.
2853+
- ``all`` : Mark all duplicates as ``True``, including the first and last observed rows.
2854+
take_last : deprecated
28482855
cols : kwargs only argument of subset [deprecated]
28492856
28502857
Returns
@@ -2870,7 +2877,7 @@ def f(vals):
28702877
labels, shape = map(list, zip( * map(f, vals)))
28712878

28722879
ids = get_group_index(labels, shape, sort=False, xnull=False)
2873-
return Series(duplicated_int64(ids, take_last), index=self.index)
2880+
return Series(duplicated_int64(ids, take), index=self.index)
28742881

28752882
#----------------------------------------------------------------------
28762883
# Sorting

pandas/core/index.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pandas.lib import Timestamp, Timedelta, is_datetime_array
1717
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
1818
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
19-
deprecate)
19+
deprecate, deprecate_kwarg)
2020
import pandas.core.common as com
2121
from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
2222
_values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
@@ -2576,14 +2576,16 @@ def drop(self, labels, errors='raise'):
25762576
indexer = indexer[~mask]
25772577
return self.delete(indexer)
25782578

2579+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
25792580
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
2580-
def drop_duplicates(self, take_last=False):
2581-
result = super(Index, self).drop_duplicates(take_last=take_last)
2581+
def drop_duplicates(self, take='first'):
2582+
result = super(Index, self).drop_duplicates(take=take)
25822583
return self._constructor(result)
25832584

2585+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
25842586
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
2585-
def duplicated(self, take_last=False):
2586-
return super(Index, self).duplicated(take_last=take_last)
2587+
def duplicated(self, take='first'):
2588+
return super(Index, self).duplicated(take=take)
25872589

25882590

25892591
def _evaluate_with_timedelta_like(self, other, op, opstr):
@@ -3002,10 +3004,11 @@ def _engine(self):
30023004
def is_unique(self):
30033005
return not self.duplicated().any()
30043006

3007+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
30053008
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
3006-
def duplicated(self, take_last=False):
3009+
def duplicated(self, take='first'):
30073010
from pandas.hashtable import duplicated_int64
3008-
return duplicated_int64(self.codes.astype('i8'), take_last)
3011+
return duplicated_int64(self.codes.astype('i8'), take)
30093012

30103013
def get_loc(self, key, method=None):
30113014
"""
@@ -4152,15 +4155,16 @@ def _has_complex_internals(self):
41524155
def is_unique(self):
41534156
return not self.duplicated().any()
41544157

4158+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
41554159
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
4156-
def duplicated(self, take_last=False):
4160+
def duplicated(self, take='first'):
41574161
from pandas.core.groupby import get_group_index
41584162
from pandas.hashtable import duplicated_int64
41594163

41604164
shape = map(len, self.levels)
41614165
ids = get_group_index(self.labels, shape, sort=False, xnull=False)
41624166

4163-
return duplicated_int64(ids, take_last)
4167+
return duplicated_int64(ids, take)
41644168

41654169
def get_value(self, series, key):
41664170
# somewhat broken encapsulation

pandas/core/series.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
import pandas.core.datetools as datetools
4545
import pandas.core.format as fmt
4646
import pandas.core.nanops as nanops
47-
from pandas.util.decorators import Appender, cache_readonly
47+
from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
4848

4949
import pandas.lib as lib
5050
import pandas.tslib as tslib
@@ -1137,14 +1137,15 @@ def mode(self):
11371137
from pandas.core.algorithms import mode
11381138
return mode(self)
11391139

1140+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
11401141
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
1141-
def drop_duplicates(self, take_last=False, inplace=False):
1142-
return super(Series, self).drop_duplicates(take_last=take_last,
1143-
inplace=inplace)
1142+
def drop_duplicates(self, take='first', inplace=False):
1143+
return super(Series, self).drop_duplicates(take=take, inplace=inplace)
11441144

1145+
@deprecate_kwarg('take_last', 'take', mapping={True: 'last', False: 'first'})
11451146
@Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
1146-
def duplicated(self, take_last=False):
1147-
return super(Series, self).duplicated(take_last=take_last)
1147+
def duplicated(self, take='first'):
1148+
return super(Series, self).duplicated(take=take)
11481149

11491150
def idxmin(self, axis=None, out=None, skipna=True):
11501151
"""

pandas/hashtable.pyx

+17-3
Original file line numberDiff line numberDiff line change
@@ -1051,23 +1051,37 @@ def mode_int64(ndarray[int64_t] values):
10511051

10521052
@cython.wraparound(False)
10531053
@cython.boundscheck(False)
1054-
def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
1054+
def duplicated_int64(ndarray[int64_t, ndim=1] values, object take='first'):
10551055
cdef:
10561056
int ret = 0
10571057
Py_ssize_t i, n = len(values)
10581058
kh_int64_t * table = kh_init_int64()
10591059
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
10601060

1061+
# for all
1062+
dict seen = dict()
1063+
10611064
kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
10621065

1063-
if take_last:
1066+
if take == 'last':
10641067
for i from n > i >=0:
10651068
kh_put_int64(table, values[i], &ret)
10661069
out[i] = ret == 0
1067-
else:
1070+
elif take == 'first':
10681071
for i from 0 <= i < n:
10691072
kh_put_int64(table, values[i], &ret)
10701073
out[i] = ret == 0
1074+
elif take == 'all':
1075+
for i from 0 <= i < n:
1076+
row = values[i]
1077+
if row in seen:
1078+
out[i] = 1
1079+
out[seen[row]] = 1
1080+
else:
1081+
seen[row] = i
1082+
out[i] = 0
1083+
else:
1084+
raise ValueError('take must be either "first", "last" or "all"')
10711085

10721086
kh_destroy_int64(table)
10731087
return out

pandas/lib.pyx

+19-7
Original file line numberDiff line numberDiff line change
@@ -1292,35 +1292,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
12921292

12931293
return result
12941294

1295-
def duplicated(ndarray[object] values, take_last=False):
1295+
1296+
def duplicated(ndarray[object] values, object take='first'):
12961297
cdef:
12971298
Py_ssize_t i, n
1298-
set seen = set()
1299+
dict seen = dict()
12991300
object row
13001301

13011302
n = len(values)
13021303
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
13031304

1304-
if take_last:
1305+
if take == 'last':
13051306
for i from n > i >= 0:
13061307
row = values[i]
1307-
13081308
if row in seen:
13091309
result[i] = 1
13101310
else:
1311-
seen.add(row)
1311+
seen[row] = i
13121312
result[i] = 0
1313-
else:
1313+
elif take == 'first':
13141314
for i from 0 <= i < n:
13151315
row = values[i]
13161316
if row in seen:
13171317
result[i] = 1
13181318
else:
1319-
seen.add(row)
1319+
seen[row] = i
13201320
result[i] = 0
1321+
elif take == 'all':
1322+
for i from 0 <= i < n:
1323+
row = values[i]
1324+
if row in seen:
1325+
result[i] = 1
1326+
result[seen[row]] = 1
1327+
else:
1328+
seen[row] = i
1329+
result[i] = 0
1330+
else:
1331+
raise ValueError('take must be either "first", "last" or "all"')
13211332

13221333
return result.view(np.bool_)
13231334

1335+
13241336
def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
13251337
cdef:
13261338
Py_ssize_t i, group_size, n, start

0 commit comments

Comments
 (0)