
Commit 2166ac1

sinhrks authored and jreback committed
PERF: Improve duplicated perf
closes pandas-dev#10235

Author: sinhrks <sinhrks@gmail.com>

Closes pandas-dev#13751 from sinhrks/perf_duplicated and squashes the following commits:

12fb5ac [sinhrks] PERF: Improve duplicated perf
1 parent 5f524d6 commit 2166ac1

File tree

9 files changed: +314 -88 lines changed


asv_bench/benchmarks/algorithms.py (+14)

```diff
@@ -7,6 +7,11 @@ class algorithm(object):
 
     def setup(self):
         N = 100000
+
+        self.int_unique = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.int_unique.is_unique
+
         self.int = pd.Int64Index(np.arange(N).repeat(5))
         self.float = pd.Float64Index(np.random.randn(N).repeat(5))
 
@@ -15,3 +20,12 @@ def time_int_factorize(self):
 
     def time_float_factorize(self):
         self.int.factorize()
+
+    def time_int_unique_duplicated(self):
+        self.int_unique.duplicated()
+
+    def time_int_duplicated(self):
+        self.int.duplicated()
+
+    def time_float_duplicated(self):
+        self.float.duplicated()
```
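For orientation, a minimal sketch of the API these new benchmarks exercise, with the expected outputs shown as comments:

```python
import pandas as pd

idx = pd.Index([1, 2, 2, 3, 3, 3])

# Default: mark duplicates as True except the first occurrence.
idx.duplicated()             # array([False, False,  True, False,  True,  True])

# Keep the last occurrence unmarked instead.
idx.duplicated(keep='last')  # array([False,  True, False,  True,  True, False])

# Mark every member of a duplicated group.
idx.duplicated(keep=False)   # array([False,  True,  True,  True,  True,  True])
```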

doc/source/whatsnew/v0.19.0.txt (+1)

```diff
@@ -656,6 +656,7 @@ Performance Improvements
 
 - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
+- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`)
 - Improved performance of ``Index.difference`` (:issue:`12044`)
 - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
 - Improved performance of hashing ``Period`` (:issue:`12817`)
```

pandas/core/algorithms.py (+51 -1)

```diff
@@ -8,7 +8,8 @@
 
 from pandas import compat, lib, tslib, _np_version_under1p8
 from pandas.types.cast import _maybe_promote
-from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex
+from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex,
+                                  ABCDatetimeIndex)
 from pandas.types.common import (is_integer_dtype,
                                  is_int64_dtype,
                                  is_categorical_dtype,
@@ -448,6 +449,55 @@ def _value_counts_arraylike(values, dropna=True):
     return keys, counts
 
 
+def duplicated(values, keep='first'):
+    """
+    Return boolean ndarray denoting duplicate values
+
+    .. versionadded:: 0.19.0
+
+    Parameters
+    ----------
+    keep : {'first', 'last', False}, default 'first'
+        - ``first`` : Mark duplicates as ``True`` except for the first
+          occurrence.
+        - ``last`` : Mark duplicates as ``True`` except for the last
+          occurrence.
+        - False : Mark all duplicates as ``True``.
+
+    Returns
+    -------
+    duplicated : ndarray
+    """
+
+    dtype = values.dtype
+
+    # no need to revert to original type
+    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
+        if isinstance(values, (ABCSeries, ABCIndex)):
+            values = values.values.view(np.int64)
+        else:
+            values = values.view(np.int64)
+    elif is_period_arraylike(values):
+        from pandas.tseries.period import PeriodIndex
+        values = PeriodIndex(values).asi8
+    elif is_categorical_dtype(dtype):
+        values = values.values.codes
+    elif isinstance(values, (ABCSeries, ABCIndex)):
+        values = values.values
+
+    if is_integer_dtype(dtype):
+        values = _ensure_int64(values)
+        duplicated = htable.duplicated_int64(values, keep=keep)
+    elif is_float_dtype(dtype):
+        values = _ensure_float64(values)
+        duplicated = htable.duplicated_float64(values, keep=keep)
+    else:
+        values = _ensure_object(values)
+        duplicated = htable.duplicated_object(values, keep=keep)
+
+    return duplicated
+
+
 def mode(values):
     """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
     # must sort because hash order isn't necessarily defined.
```
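The new function normalizes the input to an int64, float64, or object array and dispatches to the matching Cython hash-table routine, rather than boxing everything to object as before. A minimal sketch of exercising it directly (an internal helper, not public API; outputs shown as comments):

```python
import numpy as np
from pandas.core.algorithms import duplicated

# integer ndarray -> htable.duplicated_int64
duplicated(np.array([1, 1, 2, 3, 3]))
# array([False,  True, False, False,  True])

# float ndarray -> htable.duplicated_float64
duplicated(np.array([0.5, 0.5, 1.5]), keep='last')
# array([ True, False, False])

# everything else falls back to htable.duplicated_object
duplicated(np.array(['a', 'b', 'a'], dtype=object), keep=False)
# array([ True, False,  True])
```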

pandas/core/base.py (+13 -7)

```diff
@@ -7,7 +7,7 @@
 
 from pandas.types.missing import isnull
 from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.types.common import (_ensure_object, is_object_dtype,
+from pandas.types.common import (is_object_dtype,
                                  is_list_like, is_scalar)
 
 from pandas.core import common as com
@@ -1014,6 +1014,7 @@ def is_monotonic(self):
         """
         from pandas import Index
         return Index(self).is_monotonic
+
     is_monotonic_increasing = is_monotonic
 
     @property
@@ -1171,6 +1172,10 @@ def searchsorted(self, key, side='left', sorter=None):
                                           False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
     def drop_duplicates(self, keep='first', inplace=False):
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return self._shallow_copy()
+
         duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
@@ -1200,13 +1205,14 @@ def drop_duplicates(self, keep='first', inplace=False):
                                           False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
     def duplicated(self, keep='first'):
-        keys = com._values_from_object(_ensure_object(self.values))
-        duplicated = lib.duplicated(keys, keep=keep)
-        try:
-            return self._constructor(duplicated,
+        from pandas.core.algorithms import duplicated
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return np.zeros(len(self), dtype=np.bool)
+            return duplicated(self, keep=keep)
+        else:
+            return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
-        except AttributeError:
-            return np.array(duplicated, dtype=bool)
 
     # ----------------------------------------------------------------------
     # abstracts
```
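The net behavioral contract of the base.py change, illustrated as a minimal sketch: ``Index.duplicated`` now short-circuits to all-False when the index is known unique and returns a plain NumPy boolean array, while ``Series.duplicated`` still returns a ``Series`` aligned to the original index.

```python
import pandas as pd

idx = pd.Index([1, 2, 3])
idx.is_unique          # True, so duplicated() takes the fast path
idx.duplicated()       # array([False, False, False])

s = pd.Series([1, 1, 2], index=['a', 'b', 'c'])
s.duplicated()
# a    False
# b     True
# c    False
# dtype: bool
```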

pandas/hashtable.pyx (+84)

```diff
@@ -1073,6 +1073,90 @@ def mode_int64(int64_t[:] values):
 
     return modes[:j+1]
 
+
+def duplicated_object(ndarray[object] values, object keep='first'):
+    cdef:
+        Py_ssize_t i, n
+        dict seen = dict()
+        object row
+
+    n = len(values)
+    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
+
+    if keep == 'last':
+        for i from n > i >= 0:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep == 'first':
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
+
+    return result.view(np.bool_)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_float64(ndarray[float64_t, ndim=1] values,
+                       object keep='first'):
+    cdef:
+        int ret = 0, k
+        float64_t value
+        Py_ssize_t i, n = len(values)
+        kh_float64_t * table = kh_init_float64()
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT))
+
+    if keep not in ('last', 'first', False):
+        raise ValueError('keep must be either "first", "last" or False')
+
+    if keep == 'last':
+        with nogil:
+            for i from n > i >= 0:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    elif keep == 'first':
+        with nogil:
+            for i from 0 <= i < n:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    else:
+        with nogil:
+            for i from 0 <= i < n:
+                value = values[i]
+                k = kh_get_float64(table, value)
+                if k != table.n_buckets:
+                    out[table.vals[k]] = 1
+                    out[i] = 1
+                else:
+                    k = kh_put_float64(table, value, &ret)
+                    table.keys[k] = value
+                    table.vals[k] = i
+                    out[i] = 0
+    kh_destroy_float64(table)
+    return out
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def duplicated_int64(ndarray[int64_t, ndim=1] values,
```
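The khash-based float64 routine works in a single pass: for ``keep='first'`` and ``keep='last'`` the return flag of ``kh_put`` (0 means the key was already present) directly yields the answer, while ``keep=False`` also records each key's position so the first occurrence can be retro-marked. A dict-based Python sketch of the same logic (``duplicated_sketch`` is a hypothetical name; this is illustrative only and ignores the NaN-equality handling the real khash table provides):

```python
import numpy as np

def duplicated_sketch(values, keep='first'):
    # Model of hashtable.duplicated_float64, with a dict standing in
    # for the khash table.
    if keep not in ('first', 'last', False):
        raise ValueError('keep must be either "first", "last" or False')

    n = len(values)
    out = np.zeros(n, dtype=bool)
    seen = {}  # value -> position of its first occurrence in scan order

    # keep='last' is just keep='first' scanned from the right.
    order = range(n - 1, -1, -1) if keep == 'last' else range(n)
    for i in order:
        v = values[i]
        if v in seen:
            out[i] = True
            if keep is False:
                out[seen[v]] = True  # retro-mark the first occurrence too
        else:
            seen[v] = i
    return out

# duplicated_sketch(np.array([1.0, 2.0, 1.0]))              -> [False, False,  True]
# duplicated_sketch(np.array([1.0, 2.0, 1.0]), keep=False)  -> [ True, False,  True]
```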

pandas/lib.pyx (-40)

```diff
@@ -1394,46 +1394,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
     return result
 
 
-def duplicated(ndarray[object] values, object keep='first'):
-    cdef:
-        Py_ssize_t i, n
-        dict seen = dict()
-        object row
-
-    n = len(values)
-    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
-
-    if keep == 'last':
-        for i from n > i >= 0:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep == 'first':
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep is False:
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-                result[seen[row]] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    else:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return result.view(np.bool_)
-
-
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
```

pandas/tests/indexes/test_multi.py (+1 -1)

```diff
@@ -1860,7 +1860,7 @@ def check(nlevels, with_nulls):
 
         for keep in ['first', 'last', False]:
             left = mi.duplicated(keep=keep)
-            right = pd.lib.duplicated(mi.values, keep=keep)
+            right = pd.hashtable.duplicated_object(mi.values, keep=keep)
            tm.assert_numpy_array_equal(left, right)
 
         # GH5873
```
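For context, a minimal illustration of the ``MultiIndex.duplicated`` behavior this test pins down (duplicates are determined by the full tuple of level values, hashed through the object routine):

```python
import pandas as pd

mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 1), ('b', 2)])
mi.duplicated()            # array([False,  True, False])
mi.duplicated(keep=False)  # array([ True,  True, False])
```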
