Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: expand asv benchmark coverage #24214

Merged
merged 15 commits into from
Jan 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 67 additions & 38 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,63 +16,75 @@

class Factorize(object):
    """ASV benchmark for ``Index.factorize`` on repeated (non-unique) data.

    Parametrized over ``sort`` and the index dtype; each index holds
    10**5 distinct values repeated 5 times.
    """

    params = [[True, False], ['int', 'uint', 'float', 'string']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**5
        # every value repeated 5 times so factorize sees duplicates
        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
                'string': tm.makeStringIndex(N).repeat(5)}
        self.idx = data[dtype]

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)
class FactorizeUnique(object):
    """ASV benchmark for ``Index.factorize`` on all-unique data.

    Counterpart to ``Factorize``: same dtypes, but no repeated values,
    which is asserted in ``setup``.
    """

    params = [[True, False], ['int', 'uint', 'float', 'string']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N)),
                'uint': pd.UInt64Index(np.arange(N)),
                'float': pd.Float64Index(np.arange(N)),
                'string': tm.makeStringIndex(N)}
        self.idx = data[dtype]
        # guard: this benchmark is specifically about the unique path
        assert self.idx.is_unique

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)

class Duplicated(object):
    """ASV benchmark for ``Index.duplicated`` on indexes with duplicates.

    Parametrized over ``keep`` ('first' / 'last' / False) and dtype; each
    index holds 10**5 distinct values repeated 5 times.
    """

    params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
    param_names = ['keep', 'dtype']

    def setup(self, keep, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
                'string': tm.makeStringIndex(N).repeat(5)}
        self.idx = data[dtype]
        # cache is_unique so the timed call does not pay for it
        self.idx.is_unique

    def time_duplicated(self, keep, dtype):
        self.idx.duplicated(keep=keep)
class DuplicatedUniqueIndex(object):
    """ASV benchmark for ``Index.duplicated`` when the index is unique.

    No ``keep`` parameter: with all-unique values the result is all-False
    regardless, so only the dtype is varied.
    """

    params = ['int', 'uint', 'float', 'string']
    param_names = ['dtype']

    def setup(self, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N)),
                'uint': pd.UInt64Index(np.arange(N)),
                'float': pd.Float64Index(np.random.randn(N)),
                'string': tm.makeStringIndex(N)}
        self.idx = data[dtype]
        # cache is_unique so the timed call does not pay for it
        self.idx.is_unique

    def time_duplicated_unique(self, dtype):
        self.idx.duplicated()


class Hashing(object):
Expand Down Expand Up @@ -113,4 +125,21 @@ def time_series_dates(self, df):
hashing.hash_pandas_object(df['dates'])


class Quantile(object):
    """ASV benchmark for ``Series.quantile``.

    Parametrized over the quantile (0 / 0.5 / 1), the interpolation
    method, and the underlying dtype.  The series holds 10**5 distinct
    values repeated 5 times.
    """

    params = [[0, 0.5, 1],
              ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
              ['float', 'int', 'uint']]
    param_names = ['quantile', 'interpolation', 'dtype']

    def setup(self, quantile, interpolation, dtype):
        N = 10**5
        data = {'int': np.arange(N),
                'uint': np.arange(N).astype(np.uint64),
                'float': np.random.randn(N)}
        self.idx = pd.Series(data[dtype].repeat(5))

    def time_quantile(self, quantile, interpolation, dtype):
        self.idx.quantile(quantile, interpolation=interpolation)


from .pandas_vb_common import setup # noqa: F401
56 changes: 53 additions & 3 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ class ValueCounts(object):

def setup(self, dropna):
    # 5*10**5 strings drawn from ~n//10 distinct 's0000'-style labels,
    # stored as a categorical Series for the value_counts benchmark.
    n = 5 * 10**5
    arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
                                                          size=n)]
    self.ts = pd.Series(arr).astype('category')

def time_value_counts(self, dropna):
Expand All @@ -104,13 +105,26 @@ class SetCategories(object):

def setup(self):
    # 5*10**5 strings drawn from ~n//10 distinct 's0000'-style labels,
    # stored as a categorical Series for the set_categories benchmark.
    n = 5 * 10**5
    arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
                                                          size=n)]
    self.ts = pd.Series(arr).astype('category')

def time_set_categories(self):
    # Replace the categories with every other existing category.
    self.ts.cat.set_categories(self.ts.cat.categories[::2])


class RemoveCategories(object):
    """ASV benchmark for ``Series.cat.remove_categories``."""

    def setup(self):
        # Build a large categorical Series: 5*10**5 string labels drawn
        # from a pool of ~size//10 distinct values.
        size = 5 * 10**5
        codes = np.random.randint(0, size // 10, size=size)
        labels = ['s{:04d}'.format(code) for code in codes]
        self.ts = pd.Series(labels).astype('category')

    def time_remove_categories(self):
        # Drop every other existing category.
        self.ts.cat.remove_categories(self.ts.cat.categories[::2])


class Rank(object):

def setup(self):
Expand Down Expand Up @@ -159,7 +173,7 @@ def setup(self, dtype):
sample_size = 100
arr = [i for i in np.random.randint(0, n // 10, size=n)]
if dtype == 'object':
arr = ['s%04d' % i for i in arr]
arr = ['s{:04d}'.format(i) for i in arr]
self.sample = np.random.choice(arr, sample_size)
self.series = pd.Series(arr).astype('category')

Expand Down Expand Up @@ -236,4 +250,40 @@ def time_getitem_bool_array(self, index):
self.data[self.data == self.cat_scalar]


class Indexing(object):
    """ASV benchmarks for common operations on a large CategoricalIndex."""

    def setup(self):
        N = 10**5
        # N codes over N distinct categories
        self.index = pd.CategoricalIndex(range(N), range(N))
        self.series = pd.Series(range(N), index=self.index).sort_index()
        self.category = self.index[500]

    def time_get_loc(self):
        # scalar label lookup
        self.index.get_loc(self.category)

    def time_shape(self):
        # attribute access only
        self.index.shape

    def time_shallow_copy(self):
        self.index._shallow_copy()

    def time_align(self):
        # align the full series against its first 500 rows via
        # DataFrame construction
        pd.DataFrame({'a': self.series, 'b': self.series[:500]})

    def time_intersection(self):
        # overlapping slices: rows 250-749 are shared
        self.index[:750].intersection(self.index[250:])

    def time_unique(self):
        self.index.unique()

    def time_reindex(self):
        self.index.reindex(self.index[:500])

    def time_reindex_missing(self):
        # reindex against labels that are not among the categories
        self.index.reindex(['a', 'b', 'c', 'd'])

    def time_sort_values(self):
        self.index.sort_values(ascending=False)


from .pandas_vb_common import setup # noqa: F401
31 changes: 20 additions & 11 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def setup(self, axis):
index=date_range('20130101', periods=N, freq='s'))
self.empty_left = [DataFrame(), df]
self.empty_right = [df, DataFrame()]
self.mixed_ndims = [df, df.head(N // 2)]

def time_concat_series(self, axis):
concat(self.series, axis=axis, sort=False)
Expand All @@ -61,6 +62,9 @@ def time_concat_empty_right(self, axis):
def time_concat_empty_left(self, axis):
concat(self.empty_left, axis=axis)

def time_concat_mixed_ndims(self, axis):
    # concat a full frame with its half-length head slice
    # (self.mixed_ndims is built in setup as [df, df.head(N // 2)])
    concat(self.mixed_ndims, axis=axis)


class ConcatPanels(object):

Expand Down Expand Up @@ -274,8 +278,10 @@ def time_merge_ordered(self):


class MergeAsof(object):
params = [['backward', 'forward', 'nearest']]
param_names = ['direction']

def setup(self):
def setup(self, direction):
one_count = 200000
two_count = 1000000

Expand Down Expand Up @@ -307,20 +313,23 @@ def setup(self):
self.df1e = df1[['time', 'key', 'key2', 'value1']]
self.df2e = df2[['time', 'key', 'key2', 'value2']]

def time_on_int(self, direction):
    # as-of merge of the two pre-built frames on the integer 'time' column
    merge_asof(self.df1a, self.df2a, on='time', direction=direction)

def time_on_int32(self, direction):
    # as-of merge on the 32-bit 'time32' column
    merge_asof(self.df1d, self.df2d, on='time32', direction=direction)

def time_by_object(self, direction):
    # as-of merge grouped by the object-dtype 'key' column
    merge_asof(self.df1b, self.df2b, on='time', by='key',
               direction=direction)

def time_by_int(self, direction):
    # as-of merge grouped by the integer 'key2' column
    merge_asof(self.df1c, self.df2c, on='time', by='key2',
               direction=direction)

def time_multiby(self, direction):
    # as-of merge grouped by both 'key' and 'key2'
    merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
               direction=direction)


class Align(object):
Expand Down
19 changes: 11 additions & 8 deletions asv_bench/benchmarks/pandas_vb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [np.object]
extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
pd.Int32Dtype, pd.Int64Dtype,
pd.UInt8Dtype, pd.UInt16Dtype,
pd.UInt32Dtype, pd.UInt64Dtype,
pd.CategoricalDtype,
pd.IntervalDtype,
pd.DatetimeTZDtype('ns', 'UTC'),
pd.PeriodDtype('D')]
# Extension dtypes are only present on newer pandas; fall back to an
# empty list so this module still imports against older versions.
try:
    extension_dtypes = ([pd.Int8Dtype, pd.Int16Dtype,
                         pd.Int32Dtype, pd.Int64Dtype] +
                        [pd.UInt8Dtype, pd.UInt16Dtype,
                         pd.UInt32Dtype, pd.UInt64Dtype] +
                        [pd.CategoricalDtype,
                         pd.IntervalDtype,
                         pd.DatetimeTZDtype('ns', 'UTC'),
                         pd.PeriodDtype('D')])
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def time_plot_regular_compat(self):
def time_plot_irregular(self):
self.df2.plot()

def time_plot_table(self):
    # DataFrame.plot with table=True, so the timing includes
    # rendering the accompanying data table
    self.df.plot(table=True)


class Misc(object):

Expand Down
14 changes: 7 additions & 7 deletions asv_bench/benchmarks/reindex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, MultiIndex, Index,
date_range)
from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
period_range)
from .pandas_vb_common import lib


Expand Down Expand Up @@ -35,15 +35,15 @@ def time_reindex_multiindex(self):

class ReindexMethod(object):
    """ASV benchmark for ``Series.reindex`` with a fill method.

    Parametrized over the fill method ('pad' / 'backfill') and the index
    constructor (``date_range`` or ``period_range``).
    """

    params = [['pad', 'backfill'], [date_range, period_range]]
    param_names = ['method', 'constructor']

    def setup(self, method, constructor):
        N = 100000
        # full minutely index; the series keeps every other point, so the
        # reindex below has gaps for 'pad'/'backfill' to fill
        self.idx = constructor('1/1/2000', periods=N, freq='1min')
        self.ts = Series(np.random.randn(N), index=self.idx)[::2]

    def time_reindex_method(self, method, constructor):
        self.ts.reindex(self.idx, method=method)


Expand Down
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,38 @@ def setup(self):
def time_pivot_table(self):
    # baseline pivot: default aggregation, key1 rows x (key2, key3) columns
    self.df.pivot_table(index='key1', columns=['key2', 'key3'])

def time_pivot_table_agg(self):
    # same pivot, but with an explicit list of aggregators
    self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                        aggfunc=['sum', 'mean'])

def time_pivot_table_margins(self):
    # same pivot, with subtotal/grand-total margins added
    self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                        margins=True)


class Crosstab(object):
    """ASV benchmarks for ``pd.crosstab`` and its main options."""

    def setup(self):
        # Two object-dtype label vectors of length 100000, drawn from
        # 3 row labels and 2 column labels respectively.
        size = 100000
        row_labels = np.array(['A', 'B', 'C'], dtype='O')
        col_labels = np.array(['one', 'two'], dtype='O')
        self.ind1 = np.random.randint(0, 3, size=size)
        self.ind2 = np.random.randint(0, 2, size=size)
        self.vec1 = row_labels.take(self.ind1)
        self.vec2 = col_labels.take(self.ind2)

    def time_crosstab(self):
        # plain frequency table
        pd.crosstab(self.vec1, self.vec2)

    def time_crosstab_values(self):
        # aggregate a value column instead of counting
        pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')

    def time_crosstab_normalize(self):
        # frequencies normalized over the whole table
        pd.crosstab(self.vec1, self.vec2, normalize=True)

    def time_crosstab_normalize_margins(self):
        # normalized, with margin (subtotal) rows/columns added
        pd.crosstab(self.vec1, self.vec2, normalize=True, margins=True)


class GetDummies(object):
def setup(self):
Expand Down
Loading