
Commit 46a31c9

qwhelan authored and WillAyd committed

PERF: expand asv benchmark coverage (#24214)

1 parent ea8c9bf

10 files changed (+219, -72 lines)

asv_bench/benchmarks/algorithms.py (+67, -38)

@@ -16,63 +16,75 @@
 
 class Factorize(object):
 
-    params = [True, False]
-    param_names = ['sort']
+    params = [[True, False], ['int', 'uint', 'float', 'string']]
+    param_names = ['sort', 'dtype']
 
-    def setup(self, sort):
+    def setup(self, sort, dtype):
         N = 10**5
-        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
-        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
-        self.string_idx = tm.makeStringIndex(N)
+        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
+                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
+                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
+                'string': tm.makeStringIndex(N).repeat(5)}
+        self.idx = data[dtype]
 
-    def time_factorize_int(self, sort):
-        self.int_idx.factorize(sort=sort)
+    def time_factorize(self, sort, dtype):
+        self.idx.factorize(sort=sort)
 
-    def time_factorize_float(self, sort):
-        self.float_idx.factorize(sort=sort)
 
-    def time_factorize_string(self, sort):
-        self.string_idx.factorize(sort=sort)
+class FactorizeUnique(object):
 
+    params = [[True, False], ['int', 'uint', 'float', 'string']]
+    param_names = ['sort', 'dtype']
 
-class Duplicated(object):
-
-    params = ['first', 'last', False]
-    param_names = ['keep']
-
-    def setup(self, keep):
+    def setup(self, sort, dtype):
         N = 10**5
-        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
-        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
-        self.string_idx = tm.makeStringIndex(N)
-
-    def time_duplicated_int(self, keep):
-        self.int_idx.duplicated(keep=keep)
+        data = {'int': pd.Int64Index(np.arange(N)),
+                'uint': pd.UInt64Index(np.arange(N)),
+                'float': pd.Float64Index(np.arange(N)),
+                'string': tm.makeStringIndex(N)}
+        self.idx = data[dtype]
+        assert self.idx.is_unique
 
-    def time_duplicated_float(self, keep):
-        self.float_idx.duplicated(keep=keep)
+    def time_factorize(self, sort, dtype):
+        self.idx.factorize(sort=sort)
 
-    def time_duplicated_string(self, keep):
-        self.string_idx.duplicated(keep=keep)
 
+class Duplicated(object):
 
-class DuplicatedUniqueIndex(object):
+    params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
+    param_names = ['keep', 'dtype']
 
-    def setup(self):
+    def setup(self, keep, dtype):
         N = 10**5
-        self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
+        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
+                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
+                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
+                'string': tm.makeStringIndex(N).repeat(5)}
+        self.idx = data[dtype]
         # cache is_unique
-        self.idx_int_dup.is_unique
+        self.idx.is_unique
+
+    def time_duplicated(self, keep, dtype):
+        self.idx.duplicated(keep=keep)
+
 
-    def time_duplicated_unique_int(self):
-        self.idx_int_dup.duplicated()
+class DuplicatedUniqueIndex(object):
 
+    params = ['int', 'uint', 'float', 'string']
+    param_names = ['dtype']
 
-class Match(object):
+    def setup(self, dtype):
+        N = 10**5
+        data = {'int': pd.Int64Index(np.arange(N)),
+                'uint': pd.UInt64Index(np.arange(N)),
+                'float': pd.Float64Index(np.random.randn(N)),
+                'string': tm.makeStringIndex(N)}
+        self.idx = data[dtype]
+        # cache is_unique
+        self.idx.is_unique
 
-    def setup(self):
-        self.uniques = tm.makeStringIndex(1000).values
-        self.all = self.uniques.repeat(10)
+    def time_duplicated_unique(self, dtype):
+        self.idx.duplicated()
 
 
 class Hashing(object):

@@ -113,4 +125,21 @@ def time_series_dates(self, df):
         hashing.hash_pandas_object(df['dates'])
 
 
+class Quantile(object):
+    params = [[0, 0.5, 1],
+              ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
+              ['float', 'int', 'uint']]
+    param_names = ['quantile', 'interpolation', 'dtype']
+
+    def setup(self, quantile, interpolation, dtype):
+        N = 10**5
+        data = {'int': np.arange(N),
+                'uint': np.arange(N).astype(np.uint64),
+                'float': np.random.randn(N)}
+        self.idx = pd.Series(data[dtype].repeat(5))
+
+    def time_quantile(self, quantile, interpolation, dtype):
+        self.idx.quantile(quantile, interpolation=interpolation)
+
+
 from .pandas_vb_common import setup  # noqa: F401
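Note on the pattern applied throughout this commit: asv treats params as a grid and calls setup() plus each time_* method once per combination of parameter values, which is why the dtype-specific methods above collapse into a single parametrized method. A minimal sketch of that driving loop (not asv's actual runner, and using plain pd.Index so it also runs on pandas versions without Int64Index or tm.makeStringIndex):

import itertools

import numpy as np
import pandas as pd


class Factorize(object):
    # Same shape as the benchmark class above, but built on plain pd.Index.
    params = [[True, False], ['int', 'float']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**5
        data = {'int': pd.Index(np.arange(N).repeat(5)),
                'float': pd.Index(np.random.randn(N).repeat(5))}
        self.idx = data[dtype]

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)


bench = Factorize()
for combo in itertools.product(*bench.params):
    bench.setup(*combo)           # fresh state for each parameter combination
    bench.time_factorize(*combo)  # asv would time this call
    print('factorize', dict(zip(bench.param_names, combo)), 'ok')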

asv_bench/benchmarks/categoricals.py (+53, -3)

@@ -84,7 +84,8 @@ class ValueCounts(object):
 
     def setup(self, dropna):
         n = 5 * 10**5
-        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
+        arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
+                                                              size=n)]
         self.ts = pd.Series(arr).astype('category')
 
     def time_value_counts(self, dropna):

@@ -104,13 +105,26 @@ class SetCategories(object):
 
     def setup(self):
         n = 5 * 10**5
-        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
+        arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
+                                                              size=n)]
         self.ts = pd.Series(arr).astype('category')
 
     def time_set_categories(self):
         self.ts.cat.set_categories(self.ts.cat.categories[::2])
 
 
+class RemoveCategories(object):
+
+    def setup(self):
+        n = 5 * 10**5
+        arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
+                                                              size=n)]
+        self.ts = pd.Series(arr).astype('category')
+
+    def time_remove_categories(self):
+        self.ts.cat.remove_categories(self.ts.cat.categories[::2])
+
+
 class Rank(object):
 
     def setup(self):

@@ -159,7 +173,7 @@ def setup(self, dtype):
         sample_size = 100
         arr = [i for i in np.random.randint(0, n // 10, size=n)]
         if dtype == 'object':
-            arr = ['s%04d' % i for i in arr]
+            arr = ['s{:04d}'.format(i) for i in arr]
         self.sample = np.random.choice(arr, sample_size)
         self.series = pd.Series(arr).astype('category')
 

@@ -236,4 +250,40 @@ def time_getitem_bool_array(self, index):
         self.data[self.data == self.cat_scalar]
 
 
+class Indexing(object):
+
+    def setup(self):
+        N = 10**5
+        self.index = pd.CategoricalIndex(range(N), range(N))
+        self.series = pd.Series(range(N), index=self.index).sort_index()
+        self.category = self.index[500]
+
+    def time_get_loc(self):
+        self.index.get_loc(self.category)
+
+    def time_shape(self):
+        self.index.shape
+
+    def time_shallow_copy(self):
+        self.index._shallow_copy()
+
+    def time_align(self):
+        pd.DataFrame({'a': self.series, 'b': self.series[:500]})
+
+    def time_intersection(self):
+        self.index[:750].intersection(self.index[250:])
+
+    def time_unique(self):
+        self.index.unique()
+
+    def time_reindex(self):
+        self.index.reindex(self.index[:500])
+
+    def time_reindex_missing(self):
+        self.index.reindex(['a', 'b', 'c', 'd'])
+
+    def time_sort_values(self):
+        self.index.sort_values(ascending=False)
+
+
 from .pandas_vb_common import setup  # noqa: F401
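For reference, the operations exercised by the new Indexing benchmark look like this at a much smaller scale (an illustrative sketch, not the benchmark itself):

import pandas as pd

N = 1000
index = pd.CategoricalIndex(range(N), range(N))
series = pd.Series(range(N), index=index).sort_index()

index.get_loc(index[500])                       # scalar label lookup
index[:750].intersection(index[250:])           # set operation on the index
index.reindex(index[:500])                      # returns (new_index, indexer)
pd.DataFrame({'a': series, 'b': series[:500]})  # alignment on the categorical index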

asv_bench/benchmarks/join_merge.py (+20, -11)

@@ -48,6 +48,7 @@ def setup(self, axis):
                        index=date_range('20130101', periods=N, freq='s'))
         self.empty_left = [DataFrame(), df]
         self.empty_right = [df, DataFrame()]
+        self.mixed_ndims = [df, df.head(N // 2)]
 
     def time_concat_series(self, axis):
         concat(self.series, axis=axis, sort=False)

@@ -61,6 +62,9 @@ def time_concat_empty_right(self, axis):
     def time_concat_empty_left(self, axis):
         concat(self.empty_left, axis=axis)
 
+    def time_concat_mixed_ndims(self, axis):
+        concat(self.mixed_ndims, axis=axis)
+
 
 class ConcatPanels(object):
 

@@ -274,8 +278,10 @@ def time_merge_ordered(self):
 
 
 class MergeAsof(object):
+    params = [['backward', 'forward', 'nearest']]
+    param_names = ['direction']
 
-    def setup(self):
+    def setup(self, direction):
         one_count = 200000
         two_count = 1000000
 

@@ -307,20 +313,23 @@ def setup(self):
         self.df1e = df1[['time', 'key', 'key2', 'value1']]
         self.df2e = df2[['time', 'key', 'key2', 'value2']]
 
-    def time_on_int(self):
-        merge_asof(self.df1a, self.df2a, on='time')
+    def time_on_int(self, direction):
+        merge_asof(self.df1a, self.df2a, on='time', direction=direction)
 
-    def time_on_int32(self):
-        merge_asof(self.df1d, self.df2d, on='time32')
+    def time_on_int32(self, direction):
+        merge_asof(self.df1d, self.df2d, on='time32', direction=direction)
 
-    def time_by_object(self):
-        merge_asof(self.df1b, self.df2b, on='time', by='key')
+    def time_by_object(self, direction):
+        merge_asof(self.df1b, self.df2b, on='time', by='key',
+                   direction=direction)
 
-    def time_by_int(self):
-        merge_asof(self.df1c, self.df2c, on='time', by='key2')
+    def time_by_int(self, direction):
+        merge_asof(self.df1c, self.df2c, on='time', by='key2',
+                   direction=direction)
 
-    def time_multiby(self):
-        merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])
+    def time_multiby(self, direction):
+        merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
+                   direction=direction)
 
 
 class Align(object):
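The direction keyword that MergeAsof now sweeps over is a public merge_asof argument; a tiny illustrative example (toy data, not the benchmark frames):

import pandas as pd

left = pd.DataFrame({'time': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
right = pd.DataFrame({'time': [2, 3, 6, 7], 'right_val': [1, 2, 3, 4]})

for direction in ['backward', 'forward', 'nearest']:
    # 'backward' takes the last right row with time <= the left time,
    # 'forward' the first with time >= it, 'nearest' the closest either way.
    print(direction)
    print(pd.merge_asof(left, right, on='time', direction=direction))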

asv_bench/benchmarks/pandas_vb_common.py (+11, -8)

@@ -16,14 +16,17 @@
                   np.float64, np.int16, np.int8, np.uint16, np.uint8]
 datetime_dtypes = [np.datetime64, np.timedelta64]
 string_dtypes = [np.object]
-extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
-                    pd.Int32Dtype, pd.Int64Dtype,
-                    pd.UInt8Dtype, pd.UInt16Dtype,
-                    pd.UInt32Dtype, pd.UInt64Dtype,
-                    pd.CategoricalDtype,
-                    pd.IntervalDtype,
-                    pd.DatetimeTZDtype('ns', 'UTC'),
-                    pd.PeriodDtype('D')]
+try:
+    extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
+                        pd.Int32Dtype, pd.Int64Dtype,
+                        pd.UInt8Dtype, pd.UInt16Dtype,
+                        pd.UInt32Dtype, pd.UInt64Dtype,
+                        pd.CategoricalDtype,
+                        pd.IntervalDtype,
+                        pd.DatetimeTZDtype('ns', 'UTC'),
+                        pd.PeriodDtype('D')]
+except AttributeError:
+    extension_dtypes = []
 
 
 def setup(*args, **kwargs):
asv_bench/benchmarks/plotting.py (+3)

@@ -74,6 +74,9 @@ def time_plot_regular_compat(self):
     def time_plot_irregular(self):
         self.df2.plot()
 
+    def time_plot_table(self):
+        self.df.plot(table=True)
+
 
 class Misc(object):
 
asv_bench/benchmarks/reindex.py (+7, -7)

@@ -1,7 +1,7 @@
 import numpy as np
 import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex, Index,
-                    date_range)
+from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
+                    period_range)
 from .pandas_vb_common import lib
 
 

@@ -35,15 +35,15 @@ def time_reindex_multiindex(self):
 
 class ReindexMethod(object):
 
-    params = ['pad', 'backfill']
-    param_names = ['method']
+    params = [['pad', 'backfill'], [date_range, period_range]]
+    param_names = ['method', 'constructor']
 
-    def setup(self, method):
+    def setup(self, method, constructor):
         N = 100000
-        self.idx = date_range('1/1/2000', periods=N, freq='1min')
+        self.idx = constructor('1/1/2000', periods=N, freq='1min')
         self.ts = Series(np.random.randn(N), index=self.idx)[::2]
 
-    def time_reindex_method(self, method):
+    def time_reindex_method(self, method, constructor):
         self.ts.reindex(self.idx, method=method)
 
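At a small scale, the reindex-with-fill path that ReindexMethod now covers for both index constructors looks roughly like this (illustrative sketch with a tiny N):

import numpy as np
from pandas import Series, date_range, period_range

for constructor in (date_range, period_range):
    idx = constructor('1/1/2000', periods=10, freq='1min')
    ts = Series(np.random.randn(10), index=idx)[::2]
    # Forward-fill the gaps created by the [::2] slice.
    print(ts.reindex(idx, method='pad'))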

asv_bench/benchmarks/reshape.py (+32)

@@ -131,6 +131,38 @@ def setup(self):
     def time_pivot_table(self):
         self.df.pivot_table(index='key1', columns=['key2', 'key3'])
 
+    def time_pivot_table_agg(self):
+        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
+                            aggfunc=['sum', 'mean'])
+
+    def time_pivot_table_margins(self):
+        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
+                            margins=True)
+
+
+class Crosstab(object):
+
+    def setup(self):
+        N = 100000
+        fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        fac2 = np.array(['one', 'two'], dtype='O')
+        self.ind1 = np.random.randint(0, 3, size=N)
+        self.ind2 = np.random.randint(0, 2, size=N)
+        self.vec1 = fac1.take(self.ind1)
+        self.vec2 = fac2.take(self.ind2)
+
+    def time_crosstab(self):
+        pd.crosstab(self.vec1, self.vec2)
+
+    def time_crosstab_values(self):
+        pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')
+
+    def time_crosstab_normalize(self):
+        pd.crosstab(self.vec1, self.vec2, normalize=True)
+
+    def time_crosstab_normalize_margins(self):
+        pd.crosstab(self.vec1, self.vec2, normalize=True, margins=True)
+
 
 class GetDummies(object):
     def setup(self):
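The crosstab variants added above can be tried on toy input (a small sketch, not the benchmark data):

import numpy as np
import pandas as pd

vec1 = np.random.choice(['A', 'B', 'C'], size=100)
vec2 = np.random.choice(['one', 'two'], size=100)
values = np.random.randint(0, 10, size=100)

print(pd.crosstab(vec1, vec2))                                # plain counts
print(pd.crosstab(vec1, vec2, values=values, aggfunc='sum'))  # aggregated values
print(pd.crosstab(vec1, vec2, normalize=True, margins=True))  # proportions with margins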
