Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: expand asv benchmark coverage #24214

Merged
merged 15 commits into from
Jan 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 67 additions & 38 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,63 +16,75 @@

class Factorize(object):
    """ASV benchmark for ``Index.factorize`` on repeated (non-unique) data.

    Parametrized over ``sort`` and the index dtype; each index holds
    10**5 distinct values repeated 5 times.
    """

    params = [[True, False], ['int', 'uint', 'float', 'string']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**5
        # every value repeated 5 times so factorize sees duplicates
        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
                'string': tm.makeStringIndex(N).repeat(5)}
        self.idx = data[dtype]

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)
class FactorizeUnique(object):
    """ASV benchmark for ``Index.factorize`` on all-unique data.

    Counterpart to ``Factorize``: same dtypes, but no repeated values,
    which is asserted in ``setup``.
    """

    params = [[True, False], ['int', 'uint', 'float', 'string']]
    param_names = ['sort', 'dtype']

    def setup(self, sort, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N)),
                'uint': pd.UInt64Index(np.arange(N)),
                'float': pd.Float64Index(np.arange(N)),
                'string': tm.makeStringIndex(N)}
        self.idx = data[dtype]
        # guard: this benchmark is specifically about the unique path
        assert self.idx.is_unique

    def time_factorize(self, sort, dtype):
        self.idx.factorize(sort=sort)

class Duplicated(object):
    """ASV benchmark for ``Index.duplicated`` on indexes with duplicates.

    Parametrized over ``keep`` ('first' / 'last' / False) and dtype; each
    index holds 10**5 distinct values repeated 5 times.
    """

    params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
    param_names = ['keep', 'dtype']

    def setup(self, keep, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
                'uint': pd.UInt64Index(np.arange(N).repeat(5)),
                'float': pd.Float64Index(np.random.randn(N).repeat(5)),
                'string': tm.makeStringIndex(N).repeat(5)}
        self.idx = data[dtype]
        # cache is_unique so the timed call does not pay for it
        self.idx.is_unique

    def time_duplicated(self, keep, dtype):
        self.idx.duplicated(keep=keep)
class DuplicatedUniqueIndex(object):
    """ASV benchmark for ``Index.duplicated`` when the index is unique.

    No ``keep`` parameter: with all-unique values the result is all-False
    regardless, so only the dtype is varied.
    """

    params = ['int', 'uint', 'float', 'string']
    param_names = ['dtype']

    def setup(self, dtype):
        N = 10**5
        data = {'int': pd.Int64Index(np.arange(N)),
                'uint': pd.UInt64Index(np.arange(N)),
                'float': pd.Float64Index(np.random.randn(N)),
                'string': tm.makeStringIndex(N)}
        self.idx = data[dtype]
        # cache is_unique so the timed call does not pay for it
        self.idx.is_unique

    def time_duplicated_unique(self, dtype):
        self.idx.duplicated()


class Hashing(object):
Expand Down Expand Up @@ -113,4 +125,21 @@ def time_series_dates(self, df):
hashing.hash_pandas_object(df['dates'])


class Quantile(object):
    """ASV benchmark for ``Series.quantile``.

    Parametrized over the quantile (0 / 0.5 / 1), the interpolation
    method, and the underlying dtype.  The series holds 10**5 distinct
    values repeated 5 times.
    """

    params = [[0, 0.5, 1],
              ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
              ['float', 'int', 'uint']]
    param_names = ['quantile', 'interpolation', 'dtype']

    def setup(self, quantile, interpolation, dtype):
        N = 10**5
        data = {'int': np.arange(N),
                'uint': np.arange(N).astype(np.uint64),
                'float': np.random.randn(N)}
        self.idx = pd.Series(data[dtype].repeat(5))

    def time_quantile(self, quantile, interpolation, dtype):
        self.idx.quantile(quantile, interpolation=interpolation)


from .pandas_vb_common import setup # noqa: F401
56 changes: 53 additions & 3 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ class ValueCounts(object):

def setup(self, dropna):
    # 5*10**5 strings drawn from ~n//10 distinct 's0000'-style labels,
    # stored as a categorical Series for the value_counts benchmark.
    n = 5 * 10**5
    arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
                                                          size=n)]
    self.ts = pd.Series(arr).astype('category')

def time_value_counts(self, dropna):
Expand All @@ -104,13 +105,26 @@ class SetCategories(object):

def setup(self):
    # 5*10**5 strings drawn from ~n//10 distinct 's0000'-style labels,
    # stored as a categorical Series for the set_categories benchmark.
    n = 5 * 10**5
    arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
                                                          size=n)]
    self.ts = pd.Series(arr).astype('category')

def time_set_categories(self):
    # Replace the categories with every other existing category.
    self.ts.cat.set_categories(self.ts.cat.categories[::2])


class RemoveCategories(object):
    """ASV benchmark for ``Series.cat.remove_categories``."""

    def setup(self):
        # Build a large categorical Series: 5*10**5 string labels drawn
        # from a pool of ~size//10 distinct values.
        size = 5 * 10**5
        codes = np.random.randint(0, size // 10, size=size)
        labels = ['s{:04d}'.format(code) for code in codes]
        self.ts = pd.Series(labels).astype('category')

    def time_remove_categories(self):
        # Drop every other existing category.
        self.ts.cat.remove_categories(self.ts.cat.categories[::2])


class Rank(object):

def setup(self):
Expand Down Expand Up @@ -159,7 +173,7 @@ def setup(self, dtype):
sample_size = 100
arr = [i for i in np.random.randint(0, n // 10, size=n)]
if dtype == 'object':
arr = ['s%04d' % i for i in arr]
arr = ['s{:04d}'.format(i) for i in arr]
self.sample = np.random.choice(arr, sample_size)
self.series = pd.Series(arr).astype('category')

Expand Down Expand Up @@ -236,4 +250,40 @@ def time_getitem_bool_array(self, index):
self.data[self.data == self.cat_scalar]


class Indexing(object):
    """ASV benchmarks for common operations on a large CategoricalIndex."""

    def setup(self):
        N = 10**5
        # N codes over N distinct categories
        self.index = pd.CategoricalIndex(range(N), range(N))
        self.series = pd.Series(range(N), index=self.index).sort_index()
        self.category = self.index[500]

    def time_get_loc(self):
        # scalar label lookup
        self.index.get_loc(self.category)

    def time_shape(self):
        # attribute access only
        self.index.shape

    def time_shallow_copy(self):
        self.index._shallow_copy()

    def time_align(self):
        # align the full series against its first 500 rows via
        # DataFrame construction
        pd.DataFrame({'a': self.series, 'b': self.series[:500]})

    def time_intersection(self):
        # overlapping slices: rows 250-749 are shared
        self.index[:750].intersection(self.index[250:])

    def time_unique(self):
        self.index.unique()

    def time_reindex(self):
        self.index.reindex(self.index[:500])

    def time_reindex_missing(self):
        # reindex against labels that are not among the categories
        self.index.reindex(['a', 'b', 'c', 'd'])

    def time_sort_values(self):
        self.index.sort_values(ascending=False)


from .pandas_vb_common import setup # noqa: F401
31 changes: 20 additions & 11 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def setup(self, axis):
index=date_range('20130101', periods=N, freq='s'))
self.empty_left = [DataFrame(), df]
self.empty_right = [df, DataFrame()]
self.mixed_ndims = [df, df.head(N // 2)]

def time_concat_series(self, axis):
concat(self.series, axis=axis, sort=False)
Expand All @@ -61,6 +62,9 @@ def time_concat_empty_right(self, axis):
def time_concat_empty_left(self, axis):
concat(self.empty_left, axis=axis)

def time_concat_mixed_ndims(self, axis):
    # concat a full frame with its half-length head slice
    # (self.mixed_ndims is built in setup as [df, df.head(N // 2)])
    concat(self.mixed_ndims, axis=axis)


class ConcatPanels(object):

Expand Down Expand Up @@ -274,8 +278,10 @@ def time_merge_ordered(self):


class MergeAsof(object):
params = [['backward', 'forward', 'nearest']]
param_names = ['direction']

def setup(self):
def setup(self, direction):
one_count = 200000
two_count = 1000000

Expand Down Expand Up @@ -307,20 +313,23 @@ def setup(self):
self.df1e = df1[['time', 'key', 'key2', 'value1']]
self.df2e = df2[['time', 'key', 'key2', 'value2']]

def time_on_int(self, direction):
    # as-of merge of the two pre-built frames on the integer 'time' column
    merge_asof(self.df1a, self.df2a, on='time', direction=direction)

def time_on_int32(self, direction):
    # as-of merge on the 32-bit 'time32' column
    merge_asof(self.df1d, self.df2d, on='time32', direction=direction)

def time_by_object(self, direction):
    # as-of merge grouped by the object-dtype 'key' column
    merge_asof(self.df1b, self.df2b, on='time', by='key',
               direction=direction)

def time_by_int(self, direction):
    # as-of merge grouped by the integer 'key2' column
    merge_asof(self.df1c, self.df2c, on='time', by='key2',
               direction=direction)

def time_multiby(self, direction):
    # as-of merge grouped by both 'key' and 'key2'
    merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
               direction=direction)


class Align(object):
Expand Down
19 changes: 11 additions & 8 deletions asv_bench/benchmarks/pandas_vb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [np.object]
extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
pd.Int32Dtype, pd.Int64Dtype,
pd.UInt8Dtype, pd.UInt16Dtype,
pd.UInt32Dtype, pd.UInt64Dtype,
pd.CategoricalDtype,
pd.IntervalDtype,
pd.DatetimeTZDtype('ns', 'UTC'),
pd.PeriodDtype('D')]
# Extension dtypes are only present on newer pandas; fall back to an
# empty list so this module still imports against older versions.
try:
    extension_dtypes = ([pd.Int8Dtype, pd.Int16Dtype,
                         pd.Int32Dtype, pd.Int64Dtype] +
                        [pd.UInt8Dtype, pd.UInt16Dtype,
                         pd.UInt32Dtype, pd.UInt64Dtype] +
                        [pd.CategoricalDtype,
                         pd.IntervalDtype,
                         pd.DatetimeTZDtype('ns', 'UTC'),
                         pd.PeriodDtype('D')])
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def time_plot_regular_compat(self):
def time_plot_irregular(self):
self.df2.plot()

def time_plot_table(self):
    # DataFrame.plot with table=True, so the timing includes
    # rendering the accompanying data table
    self.df.plot(table=True)


class Misc(object):

Expand Down
14 changes: 7 additions & 7 deletions asv_bench/benchmarks/reindex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, MultiIndex, Index,
date_range)
from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
period_range)
from .pandas_vb_common import lib


Expand Down Expand Up @@ -35,15 +35,15 @@ def time_reindex_multiindex(self):

class ReindexMethod(object):
    """ASV benchmark for ``Series.reindex`` with a fill method.

    Parametrized over the fill method ('pad' / 'backfill') and the index
    constructor (``date_range`` or ``period_range``).
    """

    params = [['pad', 'backfill'], [date_range, period_range]]
    param_names = ['method', 'constructor']

    def setup(self, method, constructor):
        N = 100000
        # full minutely index; the series keeps every other point, so the
        # reindex below has gaps for 'pad'/'backfill' to fill
        self.idx = constructor('1/1/2000', periods=N, freq='1min')
        self.ts = Series(np.random.randn(N), index=self.idx)[::2]

    def time_reindex_method(self, method, constructor):
        self.ts.reindex(self.idx, method=method)


Expand Down
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,38 @@ def setup(self):
def time_pivot_table(self):
    # baseline pivot: default aggregation, key1 rows x (key2, key3) columns
    self.df.pivot_table(index='key1', columns=['key2', 'key3'])

def time_pivot_table_agg(self):
    # same pivot, but with an explicit list of aggregators
    self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                        aggfunc=['sum', 'mean'])

def time_pivot_table_margins(self):
    # same pivot, with subtotal/grand-total margins added
    self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                        margins=True)


class Crosstab(object):
    """ASV benchmarks for ``pd.crosstab`` and its main options."""

    def setup(self):
        # Two object-dtype label vectors of length 100000, drawn from
        # 3 row labels and 2 column labels respectively.
        size = 100000
        row_labels = np.array(['A', 'B', 'C'], dtype='O')
        col_labels = np.array(['one', 'two'], dtype='O')
        self.ind1 = np.random.randint(0, 3, size=size)
        self.ind2 = np.random.randint(0, 2, size=size)
        self.vec1 = row_labels.take(self.ind1)
        self.vec2 = col_labels.take(self.ind2)

    def time_crosstab(self):
        # plain frequency table
        pd.crosstab(self.vec1, self.vec2)

    def time_crosstab_values(self):
        # aggregate a value column instead of counting
        pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')

    def time_crosstab_normalize(self):
        # frequencies normalized over the whole table
        pd.crosstab(self.vec1, self.vec2, normalize=True)

    def time_crosstab_normalize_margins(self):
        # normalized, with margin (subtotal) rows/columns added
        pd.crosstab(self.vec1, self.vec2, normalize=True, margins=True)


class GetDummies(object):
def setup(self):
Expand Down
Loading