Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: ASV inference benchmark #18759

Merged
merged 3 commits into from
Dec 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 72 additions & 76 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,80 @@
from .pandas_vb_common import *
import pandas as pd
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Series, to_numeric

from .pandas_vb_common import numeric_dtypes, lib, setup # noqa

class DtypeInfer(object):
goal_time = 0.2

class NumericInferOps(object):
# from GH 7332
goal_time = 0.2
params = numeric_dtypes
param_names = ['dtype']

def setup(self, dtype):
N = 5 * 10**5
self.df = DataFrame({'A': np.arange(N).astype(dtype),
'B': np.arange(N).astype(dtype)})

def time_add(self, dtype):
self.df['A'] + self.df['B']

def time_subtract(self, dtype):
self.df['A'] - self.df['B']

def setup(self):
self.N = 500000
self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'),
B=np.arange(self.N, dtype='int64')))
self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'),
B=np.arange(self.N, dtype='int32')))
self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'),
B=np.arange(self.N, dtype='uint32')))
self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'),
B=np.arange(self.N, dtype='float64')))
self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'),
B=np.arange(self.N, dtype='float32')))
self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'),
B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']),
B=self.df_datetime64['B']))
def time_multiply(self, dtype):
self.df['A'] * self.df['B']

def time_int64(self):
(self.df_int64['A'] + self.df_int64['B'])
def time_divide(self, dtype):
self.df['A'] / self.df['B']

def time_int32(self):
(self.df_int32['A'] + self.df_int32['B'])
def time_modulo(self, dtype):
self.df['A'] % self.df['B']

def time_uint32(self):
(self.df_uint32['A'] + self.df_uint32['B'])

def time_float64(self):
(self.df_float64['A'] + self.df_float64['B'])
class DateInferOps(object):
# from GH 7332
goal_time = 0.2

def setup_cache(self):
N = 5 * 10**5
df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
df['timedelta'] = df['datetime64'] - df['datetime64']
return df

def time_float32(self):
(self.df_float32['A'] + self.df_float32['B'])
def time_subtract_datetimes(self, df):
df['datetime64'] - df['datetime64']

def time_datetime64(self):
(self.df_datetime64['A'] - self.df_datetime64['B'])
def time_timedelta_plus_datetime(self, df):
df['timedelta'] + df['datetime64']

def time_timedelta64_1(self):
(self.df_timedelta64['A'] + self.df_timedelta64['B'])
def time_add_timedeltas(self, df):
df['timedelta'] + df['timedelta']

def time_timedelta64_2(self):
(self.df_timedelta64['A'] + self.df_timedelta64['A'])

class ToNumeric(object):

class to_numeric(object):
goal_time = 0.2
params = ['ignore', 'coerce']
param_names = ['errors']

def setup(self):
self.n = 10000
self.float = Series(np.random.randn(self.n * 100))
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype('str')
self.str = Series(tm.makeStringIndex(self.n))
self.str = Series(tm.makeStringIndex(N))

def time_from_float(self):
pd.to_numeric(self.float)
def time_from_float(self, errors):
to_numeric(self.float, errors=errors)

def time_from_numeric_str(self):
pd.to_numeric(self.numstr)
def time_from_numeric_str(self, errors):
to_numeric(self.numstr, errors=errors)

def time_from_str_ignore(self):
pd.to_numeric(self.str, errors='ignore')
def time_from_str(self, errors):
to_numeric(self.str, errors=errors)

def time_from_str_coerce(self):
pd.to_numeric(self.str, errors='coerce')


class to_numeric_downcast(object):
class ToNumericDowncast(object):

param_names = ['dtype', 'downcast']
params = [['string-float', 'string-int', 'string-nint', 'datetime64',
Expand All @@ -81,37 +84,30 @@ class to_numeric_downcast(object):
N = 500000
N2 = int(N / 2)

data_dict = {
'string-int': (['1'] * N2) + ([2] * N2),
'string-nint': (['-1'] * N2) + ([2] * N2),
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': (['1.1'] * N2) + ([2] * N2),
'int-list': ([1] * N2) + ([2] * N2),
'int32': np.repeat(np.int32(1), N)
}
data_dict = {'string-int': ['1'] * N2 + [2] * N2,
'string-nint': ['-1'] * N2 + [2] * N2,
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': ['1.1'] * N2 + [2] * N2,
'int-list': [1] * N2 + [2] * N2,
'int32': np.repeat(np.int32(1), N)}

def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]

def time_downcast(self, dtype, downcast):
pd.to_numeric(self.data, downcast=downcast)
to_numeric(self.data, downcast=downcast)


class MaybeConvertNumeric(object):

def setup(self):
n = 1000000
arr = np.repeat([2**63], n)
arr = arr + np.arange(n).astype('uint64')
arr = np.array([arr[i] if i%2 == 0 else
str(arr[i]) for i in range(n)],
dtype=object)

arr[-1] = -1
self.data = arr
self.na_values = set()

def time_convert(self):
lib.maybe_convert_numeric(self.data, self.na_values,
coerce_numeric=False)
def setup_cache(self):
N = 10**6
arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
return data

def time_convert(self, data):
lib.maybe_convert_numeric(data, set(), coerce_numeric=False)
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/pandas_vb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
except ImportError:
pass

numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]

# This function just needs to be imported into each benchmark file in order to
# set up the random seed before each function.
# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
Expand All @@ -39,7 +43,7 @@ def remove(self, f):
def teardown(self, *args, **kwargs):
self.remove(self.fname)

# try em until it works!
# Compatibility import for lib
for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
try:
lib = import_module(imp)
Expand Down