CLN: clean benchmarks to get them running (#16025)

jorisvandenbossche · web-flow · commit c6060a80933f · 2017-04-17T13:14:38.000+02:00
* fix lib and algos import

* fix take_1d import

* string uppercase -> ascii_uppercase (py3 compat)

* sas test file path

* fix datetools usage

* fix hashing benchmarks

* dict values py3 compat

* avoid overflow by using higher freq

* xrange -> range

* fix xport path

* revised hdfstore_bench to use new query syntax
rename table variables

* change default python version to 3.6
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -26,7 +26,7 @@
     // The Pythons you'd like to test against.  If not provided, defaults
     // to the current version of Python used to run `asv`.
     // "pythons": ["2.7", "3.4"],
-    "pythons": ["2.7"],
+    "pythons": ["3.6"],
 
     // The matrix of dependencies to test.  Each key is the name of a
     // package (in PyPI) and the values are version numbers.  An empty
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -2,6 +2,11 @@
 import pandas as pd
 from pandas.util import testing as tm
 
+try:
+    from pandas.tools.hashing import hash_pandas_object
+except ImportError:
+    pass
+
 
 class Algorithms(object):
     goal_time = 0.2
@@ -103,13 +108,13 @@ def setup(self):
         self.df.iloc[10:20] = np.nan
 
     def time_frame(self):
-        self.df.hash()
+        hash_pandas_object(self.df)
 
     def time_series_int(self):
-        self.df.E.hash()
+        hash_pandas_object(self.df.E)
 
     def time_series_string(self):
-        self.df.B.hash()
+        hash_pandas_object(self.df.B)
 
     def time_series_categorical(self):
-        self.df.C.hash()
+        hash_pandas_object(self.df.C)
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
@@ -20,12 +20,12 @@ def setup(self):
             self.data = self.frame.to_dict()
         except:
             self.data = self.frame.toDict()
-        self.some_dict = self.data.values()[0]
+        self.some_dict = list(self.data.values())[0]
         self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
 
         self.data2 = dict(
             ((i, dict(((j, float(j)) for j in range(100)))) for i in
-             xrange(2000)))
+             range(2000)))
 
     def time_frame_ctor_list_of_dict(self):
         DataFrame(self.dict_list)
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -56,7 +56,7 @@ def time_reindex_both_axes_ix(self):
         self.df.ix[(self.idx, self.idx)]
 
     def time_reindex_upcast(self):
-        self.df2.reindex(permutation(range(1200)))
+        self.df2.reindex(np.random.permutation(range(1200)))
 
 
 #----------------------------------------------------------------------
@@ -583,7 +583,7 @@ class frame_assign_timeseries_index(object):
     goal_time = 0.2
 
     def setup(self):
-        self.idx = date_range('1/1/2000', periods=100000, freq='D')
+        self.idx = date_range('1/1/2000', periods=100000, freq='H')
         self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx)
 
     def time_frame_assign_timeseries_index(self):
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
@@ -1,11 +1,17 @@
 from .pandas_vb_common import *
-from pandas.core import common as com
+
+from pandas.core.algorithms import take_1d
 
 try:
     from cStringIO import StringIO
 except ImportError:
     from io import StringIO
 
+try:
+    from pandas._libs import algos
+except ImportError:
+    from pandas import algos
+
 try:
     from pandas.util.testing import test_parallel
 
@@ -167,11 +173,11 @@ def time_nogil_take1d_float64(self):
 
     @test_parallel(num_threads=2)
     def take_1d_pg2_int64(self):
-        com.take_1d(self.df.int64.values, self.indexer)
+        take_1d(self.df.int64.values, self.indexer)
 
     @test_parallel(num_threads=2)
     def take_1d_pg2_float64(self):
-        com.take_1d(self.df.float64.values, self.indexer)
+        take_1d(self.df.float64.values, self.indexer)
 
 
 class nogil_take1d_int64(object):
@@ -193,11 +199,11 @@ def time_nogil_take1d_int64(self):
 
     @test_parallel(num_threads=2)
     def take_1d_pg2_int64(self):
-        com.take_1d(self.df.int64.values, self.indexer)
+        take_1d(self.df.int64.values, self.indexer)
 
     @test_parallel(num_threads=2)
     def take_1d_pg2_float64(self):
-        com.take_1d(self.df.float64.values, self.indexer)
+        take_1d(self.df.float64.values, self.indexer)
 
 
 class nogil_kth_smallest(object):
@@ -226,7 +232,7 @@ class nogil_datetime_fields(object):
 
     def setup(self):
         self.N = 100000000
-        self.dti = pd.date_range('1900-01-01', periods=self.N, freq='D')
+        self.dti = pd.date_range('1900-01-01', periods=self.N, freq='T')
         self.period = self.dti.to_period('D')
         if (not have_real_test_parallel):
             raise NotImplementedError
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -331,7 +331,7 @@ def setup(self):
 
     def get_test_data(self, ngroups=100, n=100000):
         self.unique_groups = range(self.ngroups)
-        self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object)
+        self.arr = np.asarray(np.tile(self.unique_groups, int(n / self.ngroups)), dtype=object)
         if (len(self.arr) < n):
             self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
         random.shuffle(self.arr)
diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py
@@ -31,16 +31,12 @@ def setup(self):
         self.remove(self.f)
 
         self.store = HDFStore(self.f)
-        self.store.put('df1', self.df)
-        self.store.put('df_mixed', self.df_mixed)
-
-        self.store.append('df5', self.df_mixed)
-        self.store.append('df7', self.df)
-
-        self.store.append('df9', self.df_wide)
-
-        self.store.append('df11', self.df_wide2)
-        self.store.append('df12', self.df2)
+        self.store.put('fixed', self.df)
+        self.store.put('fixed_mixed', self.df_mixed)
+        self.store.append('table', self.df2)
+        self.store.append('table_mixed', self.df_mixed)
+        self.store.append('table_wide', self.df_wide)
+        self.store.append('table_wide2', self.df_wide2)
 
     def teardown(self):
         self.store.close()
@@ -52,45 +48,47 @@ def remove(self, f):
             pass
 
     def time_read_store(self):
-        self.store.get('df1')
+        self.store.get('fixed')
 
     def time_read_store_mixed(self):
-        self.store.get('df_mixed')
+        self.store.get('fixed_mixed')
 
     def time_write_store(self):
-        self.store.put('df2', self.df)
+        self.store.put('fixed_write', self.df)
 
     def time_write_store_mixed(self):
-        self.store.put('df_mixed2', self.df_mixed)
+        self.store.put('fixed_mixed_write', self.df_mixed)
 
     def time_read_store_table_mixed(self):
-        self.store.select('df5')
+        self.store.select('table_mixed')
 
     def time_write_store_table_mixed(self):
-        self.store.append('df6', self.df_mixed)
+        self.store.append('table_mixed_write', self.df_mixed)
 
     def time_read_store_table(self):
-        self.store.select('df7')
+        self.store.select('table')
 
     def time_write_store_table(self):
-        self.store.append('df8', self.df)
+        self.store.append('table_write', self.df)
 
     def time_read_store_table_wide(self):
-        self.store.select('df9')
+        self.store.select('table_wide')
 
     def time_write_store_table_wide(self):
-        self.store.append('df10', self.df_wide)
+        self.store.append('table_wide_write', self.df_wide)
 
     def time_write_store_table_dc(self):
-        self.store.append('df15', self.df, data_columns=True)
+        self.store.append('table_dc_write', self.df_dc, data_columns=True)
 
     def time_query_store_table_wide(self):
-        self.store.select('df11', [('index', '>', self.df_wide2.index[10000]),
-                                   ('index', '<', self.df_wide2.index[15000])])
+        start = self.df_wide2.index[10000]
+        stop = self.df_wide2.index[15000]
+        self.store.select('table_wide', where="index > start and index < stop")
 
     def time_query_store_table(self):
-        self.store.select('df12', [('index', '>', self.df2.index[10000]),
-                                   ('index', '<', self.df2.index[15000])])
+        start = self.df2.index[10000]
+        stop = self.df2.index[15000]
+        self.store.select('table', where="index > start and index < stop")
 
 
 class HDF5Panel(object):
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
@@ -113,5 +113,5 @@ def setup(self):
         self.na_values = set()
 
     def time_convert(self):
-        pd.lib.maybe_convert_numeric(self.data, self.na_values,
-                                     coerce_numeric=False)
+        lib.maybe_convert_numeric(self.data, self.na_values,
+                                  coerce_numeric=False)
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -314,12 +314,12 @@ def setup(self):
 
         self.df1 = pd.DataFrame(
             {'time': np.random.randint(0, one_count / 20, one_count),
-             'key': np.random.choice(list(string.uppercase), one_count),
+             'key': np.random.choice(list(string.ascii_uppercase), one_count),
              'key2': np.random.randint(0, 25, one_count),
              'value1': np.random.randn(one_count)})
         self.df2 = pd.DataFrame(
             {'time': np.random.randint(0, two_count / 20, two_count),
-             'key': np.random.choice(list(string.uppercase), two_count),
+             'key': np.random.choice(list(string.ascii_uppercase), two_count),
              'key2': np.random.randint(0, 25, two_count),
              'value2': np.random.randn(two_count)})
 
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
@@ -153,18 +153,20 @@ def time_packers_read_stata_with_validation(self):
 class packers_read_sas(_Packers):
 
     def setup(self):
-        self.f = os.path.join(os.path.dirname(__file__), '..', '..',
-                              'pandas', 'io', 'tests', 'sas', 'data',
-                              'test1.sas7bdat')
-        self.f2 = os.path.join(os.path.dirname(__file__), '..', '..',
-                               'pandas', 'io', 'tests', 'sas', 'data',
-                               'paxraw_d_short.xpt')
+
+        testdir = os.path.join(os.path.dirname(__file__), '..', '..',
+                               'pandas', 'tests', 'io', 'sas')
+        if not os.path.exists(testdir):
+            testdir = os.path.join(os.path.dirname(__file__), '..', '..',
+                                   'pandas', 'io', 'tests', 'sas')
+        self.f = os.path.join(testdir, 'data', 'test1.sas7bdat')
+        self.f2 = os.path.join(testdir, 'data', 'paxraw_d_short.xpt')
 
     def time_read_sas7bdat(self):
         pd.read_sas(self.f, format='sas7bdat')
 
     def time_read_xport(self):
-        pd.read_sas(self.f, format='xport')
+        pd.read_sas(self.f2, format='xport')
 
 
 class CSV(_Packers):
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
@@ -1,9 +1,7 @@
 from pandas import *
 import pandas as pd
-from datetime import timedelta
 from numpy.random import randn
 from numpy.random import randint
-from numpy.random import permutation
 import pandas.util.testing as tm
 import random
 import numpy as np
@@ -18,7 +16,7 @@
 np.random.seed(1234)
 
 # try em until it works!
-for imp in ['pandas_tseries', 'pandas.lib', 'pandas._libs.lib']:
+for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
     try:
         lib = import_module(imp)
         break
diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py
@@ -1,4 +1,5 @@
 from .pandas_vb_common import *
+from datetime import timedelta
 
 
 class Constructors1(object):
@@ -24,7 +25,7 @@ class Constructors2(object):
     def setup(self):
         self.data_frames = {}
         for x in range(100):
-            self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1)))
+            self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq='D'))
             self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
             self.data_frames[x] = self.df
 
@@ -36,7 +37,7 @@ class Constructors3(object):
     goal_time = 0.2
 
     def setup(self):
-        self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1)))
+        self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq='D'))
         self.data_frames = {}
         for x in range(100):
             self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
@@ -1,6 +1,4 @@
 from .pandas_vb_common import *
-from pandas.compat import range
-from datetime import timedelta
 
 
 class replace_fillna(object):
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
@@ -4,7 +4,6 @@
     from pandas.tseries.converter import DatetimeConverter
 from .pandas_vb_common import *
 import pandas as pd
-from datetime import timedelta
 import datetime as dt
 try:
     import pandas.tseries.holiday
@@ -57,7 +56,7 @@ def setup(self):
         self.a = self.rng7[:50000].append(self.rng7[50002:])
 
     def time_add_timedelta(self):
-        (self.rng + timedelta(minutes=2))
+        (self.rng + dt.timedelta(minutes=2))
 
     def time_add_offset_delta(self):
         (self.rng + self.delta_offset)