Skip to content

Commit 77f35df

Browse files
committed
PERF: speed up CategoricalIndex.get_loc
1 parent 8963218 commit 77f35df

File tree

6 files changed

+57
-26
lines changed

6 files changed

+57
-26
lines changed

asv_bench/benchmarks/indexing_engines.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,22 @@
11
import numpy as np
22

3-
from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
4-
ObjectEngine)
3+
from pandas._libs.index import (
4+
Int64Engine, Int32Engine, Int16Engine, Int8Engine,
5+
UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine,
6+
Float64Engine, Float32Engine,
7+
ObjectEngine
8+
)
59

610

711
class NumericEngineIndexing(object):
812

913
goal_time = 0.2
10-
params = [[Int64Engine, UInt64Engine, Float64Engine],
11-
[np.int64, np.uint64, np.float64],
14+
params = [[Int64Engine, Int32Engine, Int16Engine, Int8Engine,
15+
UInt64Engine, UInt32Engine, UInt16Engine, UInt8Engine,
16+
Float64Engine, Float32Engine],
17+
[np.int64, np.int32, np.int16, np.int8,
18+
np.uint64, np.uint32, np.uint16, np.uint8,
19+
np.float64, np.float32],
1220
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
1321
]
1422
param_names = ['engine', 'dtype', 'index_type']

doc/source/whatsnew/v0.24.0.txt

+4-3
Original file line numberDiff line numberDiff line change
@@ -759,9 +759,10 @@ Removal of prior version deprecations/changes
759759
Performance Improvements
760760
~~~~~~~~~~~~~~~~~~~~~~~~
761761

762-
- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
763-
both when indexing by label (using .loc) and position(.iloc).
764-
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
762+
- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
763+
is now very fast and has speed comparable to slicing with an ``Int64Index``.
764+
The speed increase applies both when indexing by label (using ``.loc``) and by position (using ``.iloc``) (:issue:`20395`)
765+
- Slicing a :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`)
765766
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
766767
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
767768
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)

pandas/_libs/algos.pyx

+7-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt
1010
import numpy as np
1111
cimport numpy as cnp
1212
from numpy cimport (ndarray,
13-
NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
13+
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
14+
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
1415
NPY_FLOAT32, NPY_FLOAT64,
1516
NPY_OBJECT,
1617
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
@@ -359,8 +360,10 @@ ctypedef fused algos_t:
359360
float64_t
360361
float32_t
361362
object
362-
int32_t
363363
int64_t
364+
int32_t
365+
int16_t
366+
int8_t
364367
uint64_t
365368
uint8_t
366369

@@ -866,6 +869,8 @@ is_monotonic_float32 = is_monotonic["float32_t"]
866869
is_monotonic_object = is_monotonic["object"]
867870
is_monotonic_int64 = is_monotonic["int64_t"]
868871
is_monotonic_int32 = is_monotonic["int32_t"]
872+
is_monotonic_int16 = is_monotonic["int16_t"]
873+
is_monotonic_int8 = is_monotonic["int8_t"]
869874
is_monotonic_uint64 = is_monotonic["uint64_t"]
870875
is_monotonic_bool = is_monotonic["uint8_t"]
871876

pandas/_libs/index.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import cython
55

66
import numpy as np
77
cimport numpy as cnp
8-
from numpy cimport (ndarray, float64_t, int32_t,
9-
int64_t, uint8_t, uint64_t, intp_t,
8+
from numpy cimport (ndarray, intp_t,
9+
float64_t, float32_t,
10+
int64_t, int32_t, int16_t, int8_t,
11+
uint64_t, uint32_t, uint16_t, uint8_t,
1012
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
1113
# for cimport in cython>=0.27.3
1214
NPY_DATETIME, NPY_TIMEDELTA)

pandas/_libs/index_class_helper.pxi.in

+17-13
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1010

1111
{{py:
1212

13-
# name, dtype, ctype
14-
dtypes = [('Float64', 'float64', 'float64_t'),
15-
('UInt64', 'uint64', 'uint64_t'),
16-
('Int64', 'int64', 'int64_t'),
17-
('Object', 'object', 'object')]
13+
# name, dtype, ctype, hashtable_name, hashtable_dtype
14+
dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'),
15+
('Float32', 'float32', 'float32_t', 'Float64', 'float64'),
16+
('Int64', 'int64', 'int64_t', 'Int64', 'int64'),
17+
('Int32', 'int32', 'int32_t', 'Int64', 'int64'),
18+
('Int16', 'int16', 'int16_t', 'Int64', 'int64'),
19+
('Int8', 'int8', 'int8_t', 'Int64', 'int64'),
20+
('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'),
21+
('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'),
22+
('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'),
23+
('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'),
24+
('Object', 'object', 'object', 'PyObject', 'object'),
25+
]
1826
}}
1927

20-
{{for name, dtype, ctype in dtypes}}
28+
{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}}
2129

2230

2331
cdef class {{name}}Engine(IndexEngine):
@@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine):
3442
other, limit=limit)
3543

3644
cdef _make_hash_table(self, n):
37-
{{if name == 'Object'}}
38-
return _hash.PyObjectHashTable(n)
39-
{{else}}
40-
return _hash.{{name}}HashTable(n)
41-
{{endif}}
45+
return _hash.{{hashtable_name}}HashTable(n)
4246

43-
{{if name != 'Float64' and name != 'Object'}}
47+
{{if name not in {'Float64', 'Float32', 'Object'} }}
4448
cdef _check_type(self, object val):
4549
hash(val)
4650
if util.is_bool_object(val):
@@ -60,7 +64,7 @@ cdef class {{name}}Engine(IndexEngine):
6064
ndarray[{{ctype}}] values
6165
int count = 0
6266

63-
{{if name != 'Float64'}}
67+
{{if name not in {'Float64', 'Float32'} }}
6468
if not util.is_integer_object(val):
6569
raise KeyError(val)
6670
{{endif}}

pandas/core/indexes/category.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,17 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
8484
"""
8585

8686
_typ = 'categoricalindex'
87-
_engine_type = libindex.Int64Engine
87+
88+
@property
89+
def _engine_type(self):
90+
# self.codes can have dtype int8, int16, int32 or int64, so we need
91+
# to return the corresponding engine type (libindex.Int8Engine, etc.).
92+
return {np.int8: libindex.Int8Engine,
93+
np.int16: libindex.Int16Engine,
94+
np.int32: libindex.Int32Engine,
95+
np.int64: libindex.Int64Engine,
96+
}[self.codes.dtype.type]
97+
8898
_attributes = ['name']
8999

90100
def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
@@ -382,7 +392,7 @@ def argsort(self, *args, **kwargs):
382392
def _engine(self):
383393

384394
# we are going to look things up with the codes themselves
385-
return self._engine_type(lambda: self.codes.astype('i8'), len(self))
395+
return self._engine_type(lambda: self.codes, len(self))
386396

387397
# introspection
388398
@cache_readonly
@@ -450,6 +460,7 @@ def get_loc(self, key, method=None):
450460
array([False, True, False, True], dtype=bool)
451461
"""
452462
code = self.categories.get_loc(key)
463+
code = self.codes.dtype.type(code)
453464
try:
454465
return self._engine.get_loc(code)
455466
except KeyError:

0 commit comments

Comments
 (0)