
Commit b578c0b

use python's built-in C optimized functions as much as possible
1 parent: 9561285

2 files changed: +258 −119


python/interpret-core/interpret/utils/_clean_x.py (+79 −50)
@@ -2,8 +2,10 @@
 # Distributed under the MIT software license
 
 import logging
+from warnings import warn
 from collections import Counter
-from itertools import count
+from itertools import count, repeat, compress
+import operator
 
 import numpy as np
 from numpy import ma
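
The theme of the commit: a generator expression re-enters the interpreter for every element, while map, operator, and itertools push the per-element work down into C. A rough, machine-dependent way to see the difference (the feature list and timings here are purely illustrative):

    import operator
    from itertools import repeat
    from timeit import timeit

    feature_types = ["continuous", "ignore"] * 50_000

    # Same count computed two ways: a Python-level genexpr vs. C-level map().
    genexpr = timeit(
        lambda: sum(1 for t in feature_types if t == "ignore"), number=20
    )
    c_funcs = timeit(
        lambda: sum(map(operator.eq, repeat("ignore"), feature_types)), number=20
    )
    print(f"genexpr: {genexpr:.3f}s  map/operator: {c_funcs:.3f}s")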
@@ -410,10 +412,10 @@ def _densify_object_ndarray(X_col):
 
     X_col = X_col.copy()
     places = np.fromiter(
-        (
-            val_type is float or issubclass(val_type, np.floating)
-            for val_type in map(type, X_col)
-        ),
+        map(isinstance, X_col, repeat(float)), np.bool_, count=len(X_col)
+    )
+    places |= np.fromiter(
+        map(issubclass, map(type, X_col), repeat(np.floating)),
         np.bool_,
         count=len(X_col),
     )
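
This hunk splits one generator-based check into two np.fromiter passes whose iterators run entirely in C: map(isinstance, ...) for exact Python floats, then map(issubclass, map(type, ...), ...) for numpy floating scalars, OR-ed together. A minimal standalone sketch of the same pattern (the object array contents are invented for illustration):

    import numpy as np
    from itertools import repeat

    X_col = np.array([1.5, "a", 2.5, None, np.float32(3.0)], dtype=object)

    # Pass 1: exact Python floats (np.float64 also subclasses float).
    places = np.fromiter(
        map(isinstance, X_col, repeat(float)), np.bool_, count=len(X_col)
    )
    # Pass 2: numpy floating scalars such as np.float32.
    places |= np.fromiter(
        map(issubclass, map(type, X_col), repeat(np.floating)),
        np.bool_,
        count=len(X_col),
    )
    print(places)  # [ True False  True False  True]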
@@ -508,7 +510,7 @@ def _process_column_initial(X_col, nonmissings, processing, min_unique_continuous):
 
     categories = dict(zip(categories, count(1)))
     mapping = np.fromiter(
-        (categories[val] for val in uniques), np.int64, count=len(uniques)
+        map(categories.__getitem__, uniques), np.int64, count=len(uniques)
     )
     encoded = mapping[indexes]
 
@@ -541,7 +543,7 @@ def _encode_categorical_existing(X_col, nonmissings, categories):
     uniques = uniques.astype(np.str_, copy=False)
 
     mapping = np.fromiter(
-        (categories.get(val, -1) for val in uniques), np.int64, count=len(uniques)
+        map(categories.get, uniques, repeat(-1)), np.int64, count=len(uniques)
     )
     encoded = mapping[indexes]
 
@@ -642,7 +644,7 @@ def _encode_pandas_categorical_existing(X_col, pd_categories, categories):
     # if we have just 1 sample, we can avoid making the mapping below
 
     mapping = np.fromiter(
-        (categories.get(val, -1) for val in pd_categories),
+        map(categories.get, pd_categories, repeat(-1)),
         np.int64,
         count=len(pd_categories),
     )
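
The three dict-lookup hunks above share one pattern: replace a per-element generator with a bound method handed to map() — dict.__getitem__ when every key must exist, dict.get plus repeat(default) when unknown keys should fall back to a sentinel. A small sketch with made-up categories:

    import numpy as np
    from itertools import repeat

    categories = {"low": 1, "mid": 2, "high": 3}
    uniques = np.array(["mid", "unknown", "high"], dtype=np.str_)

    # Unknown categories map to -1; dict.get is called from C for each element.
    mapping = np.fromiter(
        map(categories.get, uniques, repeat(-1)), np.int64, count=len(uniques)
    )
    print(mapping)  # [ 2 -1  3]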
@@ -1048,10 +1050,19 @@ def unify_columns(
     X,
     requests,
     feature_names_in,
-    feature_types=None,
-    min_unique_continuous=0,
-    go_fast=False,
+    feature_types,
+    min_unique_continuous,
+    go_fast,
 ):
+    # preclean_X is always called on X prior to calling this function
+
+    # unify_feature_names is always called on feature_names_in prior to calling this function
+
+    # feature_names_in is guaranteed not to contain duplicate names because unify_feature_names checks this.
+
+    # feature_types can ONLY be None when called from unify_data OR when called from EBMPreprocessor.fit(...)
+    # on all subsequent calls we pass a cleaned up feature_types from the results of the first call to EBMPreprocessor.fit(...)
+
     # If the requests parameter contains a categories dictionary, then that same categories object is guaranteed to
     # be yielded back to the caller. This guarantee can be used to rapidly identify which request is being
     # yielded by using the id(categories) along with the feature_idx
@@ -1075,17 +1086,17 @@ def unify_columns(
         # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
         # to check for legality on the dimensions of X
         keep_cols = np.fromiter(
-            (val != "ignore" for val in feature_types),
+            map(operator.ne, repeat("ignore"), feature_types),
             np.bool_,
             count=len(feature_types),
         )
         if n_cols != keep_cols.sum():
             # called under: predict
-            msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
+            msg = f"The model has {len(keep_cols)} features, but X has {n_cols} columns"
             _log.error(msg)
             raise ValueError(msg)
-        col_map = np.empty(len(feature_types), np.int64)
-        np.place(col_map, keep_cols, np.arange(len(feature_types), dtype=np.int64))
+        col_map = np.empty(len(keep_cols), np.int64)
+        np.place(col_map, keep_cols, np.arange(len(keep_cols), dtype=np.int64))
 
         # TODO: I'm not sure that simply checking X.flags.c_contiguous handles all the situations that we'd want
         # to know about some data. If we received a transposed array that was C ordered how would that look?
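
Besides the map(operator.ne, ...) swap, this hunk switches the error message and the col_map sizing from feature_types to the equivalent keep_cols (same length, but already materialized). For reference, a sketch of how keep_cols and col_map relate, with an invented feature_types list:

    import operator
    import numpy as np
    from itertools import repeat

    feature_types = ["continuous", "ignore", "nominal", "continuous"]

    # True wherever the model feature consumes a column of X.
    keep_cols = np.fromiter(
        map(operator.ne, repeat("ignore"), feature_types),
        np.bool_,
        count=len(feature_types),
    )

    # col_map[model_feature_idx] -> column index in the narrower X;
    # entries for "ignore" features are left uninitialized (np.empty).
    col_map = np.empty(len(keep_cols), np.int64)
    np.place(col_map, keep_cols, np.arange(len(keep_cols), dtype=np.int64))
    print(col_map[keep_cols])  # [0 1 2]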
@@ -1126,44 +1137,62 @@ def unify_columns(
         # Pandas also allows duplicate labels by default:
         # https://pandas.pydata.org/docs/user_guide/duplicates.html#duplicates-disallow
         # we can tolerate duplicate labels here, provided none of them are being used by our model
-        for name, n_count in Counter(map(str, names_original)).items():
-            if n_count != 1:
-                del names_dict[name]
+        counts = Counter(map(str, names_original))
+        sum(
+            map(
+                operator.truth,
+                map(
+                    operator.delitem,
+                    repeat(names_dict),
+                    compress(
+                        counts.keys(), map(operator.ne, repeat(1), counts.values())
+                    ),
+                ),
+            )
+        )
 
         if feature_types is None:
-            for feature_name_in in feature_names_in:
-                if feature_name_in not in names_dict:
-                    names_dict = None
-                    break
-        else:
-            for feature_name_in, feature_type in zip(feature_names_in, feature_types):
-                if feature_type != "ignore" and feature_name_in not in names_dict:
-                    names_dict = None
-                    break
+            if not all(map(operator.contains, repeat(names_dict), feature_names_in)):
+                if n_cols != len(feature_names_in):
+                    msg = f"The model has {len(feature_names_in)} feature names, but X has {n_cols} columns."
+                    _log.error(msg)
+                    raise ValueError(msg)
 
-        if names_dict is None:
-            if n_cols == len(feature_names_in):
                 names_dict = dict(zip(feature_names_in, count()))
-            else:
-                # during fit time unify_feature_names would only allow us to get here if this was legal, which requires
-                # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
-                # to check for legality on the dimensions of X
-                names_dict = dict(
-                    zip(
-                        (
-                            feature_name_in
-                            for feature_name_in, feature_type in zip(
-                                feature_names_in, feature_types
-                            )
-                            if feature_type != "ignore"
-                        ),
-                        count(),
+                warn(
+                    "Pandas dataframe X does not contain all feature names. Falling back to positional columns."
+                )
+        else:
+            if not all(
+                map(
+                    operator.contains,
+                    repeat(names_dict),
+                    compress(
+                        feature_names_in,
+                        map(operator.ne, repeat("ignore"), feature_types),
+                    ),
+                )
+            ):
+                if n_cols == len(feature_names_in):
+                    names_dict = dict(zip(feature_names_in, count()))
+                else:
+                    names_dict = dict(
+                        zip(
+                            compress(
+                                feature_names_in,
+                                map(operator.ne, repeat("ignore"), feature_types),
+                            ),
+                            count(),
+                        )
                     )
+                    if n_cols != len(names_dict):
+                        msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
+                        _log.error(msg)
+                        raise ValueError(msg)
+
+                warn(
+                    "Pandas dataframe X does not contain all feature names. Falling back to positional columns."
                 )
-        if n_cols != len(names_dict):
-            msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
-            _log.error(msg)
-            raise ValueError(msg)
 
         # Pandas also sometimes uses a dense 2D ndarray instead of per column 1D ndarrays, which would benefit from
         # transposing, but accessing the BlockManager is currently unsupported behavior. They are also planning to eliminate
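
The strangest construct here is sum(map(operator.truth, map(operator.delitem, ...))): map() is lazy, so something must drain it, and operator.delitem returns None, which sum() alone would choke on. operator.truth converts each None to False, letting sum() drive the whole deletion loop in C. A standalone sketch with invented names (operator.contains, used in the same hunk, is shown too):

    import operator
    from collections import Counter
    from itertools import compress, repeat

    names_dict = {"a": 0, "b": 1, "c": 2}
    names_original = ["a", "b", "b", "c"]  # "b" is a duplicate label

    counts = Counter(map(str, names_original))
    # Delete every duplicated name; compress() picks the keys whose count != 1.
    sum(
        map(
            operator.truth,
            map(
                operator.delitem,
                repeat(names_dict),
                compress(counts.keys(), map(operator.ne, repeat(1), counts.values())),
            ),
        )
    )
    print(names_dict)  # {'a': 0, 'c': 2}

    # operator.contains(d, k) is "k in d", so this checks all names survived.
    print(all(map(operator.contains, repeat(names_dict), ["a", "c"])))  # True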
@@ -1194,7 +1223,7 @@ def unify_columns(
         # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
         # to check for legality on the dimensions of X
         keep_cols = np.fromiter(
-            (val != "ignore" for val in feature_types),
+            map(operator.ne, repeat("ignore"), feature_types),
             np.bool_,
             count=len(feature_types),
         )
@@ -1222,7 +1251,7 @@ def unify_columns(
         # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
         # to check for legality on the dimensions of X
         keep_cols = np.fromiter(
-            (val != "ignore" for val in feature_types),
+            map(operator.ne, repeat("ignore"), feature_types),
            np.bool_,
            count=len(feature_types),
         )
@@ -1263,7 +1292,7 @@ def unify_columns(
 def _determine_min_cols(feature_names=None, feature_types=None):
     if feature_types is None:
         return None if feature_names is None else len(feature_names)
-    n_ignored = sum(1 for feature_type in feature_types if feature_type == "ignore")
+    n_ignored = sum(map(operator.eq, repeat("ignore"), feature_types))
     if (
         feature_names is None
         or len(feature_names) == len(feature_types)
