 # Distributed under the MIT software license
 
 import logging
+from warnings import warn
 from collections import Counter
-from itertools import count
+from itertools import count, repeat, compress
+import operator
 
 import numpy as np
 from numpy import ma
@@ -410,10 +412,10 @@ def _densify_object_ndarray(X_col):
 
     X_col = X_col.copy()
     places = np.fromiter(
-        (
-            val_type is float or issubclass(val_type, np.floating)
-            for val_type in map(type, X_col)
-        ),
+        map(isinstance, X_col, repeat(float)), np.bool_, count=len(X_col)
+    )
+    places |= np.fromiter(
+        map(issubclass, map(type, X_col), repeat(np.floating)),
         np.bool_,
         count=len(X_col),
     )
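The rewrite trades the generator expression for `map`, which `np.fromiter` can drain without resuming a Python frame per element; the `or` of the two type tests becomes two boolean passes combined with `|=`. A minimal standalone sketch of the equivalence (the toy `X_col` below is illustrative, not from the repo):

```python
import numpy as np
from itertools import repeat

X_col = np.array([1.5, "a", np.float32(2.0), 3], dtype=object)  # toy object column

# pass 1: Python floats (np.float64 also subclasses float)
places = np.fromiter(
    map(isinstance, X_col, repeat(float)), np.bool_, count=len(X_col)
)
# pass 2: remaining numpy floating scalars such as np.float32
places |= np.fromiter(
    map(issubclass, map(type, X_col), repeat(np.floating)), np.bool_, count=len(X_col)
)
print(places)  # [ True False  True False]
```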
@@ -508,7 +510,7 @@ def _process_column_initial(X_col, nonmissings, processing, min_unique_continuous):
 
         categories = dict(zip(categories, count(1)))
         mapping = np.fromiter(
-            (categories[val] for val in uniques), np.int64, count=len(uniques)
+            map(categories.__getitem__, uniques), np.int64, count=len(uniques)
         )
         encoded = mapping[indexes]
 
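Same trick for the category lookup: binding `categories.__getitem__` once lets `map` perform the per-element dict access in C rather than through a generator frame. A small sketch with toy categories:

```python
import numpy as np
from itertools import count

uniques = np.array(["b", "a", "b", "c"])           # toy unique values
categories = dict(zip(["a", "b", "c"], count(1)))  # {'a': 1, 'b': 2, 'c': 3}

mapping = np.fromiter(
    map(categories.__getitem__, uniques), np.int64, count=len(uniques)
)
print(mapping)  # [2 1 2 3]
```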
@@ -541,7 +543,7 @@ def _encode_categorical_existing(X_col, nonmissings, categories):
         uniques = uniques.astype(np.str_, copy=False)
 
         mapping = np.fromiter(
-            (categories.get(val, -1) for val in uniques), np.int64, count=len(uniques)
+            map(categories.get, uniques, repeat(-1)), np.int64, count=len(uniques)
         )
         encoded = mapping[indexes]
 
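`map` zips multiple iterables into positional arguments, so `repeat(-1)` supplies `dict.get`'s default on every call and categories unseen at fit time encode as -1 with no per-element Python code. A standalone sketch with a toy mapping:

```python
import numpy as np
from itertools import repeat

categories = {"a": 1, "b": 2}  # toy mapping
uniques = ["a", "x", "b"]      # 'x' was never seen at fit time

mapping = np.fromiter(
    map(categories.get, uniques, repeat(-1)), np.int64, count=len(uniques)
)
print(mapping)  # [ 1 -1  2]
```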
@@ -642,7 +644,7 @@ def _encode_pandas_categorical_existing(X_col, pd_categories, categories):
     # if we have just 1 sample, we can avoid making the mapping below
 
     mapping = np.fromiter(
-        (categories.get(val, -1) for val in pd_categories),
+        map(categories.get, pd_categories, repeat(-1)),
         np.int64,
         count=len(pd_categories),
     )
@@ -1048,10 +1050,19 @@ def unify_columns(
     X,
     requests,
     feature_names_in,
-    feature_types=None,
-    min_unique_continuous=0,
-    go_fast=False,
+    feature_types,
+    min_unique_continuous,
+    go_fast,
 ):
+    # preclean_X is always called on X prior to calling this function
+
+    # unify_feature_names is always called on feature_names_in prior to calling this function
+
+    # feature_names_in is guaranteed not to contain duplicate names because unify_feature_names checks this.
+
+    # feature_types can ONLY be None when called from unify_data OR when called from EBMPreprocessor.fit(...)
+    # on all subsequent calls we pass a cleaned up feature_types from the results of the first call to EBMPreprocessor.fit(...)
+
     # If the requests parameter contains a categories dictionary, then that same categories object is guaranteed to
     # be yielded back to the caller. This guarantee can be used to rapidly identify which request is being
     # yielded by using the id(categories) along with the feature_idx
@@ -1075,17 +1086,17 @@ def unify_columns(
             # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
             # to check for legality on the dimensions of X
             keep_cols = np.fromiter(
-                (val != "ignore" for val in feature_types),
+                map(operator.ne, repeat("ignore"), feature_types),
                 np.bool_,
                 count=len(feature_types),
             )
             if n_cols != keep_cols.sum():
                 # called under: predict
-                msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
+                msg = f"The model has {len(keep_cols)} features, but X has {n_cols} columns"
                 _log.error(msg)
                 raise ValueError(msg)
-            col_map = np.empty(len(feature_types), np.int64)
-            np.place(col_map, keep_cols, np.arange(len(feature_types), dtype=np.int64))
+            col_map = np.empty(len(keep_cols), np.int64)
+            np.place(col_map, keep_cols, np.arange(len(keep_cols), dtype=np.int64))
 
     # TODO: I'm not sure that simply checking X.flags.c_contiguous handles all the situations that we'd want
     # to know about some data. If we received a transposed array that was C ordered how would that look?
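For readers who haven't met `np.place`: it writes values into the array only where the mask is True, so `col_map` ends up translating each kept feature index into its column position within an X that lacks the ignored columns. A standalone sketch with hypothetical feature_types:

```python
import operator
import numpy as np
from itertools import repeat

feature_types = ["continuous", "ignore", "nominal", "continuous"]  # hypothetical

keep_cols = np.fromiter(
    map(operator.ne, repeat("ignore"), feature_types), np.bool_, count=len(feature_types)
)
col_map = np.empty(len(keep_cols), np.int64)  # ignored slots stay uninitialized
np.place(col_map, keep_cols, np.arange(len(keep_cols), dtype=np.int64))

print(keep_cols)           # [ True False  True  True]
print(col_map[keep_cols])  # [0 1 2] -- X column index for each kept feature
```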
@@ -1126,44 +1137,62 @@ def unify_columns(
         # Pandas also allows duplicate labels by default:
         # https://pandas.pydata.org/docs/user_guide/duplicates.html#duplicates-disallow
         # we can tolerate duplicate labels here, provided none of them are being used by our model
-        for name, n_count in Counter(map(str, names_original)).items():
-            if n_count != 1:
-                del names_dict[name]
+        counts = Counter(map(str, names_original))
+        sum(
+            map(
+                operator.truth,
+                map(
+                    operator.delitem,
+                    repeat(names_dict),
+                    compress(
+                        counts.keys(), map(operator.ne, repeat(1), counts.values())
+                    ),
+                ),
+            )
+        )
 
         if feature_types is None:
-            for feature_name_in in feature_names_in:
-                if feature_name_in not in names_dict:
-                    names_dict = None
-                    break
-        else:
-            for feature_name_in, feature_type in zip(feature_names_in, feature_types):
-                if feature_type != "ignore" and feature_name_in not in names_dict:
-                    names_dict = None
-                    break
+            if not all(map(operator.contains, repeat(names_dict), feature_names_in)):
+                if n_cols != len(feature_names_in):
+                    msg = f"The model has {len(feature_names_in)} feature names, but X has {n_cols} columns."
+                    _log.error(msg)
+                    raise ValueError(msg)
 
-        if names_dict is None:
-            if n_cols == len(feature_names_in):
                 names_dict = dict(zip(feature_names_in, count()))
-            else:
-                # during fit time unify_feature_names would only allow us to get here if this was legal, which requires
-                # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
-                # to check for legality on the dimensions of X
-                names_dict = dict(
-                    zip(
-                        (
-                            feature_name_in
-                            for feature_name_in, feature_type in zip(
-                                feature_names_in, feature_types
-                            )
-                            if feature_type != "ignore"
-                        ),
-                        count(),
+                warn(
+                    "Pandas dataframe X does not contain all feature names. Falling back to positional columns."
+                )
+        else:
+            if not all(
+                map(
+                    operator.contains,
+                    repeat(names_dict),
+                    compress(
+                        feature_names_in,
+                        map(operator.ne, repeat("ignore"), feature_types),
+                    ),
+                )
+            ):
+                if n_cols == len(feature_names_in):
+                    names_dict = dict(zip(feature_names_in, count()))
+                else:
+                    names_dict = dict(
+                        zip(
+                            compress(
+                                feature_names_in,
+                                map(operator.ne, repeat("ignore"), feature_types),
+                            ),
+                            count(),
+                        )
                     )
+                    if n_cols != len(names_dict):
+                        msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
+                        _log.error(msg)
+                        raise ValueError(msg)
+
+                warn(
+                    "Pandas dataframe X does not contain all feature names. Falling back to positional columns."
                 )
-        if n_cols != len(names_dict):
-            msg = f"The model has {len(feature_types)} features, but X has {n_cols} columns"
-            _log.error(msg)
-            raise ValueError(msg)
 
     # Pandas also sometimes uses a dense 2D ndarray instead of per column 1D ndarrays, which would benefit from
     # transposing, but accessing the BlockManager is currently unsupported behavior. They are also planning to eliminate
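The duplicate-label purge is the most opaque rewrite in this hunk, so here is the same trick in isolation: `compress` picks out the names whose count differs from 1, `operator.delitem` deletes each of them from `names_dict`, and the outer `sum(map(operator.truth, ...))` exists purely to drain the lazy iterator (`delitem` returns None, so the sum is always 0). A standalone sketch with toy names:

```python
import operator
from collections import Counter
from itertools import compress, repeat

names_original = ["age", "bmi", "age", "s1"]  # toy column labels, 'age' duplicated
names_dict = {"age": 0, "bmi": 1, "s1": 3}    # toy name -> column index map

counts = Counter(map(str, names_original))
# delete every name whose count != 1; sum() only forces the lazy map to run
sum(
    map(
        operator.truth,
        map(
            operator.delitem,
            repeat(names_dict),
            compress(counts.keys(), map(operator.ne, repeat(1), counts.values())),
        ),
    )
)
print(names_dict)  # {'bmi': 1, 's1': 3}
```

Whether this drain-with-sum idiom actually beats an explicit loop is a micro-optimization judgment call; it avoids per-element bytecode dispatch but hides a side effect inside an expression.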
@@ -1194,7 +1223,7 @@ def unify_columns(
             # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
             # to check for legality on the dimensions of X
             keep_cols = np.fromiter(
-                (val != "ignore" for val in feature_types),
+                map(operator.ne, repeat("ignore"), feature_types),
                 np.bool_,
                 count=len(feature_types),
             )
@@ -1222,7 +1251,7 @@ def unify_columns(
             # feature_types to not be None. During predict time feature_types_in cannot be None, but we need
             # to check for legality on the dimensions of X
             keep_cols = np.fromiter(
-                (val != "ignore" for val in feature_types),
+                map(operator.ne, repeat("ignore"), feature_types),
                 np.bool_,
                 count=len(feature_types),
             )
@@ -1263,7 +1292,7 @@ def unify_columns(
 def _determine_min_cols(feature_names=None, feature_types=None):
     if feature_types is None:
         return None if feature_names is None else len(feature_names)
-    n_ignored = sum(1 for feature_type in feature_types if feature_type == "ignore")
+    n_ignored = sum(map(operator.eq, repeat("ignore"), feature_types))
     if (
         feature_names is None
         or len(feature_names) == len(feature_types)
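Since `feature_types` holds plain strings, `operator.eq` returns genuine `bool` objects and Python sums `True` as 1, so the `map` form counts the "ignore" entries exactly like the old conditional generator. A tiny sketch with hypothetical inputs:

```python
import operator
from itertools import repeat

feature_types = ["continuous", "ignore", "nominal", "ignore"]  # hypothetical
n_ignored = sum(map(operator.eq, repeat("ignore"), feature_types))
print(n_ignored)  # 2
```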