@@ -1102,7 +1102,7 @@ def unify_columns(
1102
1102
# X = np.asfortranarray(X)
1103
1103
1104
1104
n_cols = X .shape [1 ]
1105
- if n_cols == len (feature_names_in ):
1105
+ if len (feature_names_in ) == n_cols :
1106
1106
if feature_types is None :
1107
1107
for feature_idx , categories in requests :
1108
1108
yield _process_numpy_column (
@@ -1125,7 +1125,7 @@ def unify_columns(
1125
1125
np .bool_ ,
1126
1126
count = len (feature_types ),
1127
1127
)
1128
- if n_cols != keep_cols .sum ():
1128
+ if keep_cols .sum () != n_cols :
1129
1129
# called under: predict
1130
1130
msg = f"The model has { len (keep_cols )} features, but X has { n_cols } columns"
1131
1131
_log .error (msg )
@@ -1154,11 +1154,14 @@ def unify_columns(
1154
1154
mapping = dict (zip (map (str , cols ), cols ))
1155
1155
n_cols = len (cols )
1156
1156
if len (mapping ) != n_cols :
1157
- # this can happen if for instance one column is "0" and annother is int(0)
1158
- # Pandas also allows duplicate labels by default:
1159
- # https://pandas.pydata.org/docs/user_guide/duplicates.html#duplicates-disallow
1160
- # we can tollerate duplicate labels here, provided none of them are being used by our model
1157
+ warn (
1158
+ "Columns with duplicate names detected. This can happen for example if there are columns '0' and 0."
1159
+ )
1160
+
1161
+ # We can handle duplicate names if they are not being used by the model.
1161
1162
counts = Counter (map (str , cols ))
1163
+
1164
+ # sum() is used only to drain the map iterator in C instead of a Python-level loop. Its result is discarded.
1162
1165
sum (
1163
1166
map (
1164
1167
operator .truth ,
@@ -1175,6 +1178,10 @@ def unify_columns(
1175
1178
if feature_types is None :
1176
1179
if all (map (operator .contains , repeat (mapping ), feature_names_in )):
1177
1180
# we can index by name, which is a lot faster in pandas
1181
+
1182
+ if len (feature_names_in ) != n_cols :
1183
+ warn ("Extra columns present in X that are not used by the model." )
1184
+
1178
1185
for feature_idx , categories in requests :
1179
1186
yield _process_pandas_column (
1180
1187
X [mapping [feature_names_in [feature_idx ]]],
@@ -1183,7 +1190,7 @@ def unify_columns(
1183
1190
min_unique_continuous ,
1184
1191
)
1185
1192
else :
1186
- if n_cols != len (feature_names_in ):
1193
+ if len (feature_names_in ) != n_cols :
1187
1194
msg = f"The model has { len (feature_names_in )} feature names, but X has { n_cols } columns."
1188
1195
_log .error (msg )
1189
1196
raise ValueError (msg )
@@ -1209,6 +1216,10 @@ def unify_columns(
1209
1216
)
1210
1217
):
1211
1218
# we can index by name, which is a lot faster in pandas
1219
+
1220
+ if len (feature_names_in ) < n_cols :
1221
+ warn ("Extra columns present in X that are not used by the model." )
1222
+
1212
1223
for feature_idx , categories in requests :
1213
1224
yield _process_pandas_column (
1214
1225
X [mapping [feature_names_in [feature_idx ]]],
@@ -1218,7 +1229,7 @@ def unify_columns(
1218
1229
)
1219
1230
else :
1220
1231
X = X .iloc
1221
- if n_cols == len (feature_names_in ):
1232
+ if len (feature_names_in ) == n_cols :
1222
1233
warn (
1223
1234
"Pandas dataframe X does not contain all feature names. Falling back to positional columns."
1224
1235
)
@@ -1235,9 +1246,9 @@ def unify_columns(
1235
1246
np .bool_ ,
1236
1247
count = len (feature_types ),
1237
1248
)
1238
- if n_cols != keep_cols .sum ():
1249
+ if keep_cols .sum () != n_cols :
1239
1250
# called under: predict
1240
- msg = f"The model has { len (keep_cols )} features, but X has { n_cols } columns"
1251
+ msg = f"The model has { len (keep_cols )} features, but X has { n_cols } columns. "
1241
1252
_log .error (msg )
1242
1253
raise ValueError (msg )
1243
1254
col_map = np .empty (len (keep_cols ), np .int64 )
@@ -1266,7 +1277,7 @@ def unify_columns(
1266
1277
1267
1278
n_cols = X .shape [1 ]
1268
1279
1269
- if n_cols == len (feature_names_in ):
1280
+ if len (feature_names_in ) == n_cols :
1270
1281
if feature_types is None :
1271
1282
for feature_idx , categories in requests :
1272
1283
yield _process_sparse_column (
@@ -1289,8 +1300,8 @@ def unify_columns(
1289
1300
np .bool_ ,
1290
1301
count = len (feature_types ),
1291
1302
)
1292
- if n_cols != keep_cols .sum ():
1293
- msg = f"The model has { len (feature_types )} features, but X has { n_cols } columns"
1303
+ if keep_cols .sum () != n_cols :
1304
+ msg = f"The model has { len (feature_types )} features, but X has { n_cols } columns. "
1294
1305
_log .error (msg )
1295
1306
raise ValueError (msg )
1296
1307
col_map = np .empty (len (feature_types ), np .int64 )
@@ -1315,7 +1326,7 @@ def unify_columns(
1315
1326
elif safe_isinstance (X , "scipy.sparse.spmatrix" ):
1316
1327
n_cols = X .shape [1 ]
1317
1328
1318
- if n_cols == len (feature_names_in ):
1329
+ if len (feature_names_in ) == n_cols :
1319
1330
if feature_types is None :
1320
1331
for feature_idx , categories in requests :
1321
1332
yield _process_sparse_column (
@@ -1338,8 +1349,8 @@ def unify_columns(
1338
1349
np .bool_ ,
1339
1350
count = len (feature_types ),
1340
1351
)
1341
- if n_cols != keep_cols .sum ():
1342
- msg = f"The model has { len (feature_types )} features, but X has { n_cols } columns"
1352
+ if keep_cols .sum () != n_cols :
1353
+ msg = f"The model has { len (feature_types )} features, but X has { n_cols } columns. "
1343
1354
_log .error (msg )
1344
1355
raise ValueError (msg )
1345
1356
col_map = np .empty (len (feature_types ), np .int64 )
0 commit comments