-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathtest_voting_learners_api_3.py
252 lines (226 loc) · 10.6 KB
/
test_voting_learners_api_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# License: BSD 3 clause
"""
Prediction tests for voting learners.
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import unittest
from itertools import product
from pathlib import Path
import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal
from sklearn.ensemble import VotingClassifier, VotingRegressor
from skll.learner.voting import VotingLearner
from skll.utils.testing import make_california_housing_data, make_digits_data, other_dir, output_dir
# define some constants needed for testing
TRAIN_FS_DIGITS, TEST_FS_DIGITS = make_digits_data(use_digit_names=True)
FS_DIGITS, _ = make_digits_data(test_size=0, use_digit_names=True)
TRAIN_FS_HOUSING, TEST_FS_HOUSING = make_california_housing_data(num_examples=2000)
FS_HOUSING, _ = make_california_housing_data(num_examples=2000, test_size=0)
FS_HOUSING.ids = np.arange(2000)
CUSTOM_LEARNER_PATH = other_dir / "custom_logistic_wrapper.py"
class TestVotingLearnersAPIThree(unittest.TestCase):
"""Test class for third set of voting learner API tests."""
@classmethod
def setUpClass(cls):
for dir_path in [other_dir, output_dir]:
dir_path.mkdir(exist_ok=True)
@classmethod
def tearDown(cls):
for output_file_path in output_dir.glob("test_predict_voting*"):
output_file_path.unlink()
def check_predict(
self,
learner_type,
with_grid_search,
with_soft_voting,
with_class_labels,
with_file_output,
with_individual_predictions,
):
# to test the predict() method, we instantiate the SKLL voting learner,
# train it on either the digits (classification) or housing (regression)
# data set, and generate predictions on the corresponding test set; then
# we do the same in scikit-learn space and compare the objective and
# compare the SKLL and scikit-learn predictions
# set the prediction prefix in case we need to write out the predictions
prediction_prefix = (
output_dir / f"test_predict_voting_"
f"{learner_type}_"
f"{with_grid_search}_"
f"{with_class_labels}"
if with_file_output
else None
)
prediction_prefix = str(prediction_prefix) if prediction_prefix else None
# set various parameters based on whether we are using
# a classifier or a regressor
if learner_type == "classifier":
learner_names = ["LogisticRegression", "SVC", "MultinomialNB"]
voting_type = "soft" if with_soft_voting else "hard"
train_fs, test_fs = TRAIN_FS_DIGITS, TEST_FS_DIGITS
objective = "accuracy"
else:
learner_names = ["LinearRegression", "SVR", "Ridge"]
voting_type = "hard"
train_fs, test_fs = TRAIN_FS_HOUSING, TEST_FS_HOUSING
objective = "pearson"
# instantiate and train the SKLL voting learner on the digits dataset
skll_vl = VotingLearner(
learner_names, feature_scaling="none", min_feature_count=0, voting=voting_type
)
skll_vl.train(
train_fs, grid_objective=objective, grid_search=with_grid_search, grid_search_folds=3
)
# get the overall and individual predictions from SKLL
(skll_predictions, skll_individual_dict) = skll_vl.predict(
test_fs,
class_labels=with_class_labels,
prediction_prefix=prediction_prefix,
individual_predictions=with_individual_predictions,
)
# get the underlying scikit-learn estimators from SKLL
named_estimators = skll_vl.model.named_estimators_
clf1 = named_estimators[learner_names[0]]["estimator"]
clf2 = named_estimators[learner_names[1]]["estimator"]
clf3 = named_estimators[learner_names[2]]["estimator"]
# instantiate and train the scikit-learn voting classifer
sklearn_model_type = VotingClassifier if learner_type == "classifier" else VotingRegressor
sklearn_model_kwargs = {
"estimators": [
(learner_names[0], clf1),
(learner_names[1], clf2),
(learner_names[2], clf3),
]
}
if learner_type == "classifier":
sklearn_model_kwargs["voting"] = voting_type
sklearn_vl = sklearn_model_type(**sklearn_model_kwargs)
sklearn_vl.fit(train_fs.features, train_fs.labels)
# get the overall predictions from scikit-learn
sklearn_predictions = sklearn_vl.predict(test_fs.features)
# if we are doing classification and not asked to output class
# labels get either the scikit-learn probabilities or the class
# indices depending on the voting type (soft vs. hard)
if learner_type == "classifier" and not with_class_labels:
if voting_type == "soft":
sklearn_predictions = sklearn_vl.predict_proba(test_fs.features)
else:
sklearn_predictions = np.array(
[skll_vl.label_dict[class_] for class_ in sklearn_predictions]
)
# get the individual scikit-learn predictions, if necessary
sklearn_individual_dict = {}
if with_individual_predictions:
for name, estimator in sklearn_vl.named_estimators_.items():
estimator_predictions = estimator.predict(test_fs.features)
# scikit-learn individual predictions are indices not class labels
# so we need to convert them to labels if required
if with_class_labels:
estimator_predictions = [
sklearn_vl.classes_[index] for index in estimator_predictions
]
# if no class labels, then get the probabilities with soft
# voting; note that since the individual predictions from
# scikit-learn are already indices, we do not need to do
# anything for the hard voting case
else:
if voting_type == "soft":
estimator_predictions = estimator.predict_proba(test_fs.features)
sklearn_individual_dict[name] = estimator_predictions
# now we start the actual tests
# if individual predictions were not asked for, then SKLL
# should have returned None for those
if not with_individual_predictions:
self.assertTrue(skll_individual_dict is None)
# if we are doing soft voting over classifiers and not returning
# the class labels, then we need to compare SKLL and scikit-learn
# probabilities; for the digits dataset, the numbers only match
# exactly for 2 decimal places because of the way that SVC computes
# probabilities; we also check that the index of the highest probability
# is the same for both
if learner_type == "classifier":
if voting_type == "soft" and not with_class_labels:
assert_array_almost_equal(skll_predictions, sklearn_predictions, decimal=2)
skll_max_prob_indices = np.argmax(skll_predictions, axis=1)
sklearn_max_prob_indices = np.argmax(sklearn_predictions, axis=1)
assert_array_equal(skll_max_prob_indices, sklearn_max_prob_indices)
# check individual probabilities but only for non-SVC estimators
if with_individual_predictions:
assert_array_almost_equal(
skll_individual_dict["LogisticRegression"],
sklearn_individual_dict["LogisticRegression"],
decimal=2,
)
assert_array_almost_equal(
skll_individual_dict["MultinomialNB"],
sklearn_individual_dict["MultinomialNB"],
decimal=2,
)
# in all other cases, we expect the actual class labels or class indices
# to be identical between SKLL and scikit-learn
else:
assert_array_equal(skll_predictions, sklearn_predictions)
# for regression, we expect the overall predictions to match exactly
# but individual predictions only up to 2 decimal places
else:
assert_array_equal(skll_predictions, sklearn_predictions)
if with_individual_predictions:
assert_array_almost_equal(
skll_individual_dict[learner_names[0]],
sklearn_individual_dict[learner_names[0]],
decimal=2,
)
assert_array_almost_equal(
skll_individual_dict[learner_names[1]],
sklearn_individual_dict[learner_names[1]],
decimal=2,
)
assert_array_almost_equal(
skll_individual_dict[learner_names[2]],
sklearn_individual_dict[learner_names[2]],
decimal=2,
)
# if we were asked to write output to disk, then check that
# the files actually exist
if with_file_output:
self.assertTrue(Path(f"{prediction_prefix}_predictions.tsv").exists())
if with_individual_predictions:
self.assertTrue(
Path(f"{prediction_prefix}_{learner_names[0]}_predictions.tsv").exists()
)
self.assertTrue(
Path(f"{prediction_prefix}_{learner_names[1]}_predictions.tsv").exists()
)
self.assertTrue(
Path(f"{prediction_prefix}_{learner_names[2]}_predictions.tsv").exists()
)
def test_predict(self):
for (
learner_type,
with_grid_search,
with_soft_voting,
with_class_labels,
with_file_output,
with_individual_predictions,
) in product(
["classifier", "regressor"],
[False, True],
[False, True],
[False, True],
[False, True],
[False, True],
):
# regressors do not support soft voting or class labels
if learner_type == "regressor" and (with_soft_voting or with_class_labels):
continue
else:
yield (
self.check_predict,
learner_type,
with_grid_search,
with_soft_voting,
with_class_labels,
with_file_output,
with_individual_predictions,
)