# License: BSD 3 clause
"""
Initialization, training, saving, and loading tests for voting learners.
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import unittest
from itertools import product
from pathlib import Path
import numpy as np
from numpy.testing import assert_raises_regex
from sklearn.ensemble import RandomForestRegressor, VotingClassifier, VotingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, SVR
from skll.learner.voting import VotingLearner
from skll.utils.testing import (
    make_california_housing_data,
    make_digits_data,
    other_dir,
    output_dir,
)

# define some constants needed for testing
TRAIN_FS_DIGITS, TEST_FS_DIGITS = make_digits_data(use_digit_names=True)
FS_DIGITS, _ = make_digits_data(test_size=0, use_digit_names=True)
TRAIN_FS_HOUSING, TEST_FS_HOUSING = make_california_housing_data(num_examples=2000)
FS_HOUSING, _ = make_california_housing_data(num_examples=2000, test_size=0)
FS_HOUSING.ids = np.arange(2000)
CUSTOM_LEARNER_PATH = other_dir / "custom_logistic_wrapper.py"


class TestVotingLearnersAPIOne(unittest.TestCase):
    """Test class for the first set of voting learner API tests."""

    @classmethod
    def setUpClass(cls):
        """Create the directories needed for testing."""
        for dir_path in [other_dir, output_dir]:
            dir_path.mkdir(exist_ok=True)

    @classmethod
    def tearDownClass(cls):
        """Clean up after tests."""
        for model_path in [
            Path("test_current_directory.model"),
            output_dir / "test_other_directory.model",
        ]:
            if model_path.exists():
                model_path.unlink()

    def check_initialize(
        self,
        learner_type,
        voting_type,
        feature_scaling,
        pos_label,
        min_feature_count,
        model_kwargs_list,
        sampler_list,
    ):
        """Run checks for voting learner initialization."""
        # instantiate the keyword arguments for the initialization
        kwargs = {}
        if voting_type:
            kwargs["voting"] = voting_type
        if feature_scaling:
            kwargs["feature_scaling"] = feature_scaling
        if pos_label:
            kwargs["pos_label"] = pos_label
        if min_feature_count:
            kwargs["min_feature_count"] = min_feature_count
        if sampler_list is not None:
            sampler_list = ["RBFSampler", "Nystroem", "AdditiveChi2Sampler"]
            kwargs["sampler_list"] = sampler_list

        # if the voting learner is a classifier
        if learner_type == "classifier":
            # we are using 3 learners
            learner_names = ["LogisticRegression", "SVC", "MultinomialNB"]

            # add the model parameters for each of the learners
            if model_kwargs_list is not None:
                given_model_kwargs_list = [
                    {"C": 0.01},
                    {"C": 10.0, "kernel": "poly"},
                    {"alpha": 0.75},
                ]
                kwargs["model_kwargs_list"] = given_model_kwargs_list
        else:
            # we are using 3 learners
            learner_names = ["LinearRegression", "SVR", "RandomForestRegressor"]

            # add the model parameters for each of the learners
            if model_kwargs_list is not None:
                given_model_kwargs_list = [
                    {},
                    {"C": 0.01, "kernel": "linear"},
                    {"n_estimators": 1000},
                ]
                kwargs["model_kwargs_list"] = given_model_kwargs_list

        # initialize the voting learner
        vl = VotingLearner(learner_names, **kwargs)

        # check that we have the right number and type of learners
        self.assertEqual(len(vl.learners), len(learner_names))
        self.assertEqual(vl.learners[0].model_type.__name__, learner_names[0])
        self.assertEqual(vl.learners[1].model_type.__name__, learner_names[1])
        self.assertEqual(vl.learners[2].model_type.__name__, learner_names[2])

        # check that the probability attribute is properly set
        if learner_type == "classifier":
            self.assertEqual(vl.learners[0].probability, voting_type == "soft")
            self.assertEqual(vl.learners[1].probability, voting_type == "soft")
            self.assertEqual(vl.learners[2].probability, voting_type == "soft")

        # check that we have the right attribute values
        self.assertEqual(vl.learner_type, learner_type)
        self.assertEqual(vl.label_dict, {})

        # check that the voting type is properly set
        if learner_type == "classifier":
            expected_voting_type = "hard" if voting_type is None else voting_type
        else:
            expected_voting_type = None
        self.assertEqual(vl.voting, expected_voting_type)

        # check that feature scaling is properly set
        expected_feature_scaling = "none" if feature_scaling is None else feature_scaling
        self.assertEqual(vl.learners[0]._feature_scaling, expected_feature_scaling)
        self.assertEqual(vl.learners[1]._feature_scaling, expected_feature_scaling)
        self.assertEqual(vl.learners[2]._feature_scaling, expected_feature_scaling)

        # check that any given model kwargs are reflected
        if model_kwargs_list:
            self.assertEqual(vl.model_kwargs_list, given_model_kwargs_list)
            if learner_type == "classifier":
                self.assertEqual(vl.learners[0].model_kwargs["C"], given_model_kwargs_list[0]["C"])
                self.assertEqual(vl.learners[1].model_kwargs["C"], given_model_kwargs_list[1]["C"])
                self.assertEqual(
                    vl.learners[1].model_kwargs["kernel"], given_model_kwargs_list[1]["kernel"]
                )
                self.assertEqual(
                    vl.learners[2].model_kwargs["alpha"], given_model_kwargs_list[2]["alpha"]
                )
            else:
                self.assertEqual(vl.learners[1].model_kwargs["C"], given_model_kwargs_list[1]["C"])
                self.assertEqual(
                    vl.learners[1].model_kwargs["kernel"], given_model_kwargs_list[1]["kernel"]
                )
                self.assertEqual(
                    vl.learners[2].model_kwargs["n_estimators"],
                    given_model_kwargs_list[2]["n_estimators"],
                )
        else:
            self.assertEqual(vl.model_kwargs_list, [])

        # check that any given samplers are actually used
        if sampler_list:
            self.assertEqual(vl.sampler_list, sampler_list)
            self.assertEqual(vl.learners[0].sampler.__class__.__name__, "RBFSampler")
            self.assertEqual(vl.learners[1].sampler.__class__.__name__, "Nystroem")
            self.assertEqual(vl.learners[2].sampler.__class__.__name__, "AdditiveChi2Sampler")
        else:
            self.assertEqual(vl.sampler_list, [])

        # check that the sampler kwargs list is empty since we never pass any
        self.assertEqual(vl.sampler_kwargs_list, [])

    def test_initialize(self):
        for (
            learner_type,
            voting_type,
            feature_scaling,
            pos_label,
            min_feature_count,
            model_kwargs_list,
            sampler_list,
        ) in product(
            ["classifier", "regressor"],
            [None, "hard", "soft"],
            [None, "none", "both", "with_mean", "with_std"],
            [None, "a"],
            [None, 5],
            [None, True],
            [None, True],
        ):
            with self.subTest(
                learner_type=learner_type,
                voting_type=voting_type,
                feature_scaling=feature_scaling,
                pos_label=pos_label,
                min_feature_count=min_feature_count,
                model_kwargs_list=model_kwargs_list,
                sampler_list=sampler_list,
            ):
                self.check_initialize(
                    learner_type,
                    voting_type,
                    feature_scaling,
                    pos_label,
                    min_feature_count,
                    model_kwargs_list,
                    sampler_list,
                )

    def test_initialize_bad_model_kwargs_list(self):
        """Test that a non-list ``model_kwargs_list`` raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"should be a list",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            model_kwargs_list={"C": 0.01},
        )

    def test_initialize_bad_sampler_list(self):
        """Test that a non-list ``sampler_list`` raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"should be a list",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            sampler_list="Nystroem",
        )

    def test_initialize_bad_sampler_kwargs_list(self):
        """Test that a non-list ``sampler_kwargs_list`` raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"should be a list",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            sampler_kwargs_list=0.01,
        )

    def test_initialize_incorrect_model_kwargs_list(self):
        """Test that a ``model_kwargs_list`` of the wrong length raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"must have 3 entries",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            model_kwargs_list=[{"C": 0.01}, {"C": 0.1}],
        )

    def test_initialize_incorrect_sampler_list(self):
        """Test that a ``sampler_list`` of the wrong length raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"must have 3 entries",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            sampler_list=["RBFSampler"],
        )

    def test_initialize_incorrect_sampler_kwargs_list(self):
        """Test that a ``sampler_kwargs_list`` of the wrong length raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"must have 3 entries",
            VotingLearner,
            ["SVC", "LogisticRegression", "MultinomialNB"],
            sampler_kwargs_list=[{"gamma": 1.0}],
        )

    def test_initialize_bad_learner_types(self):
        """Test that mixing classifiers and regressors raises a ValueError."""
        assert_raises_regex(
            ValueError,
            r"cannot mix classifiers and regressors",
            VotingLearner,
            ["SVC", "LinearRegression", "MultinomialNB"],
        )

    def check_train(self, learner_type, with_grid_search):
        """Run checks when training voting learners."""
        # if the voting learner is a classifier
        if learner_type == "classifier":
            # use 3 classifiers, the digits training set, and accuracy
            # as the grid search objective
            learner_names = ["LogisticRegression", "SVC", "MultinomialNB"]
            estimator_classes = [LogisticRegression, SVC, MultinomialNB]
            featureset = TRAIN_FS_DIGITS
            objective = "accuracy"
        else:
            # otherwise use 3 regressors, the housing training set,
            # and pearson as the grid search objective
            learner_names = ["LinearRegression", "SVR", "RandomForestRegressor"]
            estimator_classes = [LinearRegression, SVR, RandomForestRegressor]
            featureset = TRAIN_FS_HOUSING
            objective = "pearson"

        # instantiate and train a voting learner
        vl = VotingLearner(learner_names)
        vl.train(
            featureset, grid_objective=objective, grid_search=with_grid_search, grid_search_folds=3
        )

        # check that the training worked
        self.assertIsNotNone(vl.model)
        model_type = VotingClassifier if learner_type == "classifier" else VotingRegressor
        assert isinstance(vl.model, model_type)

        # check the underlying learners
        self.assertEqual(len(vl.learners), len(learner_names))
        assert isinstance(vl.learners[0].model, estimator_classes[0])
        assert isinstance(vl.learners[1].model, estimator_classes[1])
        assert isinstance(vl.learners[2].model, estimator_classes[2])

        self.assertEqual(len(vl.model.named_estimators_), 3)
        pipeline1 = vl.model.named_estimators_[learner_names[0]]
        pipeline2 = vl.model.named_estimators_[learner_names[1]]
        pipeline3 = vl.model.named_estimators_[learner_names[2]]
        assert isinstance(pipeline1, Pipeline)
        assert isinstance(pipeline2, Pipeline)
        assert isinstance(pipeline3, Pipeline)
        assert isinstance(pipeline1["estimator"], estimator_classes[0])
        assert isinstance(pipeline2["estimator"], estimator_classes[1])
        assert isinstance(pipeline3["estimator"], estimator_classes[2])

    def test_train(self):
        for learner_type, with_grid_search in product(["classifier", "regressor"], [False, True]):
            with self.subTest(learner_type=learner_type, with_grid_search=with_grid_search):
                self.check_train(learner_type, with_grid_search)

    def test_train_with_custom_path(self):
        """Test voting classifier with custom learner path."""
        # instantiate and train a voting classifier on the digits training set
        learner_names = ["CustomLogisticRegressionWrapper", "SVC"]
        vl = VotingLearner(learner_names, custom_learner_path=str(CUSTOM_LEARNER_PATH))
        vl.train(TRAIN_FS_DIGITS, grid_objective="accuracy", grid_search=False)

        # check that we have a trained model
        self.assertIsNotNone(vl.model)
        assert isinstance(vl.model, VotingClassifier)

        # check the underlying learners
        self.assertEqual(len(vl.learners), 2)
        self.assertEqual(
            vl.learners[0].model.__class__.__name__, "CustomLogisticRegressionWrapper"
        )
        assert isinstance(vl.learners[1].model, SVC)

        self.assertEqual(len(vl.model.named_estimators_), 2)
        pipeline1 = vl.model.named_estimators_["CustomLogisticRegressionWrapper"]
        pipeline2 = vl.model.named_estimators_["SVC"]
        assert isinstance(pipeline1, Pipeline)
        assert isinstance(pipeline2, Pipeline)
        self.assertEqual(
            pipeline1["estimator"].__class__.__name__, "CustomLogisticRegressionWrapper"
        )
        assert isinstance(pipeline2["estimator"], SVC)

    def test_train_bad_param_grid_list(self):
        """Test that a non-list ``param_grid_list`` raises a ValueError during training."""
        vl = VotingLearner(["SVC", "LogisticRegression", "MultinomialNB"])
        assert_raises_regex(
            ValueError,
            r"should be a list",
            vl.train,
            TRAIN_FS_DIGITS[:100],
            grid_objective="accuracy",
            param_grid_list={"C": [0.01, 0.1, 1.0, 10.0]},
        )

    def check_save_and_load(self, learner_type, use_current_directory):
        """Check that saving and loading models works as expected."""
        # if the voting learner is a classifier, use 3 classifiers and a
        # subset of the digits training set; otherwise use 3 regressors
        # and a subset of the housing training set
        if learner_type == "classifier":
            learner_names = ["LogisticRegression", "SVC", "MultinomialNB"]
            featureset = TRAIN_FS_DIGITS[:100]
        else:
            learner_names = ["LinearRegression", "SVR", "RandomForestRegressor"]
            featureset = TRAIN_FS_HOUSING[:100]

        # instantiate and train a voting learner without grid search
        vl = VotingLearner(learner_names)
        vl.train(featureset, grid_search=False)

        # save this trained model either into the current directory
        # or into the output directory
        if use_current_directory:
            model_name = Path("test_current_directory.model")
        else:
            model_name = output_dir / "test_other_directory.model"
        vl.save(model_name)

        # make sure that the model was saved and that loading it back
        # yields the same model
        self.assertTrue(model_name.exists())
        vl2 = VotingLearner.from_file(model_name)
        self.assertEqual(vl._learner_names, vl2._learner_names)
        self.assertEqual(vl.model_type, vl2.model_type)
        self.assertEqual(vl.voting, vl2.voting)
        self.assertEqual(vl.learner_type, vl2.learner_type)

    def test_save_and_load(self):
        for learner_type, use_current_directory in product(
            ["classifier", "regressor"], [False, True]
        ):
            with self.subTest(
                learner_type=learner_type, use_current_directory=use_current_directory
            ):
                self.check_save_and_load(learner_type, use_current_directory)
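

# Minimal entry-point sketch: assuming these tests are normally collected by a
# test runner such as pytest, this guard simply allows the module to also be
# executed directly (``python test_voting_learners_api_1.py``) with the
# standard-library unittest runner.
if __name__ == "__main__":
    unittest.main(verbosity=2)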