-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathtest_cv.py
459 lines (389 loc) · 18.4 KB
/
test_cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# License: BSD 3 clause
"""
Run cross-validation tests.
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:author: Aoife Cahill (acahill@ets.org)
"""
import csv
import itertools
import json
import re
import unittest
import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.datasets import make_classification
from sklearn.feature_extraction import FeatureHasher
from skll.config import load_cv_folds
from skll.data import FeatureSet
from skll.experiments import load_featureset, run_configuration
from skll.learner import Learner
from skll.utils.testing import (
compute_expected_folds_for_cv_testing,
config_dir,
create_jsonlines_feature_files,
fill_in_config_paths_for_single_file,
other_dir,
output_dir,
remove_jsonlines_feature_files,
train_dir,
unlink,
)
class TestCrossValidation(unittest.TestCase):
    """Test class for cross-validation tests."""

    @classmethod
    def setUpClass(cls):
        """Create necessary directories and feature files for testing."""
        # make sure both the training and output directories exist
        for directory in (train_dir, output_dir):
            directory.mkdir(exist_ok=True)
        # create jsonlines feature files used by the tests below
        create_jsonlines_feature_files(train_dir)
@classmethod
def tearDownClass(cls):
"""Clean up after tests."""
fold_file_path = other_dir / "custom_folds.csv"
unlink(fold_file_path)
config_dir_obj = config_dir
cfg_files = [
config_dir_obj / "test_save_cv_folds.cfg",
config_dir_obj / "test_save_cv_models.cfg",
config_dir_obj / "test_custom_cv_seed_classifier.cfg",
config_dir_obj / "test_custom_cv_seed_regressor.cfg",
config_dir_obj / "test_folds_file.cfg",
config_dir_obj / "test_folds_file_grid.cfg",
]
for cfg_file in cfg_files:
unlink(cfg_file)
for output_file in itertools.chain(
output_dir.glob("test_save_cv_folds*"),
output_dir.glob("test_int_labels_cv_*"),
output_dir.glob("test_save_cv_models*"),
output_dir.glob("test_custom_cv_seed*"),
output_dir.glob("test_folds_file*"),
):
unlink(output_file)
remove_jsonlines_feature_files(train_dir)
def make_cv_folds_data(self, num_examples_per_fold=100, num_folds=3, use_feature_hashing=False):
"""Create data for pre-specified CV folds tests with or without feature hashing."""
num_total_examples = num_examples_per_fold * num_folds
# create the numeric features and the binary labels
X, _ = make_classification(
n_samples=num_total_examples,
n_features=3,
n_informative=3,
n_redundant=0,
n_classes=2,
random_state=1234567890,
)
y = np.array([0, 1] * int(num_total_examples / 2))
# the folds mapping: the first num_examples_per_fold examples
# are in fold 1 the second num_examples_per_fold are in
# fold 2 and so on
foldgen = ([str(i)] * num_examples_per_fold for i in range(num_folds))
folds = list(itertools.chain(*foldgen))
# now create the list of feature dictionaries
# and add the binary features that depend on
# the class and fold number
feature_names = [f"f{i}" for i in range(1, 4)]
features = []
for row, classid, foldnum in zip(X, y, folds):
string_feature_name = f"is_{classid}_{foldnum}"
string_feature_value = 1
feat_dict = dict(zip(feature_names, row))
feat_dict.update({string_feature_name: string_feature_value})
features.append(feat_dict)
# create the example IDs
ids = [
f"EXAMPLE_{num_examples_per_fold * k + i}"
for k in range(num_folds)
for i in range(num_examples_per_fold)
]
# create the cross-validation feature set with or without feature hashing
vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
cv_fs = FeatureSet("cv_folds", ids, features=features, labels=y, vectorizer=vectorizer)
# make the custom cv folds dictionary
custom_cv_folds = dict(zip(ids, folds))
return cv_fs, custom_cv_folds
def test_specified_cv_folds(self):
"""Check cross-validation results with specified folds, feature hashing, and RBFSampler."""
# This runs four tests.
# The first does not use feature hashing with 9 features (3 numeric, 6
# binary) has pre-specified folds and has less than 60% accuracy for each
# of the 3 folds.
# The second uses feature hashing with 4 features, uses 10 folds (not pre-
# specified) and has more than 70% accuracy accuracy for each of the 10
# folds.
# The third is the same as the first but uses an RBFSampler.
# The fourth is the same as the second but uses an RBFSampler.
for test_value, assert_func, expected_folds, use_hashing, use_sampler in [
(0.58, self.assertLess, 3, False, False),
(0.1, self.assertGreater, 10, True, False),
(0.57, self.assertLess, 3, False, True),
(0.69, self.assertGreater, 10, True, True),
]:
sampler = "RBFSampler" if use_sampler else None
learner = Learner("LogisticRegression", sampler=sampler)
cv_fs, custom_cv_folds = self.make_cv_folds_data(use_feature_hashing=use_hashing)
folds = custom_cv_folds if not use_hashing else 10
(grid_scores, _, _, _, _) = learner.cross_validate(
cv_fs,
cv_folds=folds,
grid_search=True,
grid_objective="f1_score_micro",
save_cv_folds=False,
)
fold_test_scores = [t[-2] for t in grid_scores]
overall_score = np.mean(fold_test_scores)
assert_func(overall_score, test_value)
self.assertEqual(len(fold_test_scores), expected_folds)
for fold_score in fold_test_scores:
assert_func(fold_score, test_value)
def test_load_cv_folds(self):
"""Test to check that cross-validation folds are correctly loaded from a CSV file."""
# create custom CV folds
custom_cv_folds = self.make_cv_folds_data()[1]
# write the generated CV folds to a CSV file
fold_file_path = other_dir / "custom_folds.csv"
with open(fold_file_path, "w", newline="") as foldf:
w = csv.writer(foldf)
w.writerow(["id", "fold"])
for example_id, fold_label in custom_cv_folds.items():
w.writerow([example_id, fold_label])
# now read the CSV file using _load_cv_folds
custom_cv_folds_loaded = load_cv_folds(fold_file_path)
self.assertEqual(custom_cv_folds_loaded, custom_cv_folds)
def test_load_cv_folds_non_float_ids(self):
"""Test to check that CV folds with non-float IDs raise error when converted to floats."""
# create custom CV folds
custom_cv_folds = self.make_cv_folds_data()[1]
# write the generated CV folds to a CSV file
fold_file_path = other_dir / "custom_folds.csv"
with open(fold_file_path, "w", newline="") as foldf:
w = csv.writer(foldf)
w.writerow(["id", "fold"])
for example_id, fold_label in custom_cv_folds.items():
w.writerow([example_id, fold_label])
# now read the CSV file using _load_cv_folds, which should raise ValueError
with self.assertRaises(ValueError):
load_cv_folds(fold_file_path, ids_to_floats=True)
def test_retrieve_cv_folds(self):
"""Test to make sure that the fold ids get returned correctly after cross-validation."""
# Setup
learner = Learner("LogisticRegression")
num_folds = 5
cv_fs, custom_cv_folds = self.make_cv_folds_data(
num_examples_per_fold=2, num_folds=num_folds
)
# Test 1: learner.cross_validate() makes the folds itself.
expected_fold_ids = {
"EXAMPLE_0": "0",
"EXAMPLE_1": "4",
"EXAMPLE_2": "3",
"EXAMPLE_3": "1",
"EXAMPLE_4": "2",
"EXAMPLE_5": "2",
"EXAMPLE_6": "1",
"EXAMPLE_7": "0",
"EXAMPLE_8": "4",
"EXAMPLE_9": "3",
}
_, _, _, skll_fold_ids, _ = learner.cross_validate(
cv_fs,
stratified=True,
cv_folds=num_folds,
grid_search=True,
grid_objective="f1_score_micro",
shuffle=False,
)
self.assertEqual(skll_fold_ids, expected_fold_ids)
# Test 2: if we pass in custom fold ids, those are also preserved.
_, _, _, skll_fold_ids, _ = learner.cross_validate(
cv_fs,
stratified=True,
cv_folds=custom_cv_folds,
grid_search=True,
grid_objective="f1_score_micro",
shuffle=False,
)
self.assertEqual(skll_fold_ids, custom_cv_folds)
# Test 3: when learner.cross_validate() makes the folds but stratified=False
# and grid_search=False, so that KFold is used.
expected_fold_ids = {
"EXAMPLE_0": "0",
"EXAMPLE_1": "0",
"EXAMPLE_2": "1",
"EXAMPLE_3": "1",
"EXAMPLE_4": "2",
"EXAMPLE_5": "2",
"EXAMPLE_6": "3",
"EXAMPLE_7": "3",
"EXAMPLE_8": "4",
"EXAMPLE_9": "4",
}
_, _, _, skll_fold_ids, _ = learner.cross_validate(
cv_fs, stratified=False, cv_folds=num_folds, grid_search=False, shuffle=False
)
self.assertEqual(skll_fold_ids, custom_cv_folds)
def test_folds_file_logging_num_folds(self):
"""Test when using `folds_file`, log shows number of folds and appropriate warning."""
# Run experiment
suffix = ".jsonlines"
train_path = train_dir / f"f0{suffix}"
template_path = config_dir / "test_folds_file.template.cfg"
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
# Check experiment log output
with open(output_dir / "test_folds_file_logging.log") as f:
cv_file_pattern = re.compile(
r'Specifying "folds_file" overrides both explicit and default ' r'"num_cv_folds".'
)
matches = re.findall(cv_file_pattern, f.read())
self.assertEqual(len(matches), 1)
# Check job log output
with open(output_dir / "test_folds_file_logging_LogisticRegression.log") as f:
cv_folds_pattern = re.compile(
r"(Task: cross_validate\n)(.+)(Cross-validating \([0-9]+ folds, seed=[0-9]+\))"
)
matches = re.findall(cv_folds_pattern, f.read())
self.assertEqual(len(matches), 1)
def test_folds_file_with_fewer_ids_than_featureset(self):
"""Test when using `folds_file`, log shows warning for extra IDs in featureset."""
# Run experiment with a special featureset that has extra IDs
suffix = ".jsonlines"
train_path = train_dir / f"f5{suffix}"
template_path = config_dir / "test_folds_file.template.cfg"
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
# Check job log output
with open(output_dir / "test_folds_file_logging_LogisticRegression.log") as f:
cv_file_pattern = re.compile(
r"Feature set contains IDs that are not in folds dictionary. "
r"Skipping those IDs."
)
matches = re.findall(cv_file_pattern, f.read())
self.assertEqual(len(matches), 1)
def test_folds_file_logging_grid_search(self):
"""
Test logging with `folds_file`.
When `folds_file` is used but `use_folds_file` for grid search
is specified that we get an appropriate message in the log.
"""
# Run experiment
suffix = ".jsonlines"
train_path = train_dir / f"f0{suffix}"
template_path = config_dir / "test_folds_file_grid.template.cfg"
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
# Check experiment log output
with open(output_dir / "test_folds_file_logging.log") as f:
cv_file_pattern = re.compile(
r'Specifying "folds_file" overrides both explicit and default '
r'"num_cv_folds".\n(.+)The specified "folds_file" will not be '
r"used for inner grid search."
)
matches = re.findall(cv_file_pattern, f.read())
self.assertEqual(len(matches), 1)
def test_cross_validate_task(self):
"""Test that 10-fold cross_validate experiments work and fold ids get saved."""
# Run experiment
suffix = ".jsonlines"
train_path = train_dir / f"f0{suffix}"
template_path = config_dir / "test_save_cv_folds.template.cfg"
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
# Check final average results
with open(output_dir / "test_save_cv_folds_LogisticRegression.results.json") as f:
result_dict = json.load(f)[10]
assert_almost_equal(result_dict["accuracy"], 0.517)
# Check that the fold ids were saved correctly
# First compute the expected fold IDs
examples = load_featureset(train_path, "", suffix, quiet=True)
expected_skll_ids = compute_expected_folds_for_cv_testing(examples, num_folds=10)
# read in the computed fold IDs
skll_fold_ids = {}
with open(output_dir / "test_save_cv_folds_skll_fold_ids.csv") as f:
reader = csv.DictReader(f)
for row in reader:
skll_fold_ids[row["id"]] = row["cv_test_fold"]
# convert the dictionary to strings (sorted by key) for quick comparison
skll_fold_ids_str = "".join(f"{key}{val}" for key, val in sorted(skll_fold_ids.items()))
expected_skll_ids_str = "".join(
f"{key}{val}" for key, val in sorted(expected_skll_ids.items())
)
self.assertEqual(skll_fold_ids_str, expected_skll_ids_str)
def test_cross_validate_task_save_cv_models(self):
"""Test 10-fold cross_validate experiments and model saving."""
suffix = ".jsonlines"
train_path = train_dir / f"f0{suffix}"
template_path = config_dir / "test_save_cv_models.template.cfg"
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
cv_model_prefix = "test_save_cv_models_LogisticRegression_fold"
for i in range(1, 11):
model_path = output_dir / f"{cv_model_prefix}{i}.model"
assert model_path.exists()
def check_cross_validate_task_with_custom_seed(self, learner_type, use_config):
"""
Test cross-validation with custom specified seed.
Test that cross-validation for either a classifier or a regressor
with an custom seed and with grid search enabled generates the
same folds as expected. We run the test using a configuration file
as well as the API.
"""
# load in the featureset
suffix = ".jsonlines"
train_path = train_dir / f"f0{suffix}"
if learner_type == "classifier":
examples = load_featureset(train_path, "", suffix, quiet=True)
else:
examples = load_featureset(
train_path, "", suffix, class_map={"dog": 0, "cat": 1}, quiet=True
)
# use a configuration file or the API, as specified
if use_config:
template_name = (
"test_custom_cv_seed_classifier.template.cfg"
if learner_type == "classifier"
else "test_custom_cv_seed_regressor.template.cfg"
)
template_path = config_dir / template_name
config_path = fill_in_config_paths_for_single_file(template_path, train_path, None)
run_configuration(config_path, quiet=True, local=True)
# read in the folds file produced by SKLL
computed_fold_ids = {}
cv_folds_file_name = (
"test_custom_cv_seed_clf_skll_fold_ids.csv"
if learner_type == "classifier"
else "test_custom_cv_seed_reg_skll_fold_ids.csv"
)
with open(output_dir / cv_folds_file_name) as f:
reader = csv.DictReader(f)
for row in reader:
computed_fold_ids[row["id"]] = row["cv_test_fold"]
else:
learner_name = "LogisticRegression" if learner_type == "classifier" else "Ridge"
learner = Learner(learner_name)
objective = "accuracy" if learner_type == "classifier" else "pearson"
(_, _, _, computed_fold_ids, _) = learner.cross_validate(
examples, cv_folds=10, cv_seed=54321, grid_search=True, grid_objective=objective
)
# compute the fold IDs we expect from SKLL using the custom seed directly;
# obviously, we only use stratified fold splitting with classifiers
expected_fold_ids = compute_expected_folds_for_cv_testing(
examples, stratified=learner_type == "classifier", seed=54321
)
# convert the dictionary to strings (sorted by key) for quick comparison
computed_fold_ids_str = "".join(
f"{key}{val}" for key, val in sorted(computed_fold_ids.items())
)
expected_fold_ids_str = "".join(
f"{key}{val}" for key, val in sorted(expected_fold_ids.items())
)
# the two sets of folds must be equal
self.assertEqual(computed_fold_ids_str, expected_fold_ids_str)
def test_cross_validate_task_with_custom_seed(self):
yield self.check_cross_validate_task_with_custom_seed, "classifier", False
yield self.check_cross_validate_task_with_custom_seed, "classifier", True
yield self.check_cross_validate_task_with_custom_seed, "regressor", False
yield self.check_cross_validate_task_with_custom_seed, "regressor", True