Skip to content

Commit f779b3d

Browse files
committed
add sklearn 0.18 layer to scripts
1 parent e06b7e5 commit f779b3d

File tree

11 files changed

+179
-58
lines changed

11 files changed

+179
-58
lines changed
Binary file not shown.
Binary file not shown.
8 KB
Binary file not shown.
+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from sklearn.feature_extraction.text import HashingVectorizer
2+
import re
3+
import os
4+
import pickle
5+
6+
cur_dir = os.path.dirname(__file__)
7+
stop = pickle.load(open(
8+
os.path.join(cur_dir,
9+
'pkl_objects',
10+
'stopwords.pkl'), 'rb'))
11+
12+
def tokenizer(text):
13+
text = re.sub('<[^>]*>', '', text)
14+
emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
15+
text.lower())
16+
text = re.sub('[\W]+', ' ', text.lower()) \
17+
+ ' '.join(emoticons).replace('-', '')
18+
tokenized = [w for w in text.split() if w not in stop]
19+
return tokenized
20+
21+
vect = HashingVectorizer(decode_error='ignore',
22+
n_features=2**21,
23+
preprocessor=None,
24+
tokenizer=tokenizer)

code/optional-py-scripts/ch03.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
import numpy as np
1414
from sklearn import datasets
15-
from sklearn.cross_validation import train_test_split
1615
from sklearn.preprocessing import StandardScaler
1716
from sklearn.metrics import accuracy_score
1817
from sklearn.linear_model import LogisticRegression
@@ -25,6 +24,14 @@
2524
from matplotlib.colors import ListedColormap
2625
import matplotlib.pyplot as plt
2726

27+
# for sklearn 0.18's alternative syntax
28+
from distutils.version import LooseVersion as Version
29+
from sklearn import __version__ as sklearn_version
30+
if Version(sklearn_version) < '0.18':
31+
from sklearn.grid_search import train_test_split
32+
else:
33+
from sklearn.model_selection import train_test_split
34+
2835
#############################################################################
2936
print(50 * '=')
3037
print('Section: First steps with scikit-learn')
@@ -191,7 +198,8 @@ def cost_0(z):
191198
# plt.savefig('./figures/logistic_regression.png', dpi=300)
192199
plt.show()
193200

194-
print('Predicted probabilities', lr.predict_proba(X_test_std[0, :]))
201+
print('Predicted probabilities', lr.predict_proba(X_test_std[0, :]
202+
.reshape(1, -1)))
195203

196204
#############################################################################
197205
print(50 * '=')

code/optional-py-scripts/ch04.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from sklearn.preprocessing import OneHotEncoder
1919
from sklearn.preprocessing import MinMaxScaler
2020
from sklearn.preprocessing import StandardScaler
21-
from sklearn.cross_validation import train_test_split
2221
from sklearn.linear_model import LogisticRegression
2322
from sklearn.neighbors import KNeighborsClassifier
2423
from sklearn.ensemble import RandomForestClassifier
@@ -27,6 +26,13 @@
2726
from itertools import combinations
2827
import matplotlib.pyplot as plt
2928

29+
# for sklearn 0.18's alternative syntax
30+
from distutils.version import LooseVersion as Version
31+
from sklearn import __version__ as sklearn_version
32+
if Version(sklearn_version) < '0.18':
33+
from sklearn.grid_search import train_test_split
34+
else:
35+
from sklearn.model_selection import train_test_split
3036

3137
#############################################################################
3238
print(50 * '=')
@@ -382,5 +388,11 @@ def _calc_score(self, X_train, y_train, X_test, y_test, indices):
382388
# plt.savefig('./random_forest.png', dpi=300)
383389
plt.show()
384390

385-
X_selected = forest.transform(X_train, threshold=0.15)
391+
if Version(sklearn_version) < '0.18':
392+
X_selected = forest.transform(X_train, threshold=0.15)
393+
else:
394+
from sklearn.feature_selection import SelectFromModel
395+
sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
396+
X_selected = sfm.transform(X_train)
397+
386398
X_selected.shape

code/optional-py-scripts/ch05.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
import pandas as pd
1414
import numpy as np
15-
from sklearn.cross_validation import train_test_split
1615
from sklearn.preprocessing import StandardScaler
1716
from sklearn.decomposition import PCA
1817
import matplotlib.pyplot as plt
@@ -27,6 +26,17 @@
2726
from scipy.linalg import eigh
2827
from matplotlib.ticker import FormatStrFormatter
2928

29+
# for sklearn 0.18's alternative syntax
30+
from distutils.version import LooseVersion as Version
31+
from sklearn import __version__ as sklearn_version
32+
if Version(sklearn_version) < '0.18':
33+
from sklearn.grid_search import train_test_split
34+
from sklearn.lda import LDA
35+
else:
36+
from sklearn.model_selection import train_test_split
37+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
38+
39+
3040
#############################################################################
3141
print(50 * '=')
3242
print('Section: Unsupervised dimensionality reduction'

code/optional-py-scripts/ch06.py

+55-28
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,10 @@
1515
import pandas as pd
1616
import matplotlib.pyplot as plt
1717
from sklearn.preprocessing import LabelEncoder
18-
from sklearn.cross_validation import train_test_split
1918
from sklearn.preprocessing import StandardScaler
2019
from sklearn.decomposition import PCA
2120
from sklearn.linear_model import LogisticRegression
2221
from sklearn.pipeline import Pipeline
23-
from sklearn.cross_validation import StratifiedKFold
24-
from sklearn.cross_validation import cross_val_score
25-
from sklearn.learning_curve import learning_curve
26-
from sklearn.learning_curve import validation_curve
27-
from sklearn.grid_search import GridSearchCV
2822
from sklearn.tree import DecisionTreeClassifier
2923
from sklearn.svm import SVC
3024
from sklearn.metrics import confusion_matrix
@@ -38,6 +32,24 @@
3832
from sklearn.metrics import accuracy_score
3933
from scipy import interp
4034

35+
# for sklearn 0.18's alternative syntax
36+
from distutils.version import LooseVersion as Version
37+
from sklearn import __version__ as sklearn_version
38+
if Version(sklearn_version) < '0.18':
39+
from sklearn.grid_search import train_test_split
40+
from sklearn.cross_validation import StratifiedKFold
41+
from sklearn.cross_validation import cross_val_score
42+
from sklearn.learning_curve import learning_curve
43+
from sklearn.learning_curve import validation_curve
44+
from sklearn.grid_search import GridSearchCV
45+
else:
46+
from sklearn.model_selection import train_test_split
47+
from sklearn.model_selection import StratifiedKFold
48+
from sklearn.model_selection import cross_val_score
49+
from sklearn.model_selection import learning_curve
50+
from sklearn.model_selection import validation_curve
51+
from sklearn.model_selection import GridSearchCV
52+
4153
#############################################################################
4254
print(50 * '=')
4355
print('Section: Loading the Breast Cancer Wisconsin dataset')
@@ -83,31 +95,39 @@
8395
print('Section: K-fold cross-validation')
8496
print(50 * '-')
8597

86-
kfold = StratifiedKFold(y=y_train,
87-
n_folds=10,
88-
random_state=1)
98+
if Version(sklearn_version) < '0.18':
99+
kfold = StratifiedKFold(y=y_train,
100+
n_folds=10,
101+
random_state=1)
102+
else:
103+
kfold = StratifiedKFold(n_splits=10,
104+
random_state=1).split(X_train, y_train)
89105

90106
scores = []
91107
for k, (train, test) in enumerate(kfold):
92108
pipe_lr.fit(X_train[train], y_train[train])
93109
score = pipe_lr.score(X_train[test], y_train[test])
94110
scores.append(score)
95-
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
111+
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1,
96112
np.bincount(y_train[train]), score))
97113

98114
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
99115

100116
print('Using StratifiedKFold')
101-
kfold = StratifiedKFold(y=y_train,
102-
n_folds=10,
103-
random_state=1)
117+
if Version(sklearn_version) < '0.18':
118+
kfold = StratifiedKFold(y=y_train,
119+
n_folds=10,
120+
random_state=1)
121+
else:
122+
kfold = StratifiedKFold(n_splits=10,
123+
random_state=1).split(X_train, y_train)
104124

105125
scores = []
106126
for k, (train, test) in enumerate(kfold):
107127
pipe_lr.fit(X_train[train], y_train[train])
108128
score = pipe_lr.score(X_train[test], y_train[test])
109129
scores.append(score)
110-
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
130+
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1,
111131
np.bincount(y_train[train]), score))
112132

113133
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
@@ -134,12 +154,12 @@
134154
('clf', LogisticRegression(penalty='l2', random_state=0))])
135155

136156
train_sizes, train_scores, test_scores =\
137-
learning_curve(estimator=pipe_lr,
138-
X=X_train,
139-
y=y_train,
140-
train_sizes=np.linspace(0.1, 1.0, 10),
141-
cv=10,
142-
n_jobs=1)
157+
learning_curve(estimator=pipe_lr,
158+
X=X_train,
159+
y=y_train,
160+
train_sizes=np.linspace(0.1, 1.0, 10),
161+
cv=10,
162+
n_jobs=1)
143163

144164
train_mean = np.mean(train_scores, axis=1)
145165
train_std = np.std(train_scores, axis=1)
@@ -182,12 +202,12 @@
182202

183203
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
184204
train_scores, test_scores = validation_curve(
185-
estimator=pipe_lr,
186-
X=X_train,
187-
y=y_train,
188-
param_name='clf__C',
189-
param_range=param_range,
190-
cv=10)
205+
estimator=pipe_lr,
206+
X=X_train,
207+
y=y_train,
208+
param_name='clf__C',
209+
param_range=param_range,
210+
cv=10)
191211

192212
train_mean = np.mean(train_scores, axis=1)
193213
train_std = np.std(train_scores, axis=1)
@@ -345,7 +365,14 @@
345365

346366
X_train2 = X_train[:, [4, 14]]
347367

348-
cv = StratifiedKFold(y_train, n_folds=3, random_state=1)
368+
if Version(sklearn_version) < '0.18':
369+
cv = StratifiedKFold(y_train,
370+
n_folds=3,
371+
random_state=1)
372+
373+
else:
374+
cv = list(StratifiedKFold(n_splits=3,
375+
random_state=1).split(X_train, y_train))
349376

350377
fig = plt.figure(figsize=(7, 5))
351378

@@ -367,7 +394,7 @@
367394
tpr,
368395
lw=1,
369396
label='ROC fold %d (area = %0.2f)'
370-
% (i+1, roc_auc))
397+
% (i + 1, roc_auc))
371398

372399
plt.plot([0, 1],
373400
[0, 1],

code/optional-py-scripts/ch07.py

+36-18
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,30 @@
2323
from sklearn.base import clone
2424
from sklearn.pipeline import _name_estimators
2525
from sklearn import datasets
26-
from sklearn.cross_validation import train_test_split
2726
from sklearn.preprocessing import StandardScaler
2827
from sklearn.preprocessing import LabelEncoder
29-
from sklearn.cross_validation import cross_val_score
3028
from sklearn.linear_model import LogisticRegression
3129
from sklearn.tree import DecisionTreeClassifier
3230
from sklearn.neighbors import KNeighborsClassifier
3331
from sklearn.pipeline import Pipeline
3432
from sklearn.metrics import roc_curve
3533
from sklearn.metrics import auc
3634
from sklearn.metrics import accuracy_score
37-
from sklearn.grid_search import GridSearchCV
3835
from sklearn.ensemble import BaggingClassifier
3936
from sklearn.ensemble import AdaBoostClassifier
4037
from itertools import product
4138

39+
# Added version check for recent scikit-learn 0.18 checks
40+
from distutils.version import LooseVersion as Version
41+
from sklearn import __version__ as sklearn_version
42+
if Version(sklearn_version) < '0.18':
43+
from sklearn.cross_validation import train_test_split
44+
from sklearn.cross_validation import cross_val_score
45+
from sklearn.cross_validation import GridSearchCV
46+
else:
47+
from sklearn.model_selection import train_test_split
48+
from sklearn.model_selection import cross_val_score
49+
from sklearn.model_selection import GridSearchCV
4250

4351
#############################################################################
4452
print(50 * '=')
@@ -48,7 +56,7 @@
4856

4957
def ensemble_error(n_classifier, error):
5058
k_start = math.ceil(n_classifier / 2.0)
51-
probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)
59+
probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
5260
for k in range(k_start, n_classifier + 1)]
5361
return sum(probs)
5462

@@ -185,11 +193,11 @@ def predict(self, X):
185193
for clf in self.classifiers_]).T
186194

187195
maj_vote = np.apply_along_axis(
188-
lambda x:
189-
np.argmax(np.bincount(x,
190-
weights=self.weights)),
191-
axis=1,
192-
arr=predictions)
196+
lambda x:
197+
np.argmax(np.bincount(x,
198+
weights=self.weights)),
199+
axis=1,
200+
arr=predictions)
193201
maj_vote = self.lablenc_.inverse_transform(maj_vote)
194202
return maj_vote
195203

@@ -237,9 +245,9 @@ def get_params(self, deep=True):
237245
y = le.fit_transform(y)
238246

239247
X_train, X_test, y_train, y_test =\
240-
train_test_split(X, y,
241-
test_size=0.5,
242-
random_state=1)
248+
train_test_split(X, y,
249+
test_size=0.5,
250+
random_state=1)
243251

244252
clf1 = LogisticRegression(penalty='l2',
245253
C=0.001,
@@ -391,9 +399,19 @@ def get_params(self, deep=True):
391399
scoring='roc_auc')
392400
grid.fit(X_train, y_train)
393401

394-
for params, mean_score, scores in grid.grid_scores_:
395-
print("%0.3f+/-%0.2f %r"
396-
% (mean_score, scores.std() / 2.0, params))
402+
if Version(sklearn_version) < '0.18':
403+
for params, mean_score, scores in grid.grid_scores_:
404+
print("%0.3f +/- %0.2f %r"
405+
% (mean_score, scores.std() / 2.0, params))
406+
407+
else:
408+
cv_keys = ('mean_test_score', 'std_test_score', 'params')
409+
410+
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
411+
print("%0.3f +/- %0.2f %r"
412+
% (grid.cv_results_[cv_keys[0]][r],
413+
grid.cv_results_[cv_keys[1]][r] / 2.0,
414+
grid.cv_results_[cv_keys[2]][r]))
397415

398416
print('Best parameters: %s' % grid.best_params_)
399417
print('Accuracy: %.2f' % grid.best_score_)
@@ -426,9 +444,9 @@ def get_params(self, deep=True):
426444
y = le.fit_transform(y)
427445

428446
X_train, X_test, y_train, y_test =\
429-
train_test_split(X, y,
430-
test_size=0.40,
431-
random_state=1)
447+
train_test_split(X, y,
448+
test_size=0.40,
449+
random_state=1)
432450

433451
tree = DecisionTreeClassifier(criterion='entropy',
434452
max_depth=None,

0 commit comments

Comments (0)