# NOTE(review): this span was a mangled GitHub diff rendering (gutter numbers
# and -/+ hunk markers interleaved with code).  Reconstructed below is the
# post-commit ("+") version of the script.
#
# NOTE(review): `sklearn.cross_validation` and `sklearn.lda` were removed in
# scikit-learn 0.20+.  The code relies on the old APIs
# (`StratifiedKFold(y=..., n_folds=...)`, `LDA`), so the imports are kept
# as-is; on a modern scikit-learn, port to `sklearn.model_selection` and
# `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`.
#
# NOTE(review): `np`, `plt` and `ocr_utils` are used throughout but imported
# in file lines not visible here (presumably numpy, matplotlib.pyplot and the
# project's ocr_utils module) — confirm against the full file.
from sklearn.cross_validation import StratifiedKFold
from sklearn.lda import LDA


def _cv_evaluate(pipe_lr, X_train, X_test, y_train, y_test):
    """Fit a pipeline, print its test accuracy and 10-fold CV accuracy.

    Runs the cross-validation twice, as the original did: once with a manual
    StratifiedKFold loop and once with cross_val_score (parallelized across
    all cores via n_jobs=-1).  Returns (mean, std) of the cross_val_score
    results.
    """
    from sklearn.cross_validation import cross_val_score

    pipe_lr.fit(X_train, y_train)
    print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

    # Manual stratified 10-fold loop (kept from the original; the commented
    # per-fold prints were already disabled there).
    kfold = StratifiedKFold(y=y_train,
                            n_folds=10,
                            random_state=1)
    scores = []
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train[train])
        score = pipe_lr.score(X_train[test], y_train[test])
        scores.append(score)
        # print('train {} samples: {}'.format(len(train), train))
        # print('Fold: %s, Acc: %.3f' % (k + 1, score))

    print('\n CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

    # Same evaluation via cross_val_score; n_jobs=-1 uses every core.
    scores = cross_val_score(estimator=pipe_lr,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             n_jobs=-1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
    return np.mean(scores), np.std(scores)


def _plot_sweep(num_planes, values, ylabel, xlabel, title):
    """Plot one metric versus the number of components and save/show it."""
    plt.plot(num_planes, values, marker='o')
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.tight_layout()
    ocr_utils.show_figures(plt, title)


if __name__ == '__main__':
    # charsToTrain=range(48,58)
    chars_to_train = range(48, 58)  # ASCII codes for the digits '0'-'9'

    num_chars = 3000  # limit the number to speed up the calculation

    input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}

    # output the character label and the image and column sums
    output_feature_list = ['m_label', 'image']

    # read the complete image (20x20) = 400 pixels for each character
    ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
                             output_feature_list=output_feature_list,
                             random_state=0)

    y_train = ds.train.features[0][:num_chars]
    X_train = ds.train.features[1][:num_chars]

    # y_test = ds.test.features[0]-48
    # X_test = ds.test.features[1]

    # Mid-script imports kept local to the guard, as in the original, but
    # gathered in one place instead of scattered between statements.
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.3, random_state=0)

    # Sweep the number of retained components/discriminants from 2 to 11.
    num_planes = range(2, 12)

    # ---- PCA sweep: scale -> PCA(n) -> logistic regression ----
    pca_scores = []
    pca_std_dev = []
    for num_PCA in num_planes:
        print('number of Principal Components = {}'.format(num_PCA))
        pipe_lr = Pipeline([('scl', StandardScaler()),
                            ('pca', PCA(n_components=num_PCA)),
                            ('clf', LogisticRegression(random_state=1))])
        mean_score, std_score = _cv_evaluate(pipe_lr, X_train, X_test,
                                             y_train, y_test)
        pca_scores.append(mean_score)
        pca_std_dev.append(std_score)

    _plot_sweep(num_planes, pca_scores, 'Accuracy',
                'number of Principal Components',
                'Accuracy versus number of Principal Components')
    _plot_sweep(num_planes, pca_std_dev, 'Standard Deviation',
                'number of Principal Components',
                'Standard Deviation versus number of Principal Components')

    # ---- LDA sweep: scale -> LDA(n) -> logistic regression ----
    lda_scores = []
    lda_std_dev = []
    for num_LDA in num_planes:
        # Fixed: the original printed 'number of Principal Components' here.
        print('number of Linear Discriminants = {}'.format(num_LDA))
        pipe_lr = Pipeline([('scl', StandardScaler()),
                            ('lda', LDA(n_components=num_LDA)),
                            ('clf', LogisticRegression(random_state=1))])
        mean_score, std_score = _cv_evaluate(pipe_lr, X_train, X_test,
                                             y_train, y_test)
        lda_scores.append(mean_score)
        lda_std_dev.append(std_score)

    _plot_sweep(num_planes, lda_scores, 'Accuracy',
                'number of Linear Discriminants',
                'Accuracy versus number of Linear Discriminants')
    _plot_sweep(num_planes, lda_std_dev, 'Standard Deviation',
                'number of Linear Discriminants',
                'Standard Deviation versus number of Linear Discriminants')

    print('\n ########################### No Errors ####################################')