@@ -92,19 +92,7 @@ def read_file(pathName, input_filters_dict, random_state=None):
92
92
rd_font = (rd_font ,)
93
93
except :
94
94
rd_font = ()
95
-
96
- # with ZipFile(pathName, 'r') as myzip:
97
- # if len(rd_font) == 0:
98
- # names = myzip.namelist()
99
- # print ('\nreading all files...please wait')
100
- # df = pd.concat(apply_column_filters(pd.read_csv(myzip.open(fname,'r')), input_filters_dict) for fname in names)
101
- # else:
102
- # try:
103
- # df = pd.concat(apply_column_filters(pd.read_csv(myzip.open(font+".csv",'r')), input_filters_dict) for font in rd_font)
104
- # except:
105
- # raise ValueError('Could not find font file {} in the zip file'.format(rd_font))
106
- # myzip.close()
107
- # assert df.size >0
95
+
108
96
109
97
with ZipFile (pathName , 'r' ) as myzip :
110
98
if len (rd_font ) == 0 :
@@ -168,21 +156,26 @@ class TruthedCharacters(object):
168
156
Holds the training features and size information
169
157
170
158
"""
171
def __init__(self, features, output_feature_list, one_hot_map, engine_type, h, w):
    """Store the feature arrays plus the metadata needed to serve them.

    Args:
        features: list of per-feature arrays; ``features[0].shape[0]``
            is taken as the number of examples.
        output_feature_list: list of feature names, parallel to
            ``features``.
        one_hot_map: one entry per feature. ``0`` means the feature is
            stored as-is; a value > 0 is the one-hot width of that
            feature.
        engine_type: engine name, stored as ``self._engine_type``.
        h: image height, stored as ``self._nRows``.
        w: image width, stored as ``self._nCols``.
    """
    self._num_examples = features[0].shape[0]
    self._nRows = h
    self._nCols = w
    self._features = features                   # list of features
    self._epochs_completed = 0
    self._index_in_epoch = 0
    self._feature_names = output_feature_list   # list of names of features
    self._num_features = len(features)
    self._one_hot_map = one_hot_map             # >0 for each one-hot feature
    self._engine_type = engine_type

    self._feature_width = []
    for i, feature in enumerate(features):
        try:
            n_hots = one_hot_map[i]
            # A one-hot feature is as wide as its number of classes; any
            # other feature is as wide as its second axis.
            width = feature.shape[1] if n_hots == 0 else n_hots
        except (AttributeError, IndexError, TypeError):
            # 1-D features (no second axis), objects without ``shape``,
            # or a missing map entry all count as a single column.
            width = 1
        self._feature_width.append(width)
188
181
@@ -194,10 +187,43 @@ def num_features(self):
194
187
@property
195
188
def feature_width (self ):
196
189
return self ._feature_width
190
+ # fixup for formats required by various engines
191
+ # features that are straight from the .CSV file, without
192
+ # modifications for one-hot or scaling fall here.
193
+ # tensorflow requires a shape (:,1), thus the reshaping
194
+
195
def engine_conversion(self, t1, colName):
    """Reshape a feature array into the layout the configured engine needs.

    Args:
        t1: feature array to convert.
        colName: name of the feature; only ``'image'`` is special-cased.

    Returns:
        For ``'tensorflow'``, a 1-D array reshaped to ``(-1, 1)``.
        For ``'theano'`` and the ``'image'`` feature, the array reshaped
        to ``(-1, 1, self._nRows, self._nCols)``.  Otherwise ``t1``
        unchanged.
    """
    engine = self._engine_type

    if engine == 'tensorflow':
        if len(t1.shape) == 1:
            return np.reshape(t1, (-1, 1))
        return t1

    if engine == 'theano' and colName == 'image':
        return np.reshape(t1, (-1, 1, self._nRows, self._nCols))

    return t1
202
+
203
def get_features(self, i, start, end):
    """Return rows ``start:end`` of feature ``i`` in engine-ready form.

    Memory-saving version of ``features[i][start:end]``: a one-hot
    feature is expanded for the requested slice only.

    Args:
        i: index of the feature.
        start: first row of the slice (inclusive).
        end: last row of the slice (exclusive).
    """
    chunk = self._features[i][start:end]
    width = self._one_hot_map[i]

    if width != 0:
        # Stored values are class indices; pick matching identity rows.
        chunk = np.eye(width)[chunk]

    return self.engine_conversion(chunk, self._feature_names[i])
197
215
198
216
@property
def features(self):
    """List of all features, each expanded and converted for the engine.

    One-hot expansion is deferred until this access (instead of being
    stored) to save memory; features whose one-hot width is 0 are only
    passed through ``engine_conversion``.
    """
    return [
        self.engine_conversion(
            raw if n_hots == 0 else np.eye(n_hots)[raw],
            name,
        )
        for raw, name, n_hots in zip(
            self._features, self._feature_names, self._one_hot_map
        )
    ]
201
227
202
228
@property
203
229
def feature_names (self ):
@@ -244,7 +270,7 @@ def next_batch(self, batch_size):
244
270
end = self ._index_in_epoch
245
271
outs = []
246
272
for i in range (self ._num_features ):
247
- outs += [self ._features [ i ][ start : end ] ]
273
+ outs += [self .get_features ( i , start , end ) ]
248
274
249
275
return outs
250
276
@@ -274,6 +300,8 @@ def dump_values(self):
274
300
print (s [:10 ],end = ' ' )
275
301
print (' ...' )
276
302
303
+
304
+
277
305
def apply_column_filters (df , input_filters_dict ):
278
306
''' apply the column filters to the incoming data
279
307
@@ -292,6 +320,21 @@ def apply_column_filters(df, input_filters_dict ):
292
320
criterion = df [key ].map (lambda x : x in value )
293
321
df = df [criterion ]
294
322
return df
323
+
324
def convert_to_unique(t1):
    ''' convert the values in a numpy.array into indices into the
    sorted array of its unique values

    arguments:
        t1  numpy scalar array; it is NOT modified
    return
        new array with the shape of t1 in which each value is replaced
        by its index, 0 to (number of unique values in t1)-1.
        Numeric input keeps its dtype; any other dtype gives integer
        indices.
    '''
    t1 = np.asarray(t1)

    # return_inverse gives, for each element, its position in the sorted
    # unique array.  Computing it in one pass avoids overwriting values
    # that are still to be compared (which went wrong whenever an
    # assigned index equalled a later unique value, e.g. negative input).
    _, inverse = np.unique(t1, return_inverse=True)
    inverse = inverse.reshape(t1.shape)

    if np.issubdtype(t1.dtype, np.number):
        # keep the caller's (typically compact) dtype
        return inverse.astype(t1.dtype)
    return inverse
295
338
296
339
def read_data (fileName = default_zip_file ,
297
340
input_filters_dict = {},
@@ -421,7 +464,7 @@ class DataSets(object):
421
464
'''
422
465
engine_type = engine_type .lower ()
423
466
424
- print ('\n parameter: input_filters_dict\n \t {}' .format (input_filters_dict ))
467
+ print ('\n parameter: input_filters_dict\n \t {}' .format (sorted ( input_filters_dict . items ()) ))
425
468
print ('parameter: output_feature_list\n \t {}' .format (output_feature_list ))
426
469
427
470
df = read_file (fileName , input_filters_dict ,random_state = random_state )
@@ -433,9 +476,7 @@ class DataSets(object):
433
476
available_columns .append (key )
434
477
435
478
print ('input filters available: \n \t {}:' .format (available_columns ))
436
-
437
-
438
-
479
+
439
480
h = int ((df .iloc [0 ])['h' ]) # get height and width of the image
440
481
w = int ((df .iloc [0 ])['w' ]) # assumes that h and w are the same for all rows
441
482
@@ -454,54 +495,48 @@ class DataSets(object):
454
495
# construct features, one_hots, computed features etc
455
496
456
497
outvars = []
457
- feature_name = []
498
+ feature_name = []
499
+ one_hot_map = []
500
+
458
501
for colName in output_feature_list :
459
-
502
+ one_hot_map . append ( 0 )
460
503
if colName == "aspect_ratio" :
461
504
t1 = np .array (df ['originalW' ] ,dtype = np .float32 )
462
505
t2 = np .array (df ['originalH' ] ,dtype = np .float32 )
463
506
t1 = t1 [:]/ t2 [:]
464
- feature_name .append (colName )
465
-
507
+ feature_name .append (colName )
466
508
467
509
elif colName == "upper_case" :
468
510
boolDF1 = df ['m_label' ]>= 64
469
511
boolDF2 = df ['m_label' ]<= 90
470
512
boolDF = boolDF1 & boolDF2
471
513
t1 = np .array (boolDF ,dtype = np .float32 )
472
- feature_name .append (colName )
514
+ feature_name .append (colName )
473
515
474
516
elif colName == 'image' :
475
517
t1 = np .array (df .loc [:,'r0c0' :],dtype = dtype ) #extract the images with is everything to the right of row 0 column 0
476
518
t1 = np .multiply (t1 , 1.0 / 256.0 )
477
519
feature_name .append (colName )
478
520
479
- elif colName == 'm_label_one_hot' :
480
- try :
481
- t1 = np .array (pd .get_dummies (df ['m_label' ]) , dtype = np .uint16 )
482
- except :
483
- t1 = np .zeros ((df .shape [0 ]))
484
- raise ValueError ('Memory error because the number of one-hot m_labels is too large' )
521
+ elif colName == 'm_label_one_hot' :
522
+ t1 = np .array (df ['m_label' ] , dtype = np .uint16 )
523
+ t1 = convert_to_unique (t1 )
524
+ one_hot_map [- 1 ] = len (np .unique (t1 ))
485
525
feature_name .append (colName )
486
526
487
527
elif colName == 'font_one_hot' :
488
- try :
489
- t1 = np .array (pd .get_dummies (df ['font' ]) , dtype = np .int8 )
490
- except :
491
- t1 = np .zeros ((df .shape [0 ]))
492
- raise ValueError ('Memory error because the number of one-hot fonts is too large' )
528
+ t1 = np .array (df ['font' ] , dtype = np .uint16 )
529
+ t1 = convert_to_unique (t1 )
530
+ one_hot_map [- 1 ] = len (np .unique (t1 ))
493
531
feature_name .append (colName )
494
532
495
533
elif colName == 'fontVariant_one_hot' :
496
- try :
497
- t1 = np .array (pd .get_dummies (df ['fontVariant' ]) , dtype = np .uint16 )
498
- except :
499
- t1 = np .zeros ((df .shape [0 ]))
500
- raise ValueError ('Memory error because the number of one-hot fontVariants is too large' )
534
+ t1 = np .array (df ['fontVariant' ] , dtype = np .uint16 )
535
+ t1 = convert_to_unique (t1 )
536
+ one_hot_map [- 1 ] = len (np .unique (t1 ))
501
537
feature_name .append (colName )
502
538
503
539
elif colName .find ('column_sum' )== 0 :
504
-
505
540
# compute the sum of each vertical column
506
541
t1 = df .loc [:,'r0c0' :]
507
542
t1 = np .multiply (t1 , 1.0 / 256.0 )
@@ -519,27 +554,15 @@ class DataSets(object):
519
554
feature_name .append ('column_sum[:]' )
520
555
else :
521
556
t1 = t1 [:,l ]
522
- feature_name .append ('column_sum{}' .format (l ))
557
+ feature_name .append ('column_sum{}' .format (l ))
523
558
524
559
525
560
else :
526
561
if colName in df .columns :
527
562
t1 = np .array (df [colName ])
528
- feature_name .append (colName )
563
+ feature_name .append (colName )
529
564
else :
530
565
raise ValueError ('Invalid ouput_feature_name: {}: it is not in the the database' .format (colName ))
531
-
532
-
533
- # fixup for formats required by various engines
534
- # features that are straight from the .CSV file, without
535
- # modifications for one-hot or scaling fall here.
536
- # tensorflow requires a shape (:,1), thus the reshaping
537
-
538
- if engine_type == 'tensorflow' and len (t1 .shape )== 1 :
539
- t1 = np .reshape (t1 ,(- 1 ,1 ))
540
-
541
- if engine_type == 'theano' and colName == 'image' :
542
- t1 = np .reshape (t1 ,(- 1 ,1 ,h ,w ))
543
566
544
567
outvars .append (t1 )
545
568
@@ -551,9 +574,9 @@ class DataSets(object):
551
574
outvars_test .append ( ot [:nTestCount ])
552
575
outvars_evaluation .append (ot [nTestCount :nTestCount + nEvaluationCount ])
553
576
554
- data_sets .train = TruthedCharacters (outvars_train , feature_name , h , w )
555
- data_sets .test = TruthedCharacters (outvars_test , feature_name , h , w )
556
- data_sets .evaluation = TruthedCharacters (outvars_evaluation ,feature_name , h , w )
577
+ data_sets .train = TruthedCharacters (outvars_train , feature_name , one_hot_map , engine_type , h , w )
578
+ data_sets .test = TruthedCharacters (outvars_test , feature_name , one_hot_map , engine_type , h , w )
579
+ data_sets .evaluation = TruthedCharacters (outvars_evaluation ,feature_name , one_hot_map , engine_type , h , w )
557
580
print ('feature results:' )
558
581
print ('\t number of train Images = ' ,nTrainCount )
559
582
print ('\t number of test Images = ' ,nTestCount )
@@ -668,6 +691,11 @@ def load_E13B(chars_to_train=(48,49) , columns=(9,17), nChars=None, test_size=0,
668
691
nChars = ds .train .features [0 ].shape [0 ]
669
692
670
693
labels = ['column {} sum' .format (columns [i ]) for i in range (len (columns ))]
694
+ #assert(np.all(ds.train.features[0]==ds.train._features[0]))
695
+ #assert(np.all(ds.train.features[1]==ds.train._features[1]))
696
+ #assert(np.all(ds.test.features[0]==ds.test._features[0]))
697
+ #assert(np.all(ds.test.features[1]==ds.test._features[1]))
698
+
671
699
return ds .train .features [0 ][:nChars ], ds .train .features [1 ][:nChars ], ds .test .features [0 ][:nChars ], ds .test .features [1 ][:nChars ], labels
672
700
673
701
0 commit comments