Skip to content

Commit 89f18a8

Browse files
committed
conserve memory by delaying one_hot calculation
1 parent 8aa5902 commit 89f18a8

File tree

1 file changed

+90
-62
lines changed

1 file changed

+90
-62
lines changed

ocr_utils.py

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -92,19 +92,7 @@ def read_file(pathName, input_filters_dict, random_state=None):
9292
rd_font = (rd_font,)
9393
except:
9494
rd_font = ()
95-
96-
# with ZipFile(pathName, 'r') as myzip:
97-
# if len(rd_font) == 0:
98-
# names = myzip.namelist()
99-
# print ('\nreading all files...please wait')
100-
# df = pd.concat(apply_column_filters(pd.read_csv(myzip.open(fname,'r')), input_filters_dict) for fname in names)
101-
# else:
102-
# try:
103-
# df = pd.concat(apply_column_filters(pd.read_csv(myzip.open(font+".csv",'r')), input_filters_dict) for font in rd_font)
104-
# except:
105-
# raise ValueError('Could not find font file {} in the zip file'.format(rd_font))
106-
# myzip.close()
107-
# assert df.size >0
95+
10896

10997
with ZipFile(pathName, 'r') as myzip:
11098
if len(rd_font) == 0:
@@ -168,21 +156,26 @@ class TruthedCharacters(object):
168156
Holds the training features and size information
169157
170158
"""
171-
def __init__(self, features, output_feature_list,h,w):
159+
def __init__(self, features, output_feature_list, one_hot_map, engine_type,h,w):
172160

173161
self._num_examples = features[0].shape[0]
174162
self._nRows = h
175163
self._nCols = w
176-
self._features = features
164+
self._features = features # list of features
177165
self._epochs_completed = 0
178166
self._index_in_epoch = 0
179-
self._feature_names = output_feature_list
167+
self._feature_names = output_feature_list # list of names of features
180168
self._num_features = len(features)
169+
self._one_hot_map = one_hot_map # per-feature list: 0 if not one-hot, else the one-hot width
170+
self._engine_type= engine_type
181171

182172
self._feature_width=[]
183173
for i in range(self._num_features ):
184174
try:
185-
self._feature_width += [features[i].shape[1]]
175+
if one_hot_map[i] == 0:
176+
self._feature_width += [features[i].shape[1]]
177+
else:
178+
self._feature_width += [one_hot_map[i]]
186179
except:
187180
self._feature_width += [1]
188181

@@ -194,10 +187,43 @@ def num_features(self):
194187
@property
195188
def feature_width(self):
196189
return self._feature_width
190+
# fixup for formats required by various engines
191+
# features that are straight from the .CSV file, without
192+
# modifications for one-hot or scaling fall here.
193+
# tensorflow requires a shape (:,1), thus the reshaping
194+
195+
def engine_conversion(self,t1,colName):
196+
if self._engine_type=='tensorflow' and len(t1.shape)==1:
197+
t1=np.reshape(t1,(-1,1))
198+
199+
if self._engine_type=='theano' and colName=='image':
200+
t1=np.reshape(t1,(-1,1,self._nRows,self._nCols))
201+
return t1
202+
203+
def get_features(self, i, start, end):
204+
'''
205+
memory saving version of features[i][start:end]
206+
'''
207+
t1 = self._features[i][start:end]
208+
n_hots = self._one_hot_map[i]
209+
210+
if n_hots==0:
211+
rtn=self.engine_conversion(t1, self._feature_names[i])
212+
else:
213+
rtn= self.engine_conversion(np.eye(n_hots )[t1], self._feature_names[i])
214+
return rtn
197215

198216
@property
199217
def features(self):
200-
return self._features
218+
# wait until last moment to compute one_hots to save memory
219+
rtn = []
220+
for t1, nm, n_hots in zip(self._features, self._feature_names, self._one_hot_map):
221+
if n_hots==0:
222+
rtn.append(self.engine_conversion(t1, nm) )
223+
#assert(np.all(rtn[-1]==t1))
224+
else:
225+
rtn.append( self.engine_conversion(np.eye(n_hots )[t1], nm) )
226+
return rtn
201227

202228
@property
203229
def feature_names(self):
@@ -244,7 +270,7 @@ def next_batch(self, batch_size):
244270
end = self._index_in_epoch
245271
outs = []
246272
for i in range(self._num_features):
247-
outs += [self._features[i][start:end]]
273+
outs += [self.get_features(i,start,end)]
248274

249275
return outs
250276

@@ -274,6 +300,8 @@ def dump_values(self):
274300
print(s[:10],end=' ')
275301
print(' ...')
276302

303+
304+
277305
def apply_column_filters(df, input_filters_dict ):
278306
''' apply the column filters to the incoming data
279307
@@ -292,6 +320,21 @@ def apply_column_filters(df, input_filters_dict ):
292320
criterion = df[key].map(lambda x: x in value)
293321
df = df[criterion]
294322
return df
323+
324+
def convert_to_unique(t1):
325+
''' convert unique values in a numpy array into
326+
indices into the unique array
327+
arguments:
328+
t1 numpy scalar array
329+
return
330+
t1 with each value changed to an index from 0 to (number of unique
331+
values in t1) - 1
332+
'''
333+
t2 = t1
334+
unique = np.unique(t1)
335+
for i,u in enumerate(unique):
336+
t2[t1==u]=i
337+
return t2
295338

296339
def read_data(fileName=default_zip_file,
297340
input_filters_dict={},
@@ -421,7 +464,7 @@ class DataSets(object):
421464
'''
422465
engine_type = engine_type.lower()
423466

424-
print('\nparameter: input_filters_dict\n\t{}'.format(input_filters_dict))
467+
print('\nparameter: input_filters_dict\n\t{}'.format(sorted(input_filters_dict.items())))
425468
print('parameter: output_feature_list\n\t{}'.format(output_feature_list))
426469

427470
df = read_file(fileName, input_filters_dict,random_state=random_state)
@@ -433,9 +476,7 @@ class DataSets(object):
433476
available_columns.append(key)
434477

435478
print('input filters available: \n\t{}:'.format(available_columns))
436-
437-
438-
479+
439480
h=int((df.iloc[0])['h']) # get height and width of the image
440481
w=int((df.iloc[0])['w']) # assumes that h and w are the same for all rows
441482

@@ -454,54 +495,48 @@ class DataSets(object):
454495
# construct features, one_hots, computed features etc
455496

456497
outvars = []
457-
feature_name=[]
498+
feature_name=[]
499+
one_hot_map = []
500+
458501
for colName in output_feature_list:
459-
502+
one_hot_map.append(0)
460503
if colName=="aspect_ratio":
461504
t1 = np.array(df['originalW'] ,dtype=np.float32)
462505
t2 = np.array(df['originalH'] ,dtype=np.float32)
463506
t1 = t1[:]/t2[:]
464-
feature_name.append(colName)
465-
507+
feature_name.append(colName)
466508

467509
elif colName=="upper_case":
468510
boolDF1 = df['m_label']>=64
469511
boolDF2 = df['m_label']<=90
470512
boolDF = boolDF1 & boolDF2
471513
t1 = np.array(boolDF,dtype=np.float32)
472-
feature_name.append(colName)
514+
feature_name.append(colName)
473515

474516
elif colName=='image':
475517
t1 = np.array(df.loc[:,'r0c0':],dtype=dtype) #extract the images with is everything to the right of row 0 column 0
476518
t1 = np.multiply(t1, 1.0 / 256.0)
477519
feature_name.append(colName)
478520

479-
elif colName=='m_label_one_hot':
480-
try:
481-
t1 = np.array(pd.get_dummies(df['m_label']) , dtype=np.uint16)
482-
except:
483-
t1 = np.zeros((df.shape[0]))
484-
raise ValueError('Memory error because the number of one-hot m_labels is too large')
521+
elif colName=='m_label_one_hot':
522+
t1 = np.array(df['m_label'] , dtype=np.uint16)
523+
t1 = convert_to_unique(t1)
524+
one_hot_map[-1] = len(np.unique(t1))
485525
feature_name.append(colName)
486526

487527
elif colName=='font_one_hot':
488-
try:
489-
t1 = np.array(pd.get_dummies(df['font']) , dtype=np.int8)
490-
except:
491-
t1 = np.zeros((df.shape[0]))
492-
raise ValueError('Memory error because the number of one-hot fonts is too large')
528+
t1 = np.array(df['font'] , dtype=np.uint16)
529+
t1 = convert_to_unique(t1)
530+
one_hot_map[-1] = len(np.unique(t1))
493531
feature_name.append(colName)
494532

495533
elif colName=='fontVariant_one_hot':
496-
try:
497-
t1 = np.array(pd.get_dummies(df['fontVariant']) , dtype=np.uint16)
498-
except:
499-
t1 = np.zeros((df.shape[0]))
500-
raise ValueError('Memory error because the number of one-hot fontVariants is too large')
534+
t1 = np.array(df['fontVariant'] , dtype=np.uint16)
535+
t1 = convert_to_unique(t1)
536+
one_hot_map[-1] = len(np.unique(t1))
501537
feature_name.append(colName)
502538

503539
elif colName.find('column_sum')==0:
504-
505540
# compute the sum of each vertical column
506541
t1 = df.loc[:,'r0c0':]
507542
t1 = np.multiply(t1, 1.0 / 256.0)
@@ -519,27 +554,15 @@ class DataSets(object):
519554
feature_name.append('column_sum[:]')
520555
else:
521556
t1 = t1[:,l]
522-
feature_name.append('column_sum{}'.format(l))
557+
feature_name.append('column_sum{}'.format(l))
523558

524559

525560
else:
526561
if colName in df.columns :
527562
t1=np.array(df[colName])
528-
feature_name.append(colName)
563+
feature_name.append(colName)
529564
else:
530565
raise ValueError('Invalid ouput_feature_name: {}: it is not in the the database'.format(colName))
531-
532-
533-
# fixup for formats required by various engines
534-
# features that are straight from the .CSV file, without
535-
# modifications for one-hot or scaling fall here.
536-
# tensorflow requires a shape (:,1), thus the reshaping
537-
538-
if engine_type=='tensorflow' and len(t1.shape)==1:
539-
t1=np.reshape(t1,(-1,1))
540-
541-
if engine_type=='theano' and colName=='image':
542-
t1=np.reshape(t1,(-1,1,h,w))
543566

544567
outvars.append(t1)
545568

@@ -551,9 +574,9 @@ class DataSets(object):
551574
outvars_test.append( ot[:nTestCount])
552575
outvars_evaluation.append(ot[nTestCount:nTestCount+nEvaluationCount])
553576

554-
data_sets.train = TruthedCharacters(outvars_train, feature_name, h, w)
555-
data_sets.test = TruthedCharacters(outvars_test, feature_name, h, w)
556-
data_sets.evaluation = TruthedCharacters(outvars_evaluation,feature_name, h, w)
577+
data_sets.train = TruthedCharacters(outvars_train, feature_name, one_hot_map, engine_type, h, w)
578+
data_sets.test = TruthedCharacters(outvars_test, feature_name, one_hot_map, engine_type, h, w)
579+
data_sets.evaluation = TruthedCharacters(outvars_evaluation,feature_name, one_hot_map, engine_type, h, w)
557580
print ('feature results:')
558581
print ('\tnumber of train Images = ',nTrainCount)
559582
print ('\tnumber of test Images = ',nTestCount)
@@ -668,6 +691,11 @@ def load_E13B(chars_to_train=(48,49) , columns=(9,17), nChars=None, test_size=0,
668691
nChars = ds.train.features[0].shape[0]
669692

670693
labels= ['column {} sum'.format(columns[i]) for i in range(len(columns))]
694+
#assert(np.all(ds.train.features[0]==ds.train._features[0]))
695+
#assert(np.all(ds.train.features[1]==ds.train._features[1]))
696+
#assert(np.all(ds.test.features[0]==ds.test._features[0]))
697+
#assert(np.all(ds.test.features[1]==ds.test._features[1]))
698+
671699
return ds.train.features[0][:nChars], ds.train.features[1][:nChars], ds.test.features[0][:nChars], ds.test.features[1][:nChars], labels
672700

673701

0 commit comments

Comments
 (0)