change get_list output to dataframe because order of entries was

rrlyman · rrlyman · commit 6e291fe0268b · 2016-08-09T17:26:47.000-07:00
indeterminate
diff --git a/ocr_utils.py b/ocr_utils.py
@@ -116,8 +116,7 @@ def get_list(pathName="fonts.zip",input_filters_dict={}):
         
     Returns
     --------------
-        a list of all the all the unique values available in the 
-        dataset for the given columns
+        a dataframe of all the unique lines in the dataset
         
     Example:
     --------------    
@@ -141,8 +140,7 @@ def get_list(pathName="fonts.zip",input_filters_dict={}):
     keys=list(input_filters_dict.keys())
     df = df[keys]
     df= df.drop_duplicates()
-    y = np.array(df).tolist()
-    return y
+    return df
     
        
 class TruthedCharacters(object):
diff --git a/q1_database_statistics.py b/q1_database_statistics.py
@@ -7,6 +7,7 @@
 '''
 import ocr_utils
 import numpy as np
+import pandas as pd
 
 # # read and show the character images for each font variant
 # # output only the character label and the image
@@ -18,12 +19,12 @@
 # title = '{}: {}'.format('AGENCY','AGENCY Is')
 # ocr_utils.montage(X2D, title=title)
         
-lst = ocr_utils.get_list(input_filters_dict = {'font':()})
+df1 = ocr_utils.get_list(input_filters_dict = {'font':()})
 
 print('\n\nAvailable fonts:')
 import pprint
 pp = pprint.PrettyPrinter()
-pp.pprint(lst)
+pp.pprint(df1)
 # 
 # for font in lst:
 #     input_filters_dict = {'font':font, 'm_label': range(100)}    
@@ -40,9 +41,9 @@
 # read and show the character images for each font variant
 # output only the character label and the image
 fl = ['m_label','image'] 
-for font in lst:    
-    lst2 = ocr_utils.get_list(input_filters_dict={'font':font, 'fontVariant':()})
-    for f,fontVariant in lst2:
+for font in df1:    
+    df2 = ocr_utils.get_list(input_filters_dict={'font':font, 'fontVariant':()})
+    for font,fontVariant in zip(df2['font'],df2['fontVariant']):
         fd = {'font': font, 'fontVariant': fontVariant}
         ds = ocr_utils.read_data(input_filters_dict=fd, output_feature_list=fl, dtype=np.int32)   
         y,X = ds.train.features
diff --git a/q2_tensorflow_mnist.py b/q2_tensorflow_mnist.py
@@ -30,6 +30,7 @@
 import datetime
 from collections import namedtuple
 import numpy as np
+import pandas as pd
  
    
 def train_a_font(input_filters_dict,output_feature_list, nEpochs=5000):
@@ -323,7 +324,7 @@ def computeSize(s,tens):
     sess.close()
 
     
-if True:
+if False:
     # single font train
     
     # esamples
@@ -360,18 +361,18 @@ def computeSize(s,tens):
     # loop through all the fonts and train individually
 
     # pick up the entire list of fonts and font variants. Train each one.
-    lst = ocr_utils.get_list(input_filters_dict={'font': ()})      
+    df1 = ocr_utils.get_list(input_filters_dict={'font': ()})      
     
     import pprint as pprint
     pp = pprint.PrettyPrinter(indent=4)
-    pp.pprint(lst)
+    pp.pprint(df1)
    
     output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case']
     
     # Change nEpochs to 5000 for better results
-    for l in lst:
+    for l in df1:
         input_filters_dict= {'font': (l[0],)}       
-        train_a_font(input_filters_dict,output_feature_list, nEpochs = 1000) 
+        train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 
     
     
 print ('\n########################### No Errors ####################################')