"""
Example employing Lasagne for handprinted digit recognition (data loaded
through ocr_utils), adapted from the Lasagne MNIST example. It is
deliberately structured as a long flat file, focusing on how to use Lasagne
rather than on writing maximally modular and reusable code. The original
example is the foundation for the introductory Lasagne tutorial:
http://lasagne.readthedocs.org/en/latest/user/tutorial.html
More in-depth examples and reproductions of paper results are maintained in
a separate repository: https://github.com/Lasagne/Recipes

Based on:
https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py

@author: richard lyman
"""

from __future__ import print_function

import sys
import time
import numpy as np
import theano
import theano.tensor as T
import lasagne
import ocr_utils  # provides read_data(), used in main() to load the handprinted-digit data

# ##################### Build the neural network model #######################
# This script supports three types of models. For each one, we define a
# function that takes a Theano variable representing the input and returns
# the output layer of a neural network model built in Lasagne.

def build_mlp(input_var=None, nRow=28, nCol=28):
    # This creates an MLP of two hidden layers of 800 units each, followed by
    # a softmax output layer of 10 units. It applies 20% dropout to the input
    # data and 50% dropout to the hidden layers.

    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, nRow rows and nCol columns) and
    # linking it to the given Theano variable `input_var`, if any:
    l_in = lasagne.layers.InputLayer(shape=(None, 1, nRow, nCol),
                                     input_var=input_var)

    # Apply 20% dropout to the input data:
    l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)

    # Add a fully-connected layer of 800 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
            l_in_drop, num_units=800,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())

    # We'll now add dropout of 50%:
    l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)

    # Another 800-unit layer:
    l_hid2 = lasagne.layers.DenseLayer(
            l_hid1_drop, num_units=800,
            nonlinearity=lasagne.nonlinearities.rectify)

    # 50% dropout again:
    l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)

    # Finally, we'll add the fully-connected output layer, of 10 softmax units:
    l_out = lasagne.layers.DenseLayer(
            l_hid2_drop, num_units=10,
            nonlinearity=lasagne.nonlinearities.softmax)

    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out


def build_custom_mlp(input_var=None, depth=2, width=800, drop_input=.2,
                     drop_hidden=.5, nRow=28, nCol=28):
    # By default, this creates the same network as `build_mlp`, but it can be
    # customized with respect to the number and size of hidden layers. This
    # mostly showcases how creating a network in Python code can be a lot more
    # flexible than a configuration file. Note that to make the code easier,
    # all the layers are just called `network` -- there is no need to give them
    # different names if all we return is the last one we created anyway; we
    # just used different names above for clarity.

    # Input layer and dropout (with shortcut `dropout` for `DropoutLayer`):
    network = lasagne.layers.InputLayer(shape=(None, 1, nRow, nCol),
                                        input_var=input_var)
    if drop_input:
        network = lasagne.layers.dropout(network, p=drop_input)
    # Hidden layers and dropout:
    nonlin = lasagne.nonlinearities.rectify
    for _ in range(depth):
        network = lasagne.layers.DenseLayer(
                network, width, nonlinearity=nonlin)
        if drop_hidden:
            network = lasagne.layers.dropout(network, p=drop_hidden)
    # Output layer:
    softmax = lasagne.nonlinearities.softmax
    network = lasagne.layers.DenseLayer(network, 10, nonlinearity=softmax)
    return network
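
# Illustrative example (not in the original script): the command-line form
# 'custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID' handled in main() corresponds to
# a direct call such as the one below (the 3/512 values are hypothetical):
#
#     network = build_custom_mlp(input_var, depth=3, width=512,
#                                drop_input=0.2, drop_hidden=0.5)
#
# which is what main() builds when invoked with 'custom_mlp:3,512,.2,.5'.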


def build_cnn(input_var=None, nRow=28, nCol=28):
    # As a third model, we'll create a CNN of two convolution + pooling stages
    # and a fully-connected hidden layer in front of the output layer.

    # Input layer, as usual:
    network = lasagne.layers.InputLayer(shape=(None, 1, nRow, nCol),
                                        input_var=input_var)
    # This time we do not apply input dropout, as it tends to work less well
    # for convolutional layers.

    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
    # convolutions are supported as well; see the docstring.
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=32, filter_size=(5, 5),
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())
    # Expert note: Lasagne provides alternative convolutional layers that
    # override Theano's choice of which implementation to use; for details
    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.

    # Max-pooling layer of factor 2 in both dimensions:
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=32, filter_size=(5, 5),
            nonlinearity=lasagne.nonlinearities.rectify)
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=256,
            nonlinearity=lasagne.nonlinearities.rectify)

    # And, finally, the 10-unit output layer with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=10,
            nonlinearity=lasagne.nonlinearities.softmax)

    return network

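# Shape check (illustrative, not from the original script), assuming the
# default 28x28 input: a 5x5 unpadded convolution gives 24x24 feature maps,
# 2x2 max-pooling gives 12x12, the second 5x5 convolution gives 8x8, and the
# second 2x2 pooling gives 4x4. The 256-unit dense layer therefore sees
# 32 * 4 * 4 = 512 features per example.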

# ############################# Batch iterator ###############################
# This is just a simple helper function iterating over training data in
# mini-batches of a particular size, optionally in random order. It assumes
# data is available as numpy arrays. For big datasets, you could load numpy
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
# own custom data iteration function. For small datasets, you can also copy
# them to GPU at once for slightly improved performance. This would involve
# several changes in the main program, though, and is not demonstrated here.
# Notice that this function returns only mini-batches of size `batchsize`.
# If the size of the data is not a multiple of `batchsize`, it will not
# return the last (remaining) mini-batch.

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    # Truncate to a whole number of batches; any trailing samples that do not
    # fill a complete batch are dropped.
    ln = len(inputs) - len(inputs) % batchsize
    if shuffle:
        indices = np.arange(ln)
        np.random.shuffle(indices)

    for start_idx in range(0, ln, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

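# Illustrative usage (not part of the original script): with, say, 1050
# training examples and batchsize=500, the generator yields two batches of
# 500 samples and silently drops the remaining 50. The training loop in
# main() consumes it as
#
#     for inputs, targets in iterate_minibatches(X_train, y_train, 500,
#                                                shuffle=True):
#         ...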

# ############################## Main program ################################
# Everything else will be handled in our main program now. We could pull out
# more functions to better separate the code, but it wouldn't make it any
# easier to read.

def main(model='mlp', num_epochs=50):

    print("Loading data...")

    # Select handprinted characters whose 'm_label' is an ASCII code in
    # 48..57, i.e. the digits '0' through '9':
    input_filters_dict = {'font': ('HANDPRINT',), 'm_label': range(48, 58)}
    output_feature_list = ['m_label', 'image']
    ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
                             output_feature_list=output_feature_list,
                             engine_type='theano',
                             test_size=0.1,
                             evaluation_size=0.1,
                             dtype='float32')
    nRows = ds.train.num_rows
    nCols = ds.train.num_columns
    X_train = ds.train.features[1]
    X_val = ds.evaluation.features[1]
    X_test = ds.test.features[1]

    # The labels are ASCII codes; subtracting 48 maps '0'..'9' to the class
    # indices 0..9 expected by the 10-unit softmax output:
    y_train = np.array(ds.train.features[0] - 48, dtype=np.int32)
    y_test = np.array(ds.test.features[0] - 48, dtype=np.int32)
    y_val = np.array(ds.evaluation.features[0] - 48, dtype=np.int32)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var, nRows, nCols)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid),
                                   nRow=nRows, nCol=nCols)
    elif model == 'cnn':
        network = build_cnn(input_var, nRows, nCols)
    else:
        print("Unrecognized model type %r." % model, flush=True)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.
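    # For example (illustrative only, not enabled here), an L2 penalty with a
    # hypothetical coefficient of 1e-4 could be added via:
    #     l2_penalty = lasagne.regularization.regularize_network_params(
    #             network, lasagne.regularization.l2)
    #     loss = loss + 1e-4 * l2_penalty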

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=0.01, momentum=0.9)
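    # Another optimizer from lasagne.updates could be swapped in instead,
    # e.g. (illustrative, not used here):
    #     updates = lasagne.updates.adam(loss, params, learning_rate=0.001)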

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype='float32')

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
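    # A plain class-prediction function could be compiled the same way
    # (illustrative sketch; `predict_fn` is not used elsewhere in this script):
    #     predict_fn = theano.function([input_var],
    #                                  T.argmax(test_prediction, axis=1))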

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time), flush=True)
        print(" training loss:\t\t{:.6f}".format(
            train_err / train_batches), flush=True)
        print(" validation loss:\t\t{:.6f}".format(
            val_err / val_batches), flush=True)
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:", flush=True)
    print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches), flush=True)
    print(" test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100), flush=True)

    # Optionally, you could now dump the network weights to a file like this:
    # np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    #
    # And load them again later on like this:
    # with np.load('model.npz') as f:
    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # lasagne.layers.set_all_param_values(network, param_values)


if __name__ == '__main__':
    if ('--help' in sys.argv) or ('-h' in sys.argv):
        print("Trains a neural network on handprinted digits using Lasagne.")
        print("Usage: %s [MODEL [EPOCHS]]" % sys.argv[0])
        print()
        print("MODEL: 'mlp' for a simple Multi-Layer Perceptron (MLP),")
        print("       'custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID' for an MLP")
        print("       with DEPTH hidden layers of WIDTH units, DROP_IN")
        print("       input dropout and DROP_HID hidden dropout,")
        print("       'cnn' for a simple Convolutional Neural Network (CNN).")
        print("EPOCHS: number of training epochs to perform (default: 50)")
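        # Example invocation (illustrative; the script name depends on how
        # this file is saved):
        #     python <this_script>.py custom_mlp:2,800,.2,.5 50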
    else:
        kwargs = {}
        if len(sys.argv) > 1:
            kwargs['model'] = sys.argv[1]
        if len(sys.argv) > 2:
            kwargs['num_epochs'] = int(sys.argv[2])
        main(**kwargs)

print('\n########################### No Errors ####################################')