|
| 1 | +import cv2 |
| 2 | +import numpy as np |
| 3 | + |
| 4 | +# print(output.keys(), output.shape) |
def handle_pose(output, input_shape):
    '''
    Handles the output of the Pose Estimation model.
    Returns ONLY the keypoint heatmaps, and not the Part Affinity Fields.
    '''
    # The model emits two blobs: "Mconv7_stage2_L1" (Part Affinity Fields)
    # and "Mconv7_stage2_L2" (keypoint heatmaps). Keep only the heatmaps,
    # shaped (1, 19, h', w') -- one low-resolution map per keypoint.
    heatmaps = output['Mconv7_stage2_L2']
    num_maps = heatmaps.shape[1]
    target_h, target_w = input_shape[0], input_shape[1]
    # Allocate one full-resolution slot per keypoint map.
    resized_maps = np.zeros([num_maps, target_h, target_w])
    # cv2.resize takes its destination size as (width, height), so the
    # H x W order from input_shape is swapped when resizing each map.
    for map_idx, single_map in enumerate(heatmaps[0]):
        resized_maps[map_idx] = cv2.resize(single_map, (target_w, target_h))
    return resized_maps
| 26 | + |
| 27 | + |
def handle_text(output, input_shape):
    '''
    Handles the output of the Text Detection model.
    Returns ONLY the text/no text classification of each pixel,
    and not the linkage between pixels and their neighbors.
    '''
    # First blob [1x2x192x320]: per-pixel text/no-text logits. The
    # pixel-linkage blob is intentionally ignored.
    logits = output['model/segm_logits/add']
    num_classes = logits.shape[1]
    height, width = input_shape[0], input_shape[1]
    # Upscale each class map back to the input resolution. cv2.resize
    # expects (width, height), hence the swapped order below.
    upscaled = np.empty([num_classes, height, width])
    for class_idx, class_map in enumerate(logits[0]):
        upscaled[class_idx] = cv2.resize(class_map, (width, height))
    return upscaled
| 44 | + |
| 45 | + |
def handle_car(output, input_shape):
    '''
    Handles the output of the Car Metadata model.
    Returns two integers: the argmax of each softmax output.
    The first is for color, and the second for type.
    '''
    # "color": shape [1, 7, 1, 1] -- softmax across seven color classes
    # [white, gray, yellow, red, green, blue, black].
    color_scores = output['color'].flatten()
    # "type": shape [1, 4, 1, 1] -- softmax across four type classes
    # [car, bus, truck, van].
    type_scores = output['type'].flatten()
    # input_shape ([1xCxHxW] of the 1x3x72x72 input) is not needed here;
    # it is kept for signature parity with the other handlers.
    return np.argmax(color_scores), np.argmax(type_scores)
| 63 | + |
| 64 | + |
def handle_output(model_type):
    '''
    Returns the related function to handle an output,
    based on the model_type being used.
    '''
    # Guard-clause chain instead of if/elif: each known model type
    # returns its handler immediately; anything else yields None.
    if model_type == "POSE":
        return handle_pose
    if model_type == "TEXT":
        return handle_text
    if model_type == "CAR_META":
        return handle_car
    return None
| 78 | + |
| 79 | + |
| 80 | +''' |
| 81 | +The below function is carried over from the previous exercise. |
| 82 | +You just need to call it appropriately in `app.py` to preprocess |
| 83 | +the input image. |
| 84 | +''' |
def preprocessing(input_image, height, width):
    '''
    Given an input image, height and width:
    - Resize to width and height
    - Transpose the final "channel" dimension to be first
    - Reshape the image to add a "batch" of 1 at the start

    Returns the image as a [1xCxHxW] array ready to feed the network.
    Assumes input_image is H x W x C -- TODO confirm against callers.
    '''
    image = np.copy(input_image)
    # cv2.resize takes (width, height), not (height, width).
    image = cv2.resize(image, (width, height))
    # HWC -> CHW for the network's expected layout.
    image = image.transpose((2, 0, 1))
    # Fix: use the actual channel count rather than hard-coding 3, so
    # RGBA or other channel depths are handled; identical for RGB input.
    image = image.reshape(1, image.shape[0], height, width)
    return image
0 commit comments