@@ -237,9 +237,6 @@ def multi_scale_logits(images,
  # Setup default values.
  if not image_pyramid:
    image_pyramid = [1.0]
-  if model_options.crop_size is None and model_options.add_image_level_feature:
-    raise ValueError(
-        'Crop size must be specified for using image-level feature.')
  crop_height = (
      model_options.crop_size[0]
      if model_options.crop_size else tf.shape(images)[1])
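Note: with the ValueError gone, crop_size may legitimately be None (e.g. whole-image inference), so every consumer below falls back to dynamic shapes. A minimal sketch of that fallback pattern, using a hypothetical helper name (_spatial_dims is not part of model.py):

    import tensorflow as tf  # TF 1.x, as in this codebase

    def _spatial_dims(images, crop_size):
      # Static Python ints when crop_size is known; dynamic scalar
      # Tensors from tf.shape otherwise. Callers must handle both.
      if crop_size:
        return crop_size[0], crop_size[1]
      shape = tf.shape(images)
      return shape[1], shape[2]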
@@ -378,18 +375,39 @@ def extract_features(images,
        branch_logits = []

        if model_options.add_image_level_feature:
-          pool_height = scale_dimension(model_options.crop_size[0],
-                                        1. / model_options.output_stride)
-          pool_width = scale_dimension(model_options.crop_size[1],
-                                       1. / model_options.output_stride)
-          image_feature = slim.avg_pool2d(
-              features, [pool_height, pool_width], [pool_height, pool_width],
-              padding='VALID')
+          if model_options.crop_size is not None:
+            image_pooling_crop_size = model_options.image_pooling_crop_size
+            # If image_pooling_crop_size is not specified, use crop_size.
+            if image_pooling_crop_size is None:
+              image_pooling_crop_size = model_options.crop_size
+            pool_height = scale_dimension(image_pooling_crop_size[0],
+                                          1. / model_options.output_stride)
+            pool_width = scale_dimension(image_pooling_crop_size[1],
+                                         1. / model_options.output_stride)
+            image_feature = slim.avg_pool2d(
+                features, [pool_height, pool_width], [1, 1], padding='VALID')
+            resize_height = scale_dimension(model_options.crop_size[0],
+                                            1. / model_options.output_stride)
+            resize_width = scale_dimension(model_options.crop_size[1],
+                                           1. / model_options.output_stride)
+          else:
+            # If crop_size is None, we simply do global pooling.
+            pool_height = tf.shape(features)[1]
+            pool_width = tf.shape(features)[2]
+            image_feature = tf.reduce_mean(features, axis=[1, 2])[:, tf.newaxis,
+                                                                  tf.newaxis]
+            resize_height = pool_height
+            resize_width = pool_width
          image_feature = slim.conv2d(
              image_feature, depth, 1, scope=IMAGE_POOLING_SCOPE)
          image_feature = tf.image.resize_bilinear(
-              image_feature, [pool_height, pool_width], align_corners=True)
-          image_feature.set_shape([None, pool_height, pool_width, depth])
+              image_feature, [resize_height, resize_width], align_corners=True)
+          # set_shape needs static dims; use None when they are Tensors.
+          if isinstance(resize_height, tf.Tensor):
+            resize_height = None
+          if isinstance(resize_width, tf.Tensor):
+            resize_width = None
+          image_feature.set_shape([None, resize_height, resize_width, depth])
          branch_logits.append(image_feature)

        # Employ a 1x1 convolution.
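Note: the pooling stride changes from [pool_height, pool_width] to [1, 1], so when image_pooling_crop_size is smaller than crop_size the pooled map keeps spatial extent instead of collapsing to 1x1, and is then resized back to the crop-derived grid. Assuming scale_dimension follows the (dim - 1) * scale + 1 convention used elsewhere in DeepLab, the arithmetic works out as in this sketch:

    def scale_dimension_sketch(dim, scale):
      # Static-dimension path of DeepLab's scale_dimension (assumption;
      # the real helper also accepts scalar Tensors).
      return int((float(dim) - 1.0) * scale + 1.0)

    # crop_size 513 with output_stride 16 gives a 33x33 feature map.
    assert scale_dimension_sketch(513, 1.0 / 16) == 33
    # image_pooling_crop_size 257 gives a 17x17 pooling kernel; with
    # stride [1, 1] and VALID padding the pooled map is 33 - 17 + 1 = 17.
    assert scale_dimension_sketch(257, 1.0 / 16) == 17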
@@ -453,9 +471,14 @@ def _get_logits(images,
      fine_tune_batch_norm=fine_tune_batch_norm)

  if model_options.decoder_output_stride is not None:
-    decoder_height = scale_dimension(model_options.crop_size[0],
+    if model_options.crop_size is None:
+      height = tf.shape(images)[1]
+      width = tf.shape(images)[2]
+    else:
+      height, width = model_options.crop_size
+    decoder_height = scale_dimension(height,
                                     1.0 / model_options.decoder_output_stride)
-    decoder_width = scale_dimension(model_options.crop_size[1],
+    decoder_width = scale_dimension(width,
                                    1.0 / model_options.decoder_output_stride)
    features = refine_by_decoder(
        features,
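Note: when crop_size is None, height and width here are scalar Tensors, so scale_dimension must accept Tensors as well as ints, and the decoder's bilinear resize must take a dynamic target size. A small standalone illustration of the latter (not from model.py):

    import tensorflow as tf  # TF 1.x

    images = tf.placeholder(tf.float32, [None, None, None, 3])
    # Dynamic target size: unknown until the graph runs.
    h = tf.shape(images)[1] // 4
    w = tf.shape(images)[2] // 4
    resized = tf.image.resize_bilinear(images, [h, w], align_corners=True)
    # resized's static shape stays [None, None, None, 3]; only the
    # run-time shape is concrete.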
@@ -557,8 +580,11 @@ def refine_by_decoder(features,
          for j, feature in enumerate(decoder_features_list):
            decoder_features_list[j] = tf.image.resize_bilinear(
                feature, [decoder_height, decoder_width], align_corners=True)
-            decoder_features_list[j].set_shape(
-                [None, decoder_height, decoder_width, None])
+            h = (None if isinstance(decoder_height, tf.Tensor)
+                 else decoder_height)
+            w = (None if isinstance(decoder_width, tf.Tensor)
+                 else decoder_width)
+            decoder_features_list[j].set_shape([None, h, w, None])
          decoder_depth = 256
          if decoder_use_separable_conv:
            decoder_features = split_separable_conv2d(
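Note: the set_shape guards in the last two hunks follow one pattern: record static dims when they are Python ints, degrade to None when they are Tensors. A sketch of that pattern as a standalone helper (set_spatial_shape is hypothetical, not in model.py):

    import tensorflow as tf

    def set_spatial_shape(feature, height, width):
      # set_shape only records graph-construction-time information:
      # concrete ints pass through, dynamic Tensors become None.
      h = None if isinstance(height, tf.Tensor) else height
      w = None if isinstance(width, tf.Tensor) else width
      feature.set_shape([None, h, w, None])
      return feature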