This repository was archived by the owner on Jan 21, 2025. It is now read-only.

Commit 9e69fb9

nshazeer authored and copybara-github committed on Oct 3, 2019
Update mesh to be Tensorflow 2.0 compatible.
PiperOrigin-RevId: 272767933
1 parent 6979713 commit 9e69fb9

33 files changed (+175 −128 lines)
 

examples/mnist.py (+3 −1)

@@ -24,7 +24,9 @@
 
 import mesh_tensorflow as mtf
 import mnist_dataset as dataset  # local file import
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 tf.flags.DEFINE_string("data_dir", "/tmp/mnist_data",
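
The pattern in this hunk repeats across the commit: import the TF1 API surface from tensorflow.compat.v1 and disable v2 behavior at import time, so existing graph/session code keeps running on a TF 2.x install. A minimal standalone sketch of the shim (plain TensorFlow, not code from this repo):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# TF1-style graph construction and Session.run still work under TF 2.x
# once v2 behavior is disabled.
x = tf.placeholder(tf.float32, shape=[None])
y = 2.0 * x
with tf.Session() as sess:
  print(sess.run(y, feed_dict={x: [1.0, 2.0]}))  # [2. 4.]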

examples/mnist_dataset.py (+3 −1)

@@ -39,7 +39,9 @@
 
 import numpy as np
 from six.moves import urllib
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 def read32(bytestream):

examples/toy_model_tpu.py (+5 −3)

@@ -21,7 +21,7 @@
 
 import mesh_tensorflow as mtf
 import numpy
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
@@ -30,6 +30,8 @@
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow_estimator.python.estimator import estimator as estimator_lib
 
+tf.disable_v2_behavior()
+
 FLAGS = flags.FLAGS
 
 tf.flags.DEFINE_integer('batch_size', 64, 'Training batch size.')
@@ -89,7 +91,7 @@ def __call__(self, params):
    """Input function which provides a single batch for train or eval."""
    # Retrieves the batch size for the current shard. The # of shards is
    # computed according to the input pipeline deployment. See
-    # `tf.contrib.tpu.RunConfig` for details.
+    # `tf.estimator.tpu.RunConfig` for details.
    batch_size = params['batch_size']
    logging.info('call ToyModelInput() with batch size {}'.format(batch_size))
 
@@ -242,7 +244,7 @@ def metric_fn(tf_logits):
 
 def run_toy_model_tpu():
   """Run a toy model on TPU."""
-  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
       FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
 
   iterations_per_loop = FLAGS.iterations
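
Beyond the import shim, this file moves off two contrib endpoints that TF 2.0 drops: tf.contrib.cluster_resolver.TPUClusterResolver becomes tf.distribute.cluster_resolver.TPUClusterResolver, and the comment now points at tf.estimator.tpu.RunConfig. A hedged sketch of the renamed endpoints together (the TPU name, zone, project, and model_dir below are placeholders, not values from this commit, and running it requires a real TPU):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Same constructor arguments as the old contrib resolver, new namespace.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='my-tpu', zone='us-central1-b', project='my-gcp-project')

# The comment's new reference point: the estimator-based TPU RunConfig.
run_config = tf.estimator.tpu.RunConfig(
    cluster=resolver,
    model_dir='/tmp/toy_model',
    tpu_config=tf.estimator.tpu.TPUConfig(iterations_per_loop=100))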

mesh_tensorflow/auto_mtf/api.py (+3 −1)

@@ -39,7 +39,9 @@
 import mesh_tensorflow as mtf
 from mesh_tensorflow.auto_mtf import layout_optimizer
 from mesh_tensorflow.auto_mtf import memory_estimator
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 def layout(mtf_graph, mesh_shape, mtf_outputs=()):

mesh_tensorflow/auto_mtf/api_test.py (+3 −1)

@@ -22,7 +22,9 @@
 import mesh_tensorflow as mtf
 import mesh_tensorflow.auto_mtf  # pylint: disable=unused-import
 import mesh_tensorflow.auto_mtf.api
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class LayoutTest(tf.test.TestCase):

mesh_tensorflow/auto_mtf/graph_interface.py (+3 −1)

@@ -29,9 +29,11 @@
 import collections
 import math
 import mesh_tensorflow as mtf
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 from tensorflow.core.framework import cost_graph_pb2
 
+tf.disable_v2_behavior()
+
 
 class GraphInterface(object):
   """tf.Graph & mtf.Graph common representation which produces a CostGraphDef.

mesh_tensorflow/auto_mtf/graph_interface_test.py (+3 −1)

@@ -21,11 +21,13 @@
 
 import mesh_tensorflow as mtf
 from mesh_tensorflow.auto_mtf import graph_interface
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 from tensorflow.core.framework import cost_graph_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 
+tf.disable_v2_behavior()
+
 
 class GraphInterfaceTest(tf.test.TestCase):
 
mesh_tensorflow/auto_mtf/layout_optimizer_test.py (+3 −1)

@@ -23,7 +23,9 @@
 from mesh_tensorflow.auto_mtf import layout_optimizer
 from mesh_tensorflow.auto_mtf import memory_estimator
 import six
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class VariableNamesTest(tf.test.TestCase):

mesh_tensorflow/auto_mtf/memory_estimator_test.py (+3 −1)

@@ -21,7 +21,9 @@
 
 import mesh_tensorflow as mtf
 from mesh_tensorflow.auto_mtf import memory_estimator
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class MemoryEstimatorTest(tf.test.TestCase):

mesh_tensorflow/auto_mtf/scheduler_test.py (+3 −1)

@@ -23,7 +23,9 @@
 import mesh_tensorflow as mtf
 from mesh_tensorflow.auto_mtf import graph_interface
 from mesh_tensorflow.auto_mtf import scheduler
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class SchedulerTest(parameterized.TestCase, tf.test.TestCase):

mesh_tensorflow/auto_mtf/valid_layouts_test.py (+3 −1)

@@ -21,7 +21,9 @@
 
 import mesh_tensorflow as mtf
 from mesh_tensorflow.auto_mtf import valid_layouts
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class LayoutValidatorTest(tf.test.TestCase):

mesh_tensorflow/beam_search.py (+3 −1)

@@ -21,7 +21,9 @@
 
 import gin
 from mesh_tensorflow import ops_with_redefined_builtins as mtf
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 # Assuming EOS_ID is 1
 EOS_ID = 1

mesh_tensorflow/import_test.py (+3 −1)

@@ -20,7 +20,9 @@
 from __future__ import print_function
 
 import mesh_tensorflow as mtf  # pylint: disable=unused-import
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 class ImportTest(tf.test.TestCase):

mesh_tensorflow/layers.py (+3 −1)

@@ -21,7 +21,9 @@
 
 from mesh_tensorflow import ops_with_redefined_builtins as mtf
 
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
 
 
 def dense(x, output_dim, reduced_dims=None, expert_dims=None,

mesh_tensorflow/layers_test.py (+9 −5)

@@ -25,8 +25,12 @@
 import numpy as np
 
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import test_utils
 
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+
+tf.disable_v2_behavior()
+tf.enable_eager_execution()
 
 
 class LayersTest(parameterized.TestCase, tf.test.TestCase):
@@ -69,7 +73,7 @@ def testDense(self, units, use_bias):
 
    self.assertEqual(actual.shape, expected.shape)
 
-  @tf.contrib.eager.run_test_in_graph_and_eager_modes()
+  @test_utils.run_in_graph_and_eager_modes()
  def testLayerNorm(self):
    batch = 2
    channels = 3
@@ -98,7 +102,7 @@ def testLayerNorm(self):
 
    self.assertEqual(actual.shape, expected.shape)
 
-  @tf.contrib.eager.run_test_in_graph_and_eager_modes()
+  @test_utils.run_in_graph_and_eager_modes()
  def testBatchNorm(self):
    batch = 2
    channels = 3
@@ -138,7 +142,7 @@ def testBatchNorm(self):
    self.assertAllClose(actual_0, expected)
    self.assertAllClose(actual_1, expected)
 
-  @tf.contrib.eager.run_test_in_graph_and_eager_modes()
+  @test_utils.run_in_graph_and_eager_modes()
  def testWeightsNonzero(self):
    inputs = tf.constant([[3, 1, 0], [1, 0, 0]])
 
@@ -162,7 +166,7 @@ def testWeightsNonzero(self):
 
    self.assertAllEqual(actual, expected)
 
-  @tf.contrib.eager.run_test_in_graph_and_eager_modes()
+  @test_utils.run_in_graph_and_eager_modes()
  def testDenseReluDense(self):
    batch = 2
    channels = 3
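
The decorator swap is the substantive change here: tf.contrib.eager.run_test_in_graph_and_eager_modes disappears along with tf.contrib, so the tests switch to the equivalent helper in tensor2tensor.utils.test_utils. A minimal sketch of the replacement in use (assumes tensor2tensor is installed; the test case itself is illustrative, not from this repo):

from tensor2tensor.utils import test_utils
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()
tf.enable_eager_execution()


class ExampleTest(tf.test.TestCase):

  @test_utils.run_in_graph_and_eager_modes()
  def test_add(self):
    # The decorated body runs twice: once in graph mode, once eagerly.
    result = self.evaluate(tf.constant(1) + tf.constant(2))
    self.assertEqual(result, 3)


if __name__ == "__main__":
  tf.test.main()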

mesh_tensorflow/ops.py (+17 −48)

@@ -29,12 +29,13 @@
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 # pylint: disable=g-direct-tensorflow-import
 from tensorflow.python.ops.gen_nn_ops import conv3d_backprop_input_v2
 from tensorflow.python.ops.nn_ops import conv3d_backprop_filter_v2
 
+tf.disable_v2_behavior()
 
 Dimension = collections.namedtuple("Dimension", ["name", "size"])
 
@@ -1218,6 +1219,8 @@ def einsum(self, equation, *slices):
    Args:
      equation: a string
      *slices: a list of tf.Tensor
+    Returns:
+      a Tensor
    """
    return tf.einsum(equation, *slices)
 
@@ -4973,6 +4976,8 @@ def gather(weights, indices, dim, output_shape=None):
  dim = convert_to_dimension(dim)
  output_shape = convert_to_shape(output_shape)
  if not isinstance(indices, Tensor):
+    # TODO(noam): when `indices` is an integer, gather can be implemented
+    # more directly with mtf_slice() and reshape()
    indices = constant(weights.mesh, indices, dtype=tf.int32)
  if weights.dtype == tf.bool:
    return cast(gather(to_float(weights), indices, dim, output_shape), tf.bool)
@@ -6087,60 +6092,24 @@ def body_fn(microbatch_num):
  return combined_grads, combined_outputs
 
 
-class NthSmallestElementOperation(Operation):
-  """Reduce out last dimension - output is nth-smallest (or largest) element.
-
-  TODO(noam): make n a tensor instead of an integer
-  """
-
-  def __init__(self, x, n, reverse, name=None):
-    super(NthSmallestElementOperation, self).__init__(
-        [x], name=name or "nth_element")
-    reduced_dim = x.shape.dims[-1]
-    output_shape = x.shape - reduced_dim
-    self._outputs = [Tensor(self, output_shape, x.dtype)]
-    self._n = n
-    self._initialize_splittable_and_unsplittable_dims(
-        "splittable", [reduced_dim])
-    self._reverse = reverse
-
-  def gradient(self, grad_ys):
-    raise NotImplementedError("TODO(noam): implement gradient")
-
-  def lower(self, lowering):
-    mesh_impl = lowering.mesh_impl(self)
-    def slicewise_fn(x):
-      return tf.contrib.nn.nth_element(x, self._n, reverse=self._reverse)
-    y = mesh_impl.slicewise(slicewise_fn, lowering.tensors[self.inputs[0]])
-    lowering.set_tensor_lowering(self.outputs[0], y)
-
-
-def nth_smallest_element(x, n, reduced_dim=None, reverse=False, name=None):
-  """Nth-smallest (or largest) reduction on specified axis.
+def nth_largest_element(x, n, reduced_dim, name=None):
+  """Nth-largest reduction on specified axis.
 
   Note that n is zero-indexed.
 
-  In the case that reduced_dim is split, we do something inefficient:
-  shift data around so that it is replicated and do the computation
-  everywhere.
-
   Args:
     x: a Tensor
     n: an integer
-    reduced_dim: an optional Dimension - defaults to the last dimension of n
-    reverse: a boolean
+    reduced_dim: a Dimension
    name: an optional string
   Returns:
     a Tensor
   """
-  if reduced_dim is None:
-    reduced_dim = x.shape.dims[-1]
-  # remove the reduced dimension from the shape and insert it at the end
-  x = transpose(x, x.shape - reduced_dim + reduced_dim)
-  # Since the NthSmallestElementOperation does not know how to reduce over a
-  # split dimension, we rename the reduced dimension so that we ensure that it
-  # is not split. This may cause the tensor to get all-concatenated, causing
-  # redundant computation.
-  unsplit_dim = Dimension("_unsplit", reduced_dim.size)
-  x = replace_dimensions(x, reduced_dim, unsplit_dim)
-  return NthSmallestElementOperation(x, n, reverse, name).outputs[0]
+  # Compute the top k=n+1 values, then take the last one.
+  k_dim = Dimension("_top_k_", n + 1)
+  values, _ = top_k(x, reduced_dim=reduced_dim, k_dim=k_dim, name=name)
+  return gather(values, n, k_dim)
+
+
+def nth_smallest_element(x, n, reduced_dim, name=None):
+  return -nth_largest_element(-x, n, reduced_dim, name=name)
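
The rewrite deletes NthSmallestElementOperation, whose lowering depended on the removed tf.contrib.nn.nth_element, and re-expresses the reduction with existing mtf ops: the nth-largest value (zero-indexed) is the last of the top n + 1 values, and nth-smallest is nth-largest of the negation. A NumPy sketch of the identity (illustrative only, not Mesh TensorFlow code):

import numpy as np

def nth_largest(x, n):
  # Sort descending and take index n, the same "top k = n + 1 values,
  # then gather the last one" shape as the new mtf implementation.
  return np.sort(x)[::-1][n]

x = np.array([5.0, 1.0, 9.0, 7.0])
assert nth_largest(x, 0) == 9.0    # largest
assert nth_largest(x, 2) == 5.0    # third-largest
assert -nth_largest(-x, 1) == 5.0  # second-smallest, via negation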
