tensorflow
diff --git a/Sources/x10/xla_client/mesh_service.cc b/Sources/x10/xla_client/mesh_service.cc
+55-30
diff --git a/Sources/x10/xla_client/mesh_service.h b/Sources/x10/xla_client/mesh_service.h
+2-1
diff --git a/Sources/x10/xla_client/mesh_service.proto b/Sources/x10/xla_client/mesh_service.proto
+2-1
diff --git a/Sources/x10/xla_tensor/aten_compat.h b/Sources/x10/xla_tensor/aten_compat.h
+1
diff --git a/Sources/x10/xla_tensor/cross_replica_reduces.cpp b/Sources/x10/xla_tensor/cross_replica_reduces.cpp
+10-22
@@ -26,7 +26,9 @@
 #include <atomic>
 #include <chrono>
 #include <iostream>
+#include <map>
 #include <mutex>
+#include <set>
 #include <unordered_map>
 
 #include "absl/strings/str_cat.h"
@@ -91,21 +93,14 @@ class MeshServiceImpl : public grpc::MeshService::Service {
  private:
   class RendezvousData {
    public:
-    explicit RendezvousData(size_t count)
-        : mwait_(count), release_count_(0), payloads_(count) {}
+    explicit RendezvousData(size_t count, const std::set<int64>& replicas)  // count: number of participants; replicas: expected ordinals (may be empty)
+        : count_(count),  // upper bound used to validate ordinals when replicas_ is empty
+          replicas_(replicas),  // copied; Complete() rejects callers whose replica set differs from this one
+          mwait_(count),  // NOTE(review): presumably released after `count` Done() calls - confirm in util::MultiWait
+          release_count_(0) {}  // payloads_ is no longer pre-sized: it is now a map filled by Complete()
 
     bool Release() { return release_count_.fetch_add(1) == 0; }
 
-    void SetPayload(size_t ordinal, std::string payload) {
-      std::lock_guard<std::mutex> lock(lock_);
-      if (ordinal >= payloads_.size()) {
-        status_ = ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT,
-                                 absl::StrCat("Invalid ordinal: ", ordinal));
-      } else {
-        payloads_[ordinal] = std::move(payload);
-      }
-    }
-
     ::grpc::Status Wait() {
       ::grpc::Status status =
           ToGrpcStatus(xla::util::CheckedCall([&]() { mwait_.Wait(); }));
@@ -116,25 +111,50 @@ class MeshServiceImpl : public grpc::MeshService::Service {
       return status;
     }
 
-    void Done() { mwait_.Done(); }
+    void Complete(int64 ordinal, std::string payload,  // Validates and records one participant's payload, then signals mwait_.
+                  const std::set<int64>& replicas) {  // replicas: the set this caller believes is taking part
+      std::lock_guard<std::mutex> lock(lock_);  // held while status_ and payloads_ are updated below
+      if ((!replicas_.empty() && replicas_.count(ordinal) == 0) ||  // explicit replica set: ordinal must be a member
+          (replicas_.empty() && ordinal >= count_)) {  // no replica set: ordinal must be below count_
+        status_ = ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT,
+                                 absl::StrCat("Invalid ordinal: ", ordinal));
+      } else if (replicas != replicas_) {  // caller's set must equal the one stored at construction
+        status_ = ::grpc::Status(
+            ::grpc::StatusCode::INVALID_ARGUMENT,
+            absl::StrCat("Mismatching replicas: (",
+                         absl::StrJoin(replicas_, ", "), ") vs. (",
+                         absl::StrJoin(replicas, ", "), ")"));
+      } else {
+        auto insert_result = payloads_.emplace(ordinal, std::move(payload));  // keyed by ordinal; an existing entry is left untouched
+        if (!insert_result.second) {  // .second is false when this ordinal was already recorded
+          status_ =
+              ::grpc::Status(::grpc::StatusCode::INVALID_ARGUMENT,
+                             absl::StrCat("Duplicate ordinal: ", ordinal));
+        }
+      }
+      mwait_.Done();  // reached on every path, including the error ones, so a bad request still counts as an arrival
+    }
 
-    const std::vector<std::string>& Payloads() const { return payloads_; };
+    const std::map<int64, std::string>& Payloads() const { return payloads_; };  // std::map: iteration visits payloads in ascending ordinal order
 
    private:
+    size_t count_;
+    std::set<int64> replicas_;
     std::mutex lock_;
     util::MultiWait mwait_;
     std::atomic<size_t> release_count_;
-    std::vector<std::string> payloads_;
+    std::map<int64, std::string> payloads_;
     ::grpc::Status status_;
   };
 
-  std::shared_ptr<RendezvousData> GetRendezvous(const std::string& tag) {
+  std::shared_ptr<RendezvousData> GetRendezvous(
+      const std::string& tag, const std::set<int64>& replicas) {
     std::lock_guard<std::mutex> lock(lock_);
     auto it = rendezvous_map_.find(tag);
     if (it == rendezvous_map_.end()) {
+      size_t count = replicas.empty() ? config_.mesh_size() : replicas.size();
       it = rendezvous_map_
-               .emplace(tag,
-                        std::make_shared<RendezvousData>(config_.mesh_size()))
+               .emplace(tag, std::make_shared<RendezvousData>(count, replicas))
                .first;
     }
     return it->second;
@@ -165,18 +185,19 @@ ::grpc::Status MeshServiceImpl::GetConfig(::grpc::ServerContext* context,
 ::grpc::Status MeshServiceImpl::Rendezvous(
     ::grpc::ServerContext* context, const grpc::RendezvousRequest* request,
     grpc::RendezvousResponse* response) {
-  auto rendezvous = GetRendezvous(request->tag());
-  rendezvous->SetPayload(request->ordinal(), request->payload());
-  rendezvous->Done();
+  std::set<int64> replicas(request->replicas().begin(),
+                           request->replicas().end());
+  auto rendezvous = GetRendezvous(request->tag(), replicas);
+  rendezvous->Complete(request->ordinal(), request->payload(), replicas);
   TF_VLOG(3) << "Entering rendezvous: ordinal=" << request->ordinal()
-             << " tag=" << request->tag() << " peer=" << context->peer();
+             << ", tag=" << request->tag() << ", peer=" << context->peer();
   ::grpc::Status status = rendezvous->Wait();
   TF_VLOG(3) << "Exiting rendezvous: ordinal=" << request->ordinal()
-             << " tag=" << request->tag() << " peer=" << context->peer()
-             << " status=" << status;
+             << ", tag=" << request->tag() << ", peer=" << context->peer()
+             << ", status=" << status;
   if (status.ok()) {
-    for (auto& payload : rendezvous->Payloads()) {
-      response->add_payloads(payload);
+    for (auto& ordinal_payload : rendezvous->Payloads()) {
+      response->add_payloads(ordinal_payload.second);
     }
   }
   ReleaseRendezvous(request->tag(), rendezvous);
@@ -267,13 +288,17 @@ grpc::Config MeshClient::GetConfig() const {
 }
 
 std::vector<std::string> MeshClient::Rendezvous(
-    int ordinal, const std::string& tag, const std::string& payload) const {
+    int ordinal, const std::string& tag, const std::string& payload,
+    absl::Span<const int64> replicas) const {
   ::grpc::ClientContext context;
   grpc::RendezvousRequest request;
   grpc::RendezvousResponse response;
   request.set_tag(tag);
   request.set_payload(payload);
   request.set_ordinal(ordinal);
+  for (auto& replica : replicas) {
+    request.add_replicas(replica);
+  }
   TF_VLOG(3) << "Waiting for rendezvous: ordinal=" << ordinal << " tag=" << tag;
   ::grpc::Status status = impl_->stub->Rendezvous(&context, request, &response);
   TF_VLOG(3) << "Rendezvous wait complete: " << tag;
@@ -290,16 +315,16 @@ std::vector<std::string> MeshClient::Rendezvous(
 std::string MeshClient::GetNcclUniqueUid(
     absl::Span<const int64> replicas) const {
   ::grpc::ClientContext context;
-  grpc::GetNcclUniqueUidRequest reqeust;
+  grpc::GetNcclUniqueUidRequest request;
   grpc::GetNcclUniqueUidResponse response;
   for (auto& replica : replicas) {
-    reqeust.add_replicas(replica);
+    request.add_replicas(replica);
   }
 
   TF_VLOG(3) << "Waiting for NCCL UID: replicas=("
              << absl::StrJoin(replicas, ", ") << ")";
   ::grpc::Status status =
-      impl_->stub->GetNcclUniqueUid(&context, reqeust, &response);
+      impl_->stub->GetNcclUniqueUid(&context, request, &response);
   TF_VLOG(3) << "NCCL UID wait complete: " << absl::StrJoin(replicas, ", ")
              << ")";
   if (!status.ok()) {
 
@@ -51,7 +51,8 @@ class MeshClient {
   grpc::Config GetConfig() const;
 
   std::vector<std::string> Rendezvous(int ordinal, const std::string& tag,
-                                      const std::string& payload) const;
+                                      const std::string& payload,
+                                      absl::Span<const int64> replicas) const;
 
   std::string GetNcclUniqueUid(absl::Span<const int64> replicas) const;
 
 
@@ -46,14 +46,15 @@ message RendezvousRequest {
   required string tag = 1;
   required bytes payload = 2;
   required uint32 ordinal = 3;
+  repeated uint32 replicas = 4;
 }
 
 message RendezvousResponse {
   repeated bytes payloads = 1;
 }
 
 message GetNcclUniqueUidRequest {
-  repeated int64 replicas = 1;
+  repeated uint32 replicas = 1;  // NOTE(review): MeshClient::GetNcclUniqueUid passes int64 replica ids to add_replicas(); ids outside the uint32 range would be narrowed - confirm they are always small and non-negative
 }
 
 message GetNcclUniqueUidResponse {
 
@@ -774,6 +774,7 @@
   _(xla, generic_slice)            \
   _(xla, get_dimensions_size)      \
   _(xla, moving_average)           \
+  _(xla, nms)                      \
   _(xla, not_supported)            \
   _(xla, replication_pad)          \
   _(xla, replication_pad_backward) \
 
@@ -22,6 +22,7 @@
 #include "tensorflow/compiler/tf2xla/xla_tensor/convert_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_tensor/helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_tensor/layout_manager.h"
+#include "tensorflow/compiler/tf2xla/xla_tensor/token_handler.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 
 namespace swift_xla {
@@ -94,14 +95,6 @@ std::vector<xla::ReplicaGroup> CreateReduceGroups(
   return reduce_groups;
 }
 
-xla::XlaOp SliceOneToken(xla::XlaOp input) {
-  const xla::Shape& input_shape = XlaHelpers::ShapeOfXlaOp(input);
-  if (input_shape.rank() == 0) {
-    return input;
-  }
-  return xla::SliceInDim(input, 0, 1, 1, 0);
-}
-
 }  // namespace
 
 std::vector<xla::XlaOp> BuildAllReduce(
@@ -151,32 +144,27 @@ AllToAllResult BuildAllToAll(
     const std::vector<std::vector<xla::int64>>& groups) {
   std::vector<xla::ReplicaGroup> reduce_groups = CreateReduceGroups(groups);
   const xla::Shape& input_shape = XlaHelpers::ShapeOfXlaOp(input);
-  xla::XlaOp affine_token = MaybeConvertTo(token, input_shape.element_type());
   // TODO: This is missing layout pinning ATM. If XLA scheduling is not exactly
   // the same (graphs on cores differ), XLA could assign different layouts and
   // things will break.
-  xla::XlaOp reduce_result =
-      xla::AllToAll(input + affine_token, split_dimension, concat_dimension,
-                    split_count, reduce_groups);
-  xla::XlaOp chained_token =
-      MaybeConvertTo(affine_token * SliceOneToken(reduce_result),
-                     XlaHelpers::TypeOfXlaOp(token));
-  return {reduce_result, chained_token};
+  TokenHandler token_handler(token);
+  xla::XlaOp reduce_result = xla::AllToAll(
+      token_handler.GetInput(input, &input_shape), split_dimension,
+      concat_dimension, split_count, reduce_groups);
+  return {reduce_result, token_handler.GetNewToken(reduce_result)};
 }
 
 CollectivePermuteResult BuildCollectivePermute(
     xla::XlaOp input, xla::XlaOp token,
     const std::vector<std::pair<xla::int64, xla::int64>>& source_target_pairs) {
   const xla::Shape& input_shape = XlaHelpers::ShapeOfXlaOp(input);
-  xla::XlaOp affine_token = MaybeConvertTo(token, input_shape.element_type());
+  TokenHandler token_handler(token);  // takes over the affine_token arithmetic that this hunk removes
   // TODO: This is missing layout pinning ATM. If XLA scheduling is not exactly
   // the same (graphs on cores differ), XLA could assign different layouts and
   // things will break.
-  xla::XlaOp result =
-      xla::CollectivePermute(input + affine_token, source_target_pairs);
-  xla::XlaOp chained_token = MaybeConvertTo(
-      affine_token * SliceOneToken(result), XlaHelpers::TypeOfXlaOp(token));
-  return {result, chained_token};
+  xla::XlaOp result = xla::CollectivePermute(
+      token_handler.GetInput(input, &input_shape), source_target_pairs);  // NOTE(review): presumably ties the token into the operand as `input + affine_token` did - confirm in token_handler.h
+  return {result, token_handler.GetNewToken(result)};  // NOTE(review): presumably derives the chained token from result like the removed SliceOneToken path - confirm
 }
 
 }  // namespace swift_xla
Original file line number	Diff line number	Diff line change
`@@ -46,14 +46,15 @@ message RendezvousRequest {`
`46`	`46`	`required string tag = 1;`
`47`	`47`	`required bytes payload = 2;`
`48`	`48`	`required uint32 ordinal = 3;`
	`49`	`+ repeated uint32 replicas = 4;`
`49`	`50`	`}`
`50`	`51`
`51`	`52`	`message RendezvousResponse {`
`52`	`53`	`repeated bytes payloads = 1;`
`53`	`54`	`}`
`54`	`55`
`55`	`56`	`message GetNcclUniqueUidRequest {`
`56`		`- repeated int64 replicas = 1;`
	`57`	`+ repeated uint32 replicas = 1;`
`57`	`58`	`}`
`58`	`59`
`59`	`60`	`message GetNcclUniqueUidResponse {`