
Commit 05b037b

fix: avoid deadlock race writing to a disconnected mapper (coder#20303)
Fixes coder/internal#1045

Fixes a race condition in our PG Coordinator when a peer disconnects. We issue database queries to find the peer mappings (node structures for each peer connected via a tunnel) and then send these to the "mapper", which generates diffs and eventually writes the update to the websocket. Before this change we erroneously used the querier's context for this send, which has the same lifetime as the coordinator itself. If the peer has disconnected, the mapper may no longer be reading from its channel, so the send deadlocks a querier worker and prevents any further work on that peer.

Also adds some debug logging that would have been helpful when tracking this down.
1 parent: 3699ff6
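To make the failure mode concrete, here is a minimal, self-contained sketch of the pattern involved. This is not the coder/coder code: sendCtx merely stands in for a context-aware send helper like agpl.SendCtx, and coordCtx, mapperCtx, and the []string payload are illustrative stand-ins. The point is that a send guarded by a coordinator-lifetime context blocks forever once the mapper stops reading, while a send guarded by the mapper's own (per-peer) context fails as soon as the peer goes away.

package main

import (
	"context"
	"fmt"
	"time"
)

// sendCtx blocks until v is sent on ch or ctx is done, mirroring the shape of
// a context-aware channel send helper (an assumption, not the real agpl.SendCtx).
func sendCtx[T any](ctx context.Context, ch chan<- T, v T) error {
	select {
	case ch <- v:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// mapperCtx is tied to one peer connection; it is canceled on disconnect.
	mapperCtx, disconnect := context.WithCancel(context.Background())

	// An unbuffered channel with no reader, standing in for a mapper that has
	// stopped consuming updates after its peer went away.
	mappings := make(chan []string)

	disconnect() // the peer disconnects; nothing will ever read from mappings

	// Guarded by the mapper's context, the send fails immediately.
	err := sendCtx(mapperCtx, mappings, []string{"node update"})
	fmt.Println("send with mapper ctx:", err) // context canceled

	// Guarded by a coordinator-lifetime context (effectively never canceled
	// while the process runs), the same send would block forever and wedge
	// the querier worker. A short timeout is used here only so the example
	// terminates.
	coordCtx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()
	err = sendCtx(coordCtx, mappings, []string{"node update"})
	fmt.Println("send with coordinator-lifetime ctx:", err) // blocks until the timeout fires
}

The diff below applies exactly this change: the mapping update is sent with the mapper's context (mpr.ctx) instead of the querier's (q.ctx).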

1 file changed: 8 additions, 3 deletions
enterprise/tailnet/pgcoord.go

@@ -873,9 +873,11 @@ func (q *querier) handleIncoming() {
 			return

 		case c := <-q.newConnections:
+			q.logger.Debug(q.ctx, "new connection received", slog.F("peer_id", c.UniqueID()))
 			q.newConn(c)

 		case c := <-q.closeConnections:
+			q.logger.Debug(q.ctx, "connection close request", slog.F("peer_id", c.UniqueID()))
 			q.cleanupConn(c)
 		}
 	}
@@ -902,7 +904,8 @@ func (q *querier) newConn(c *connIO) {
 	mk := mKey(c.UniqueID())
 	dup, ok := q.mappers[mk]
 	if ok {
-		// duplicate, overwrite and close the old one
+		q.logger.Debug(q.ctx, "duplicate mapper found; closing old connection", slog.F("peer_id", dup.c.UniqueID()))
+		// overwrite and close the old one
 		atomic.StoreInt64(&c.overwrites, dup.c.Overwrites()+1)
 		err := dup.c.CoordinatorClose()
 		if err != nil {
@@ -913,6 +916,7 @@ func (q *querier) newConn(c *connIO) {
 	q.workQ.enqueue(querierWorkKey{
 		mappingQuery: mk,
 	})
+	q.logger.Debug(q.ctx, "added new mapper", slog.F("peer_id", c.UniqueID()))
 }

 func (q *querier) isHealthy() bool {
@@ -940,11 +944,12 @@ func (q *querier) cleanupConn(c *connIO) {
 		logger.Error(q.ctx, "failed to close connIO", slog.Error(err))
 	}
 	delete(q.mappers, mk)
-	q.logger.Debug(q.ctx, "removed mapper")
+	q.logger.Debug(q.ctx, "removed mapper", slog.F("peer_id", c.UniqueID()))
 }

 func (q *querier) worker() {
 	defer q.wg.Done()
+	defer q.logger.Debug(q.ctx, "worker exited")
 	eb := backoff.NewExponentialBackOff()
 	eb.MaxElapsedTime = 0 // retry indefinitely
 	eb.MaxInterval = dbMaxBackoff
@@ -1019,7 +1024,7 @@ func (q *querier) mappingQuery(peer mKey) error {
 		return nil
 	}
 	logger.Debug(q.ctx, "sending mappings", slog.F("mapping_len", len(mappings)))
-	return agpl.SendCtx(q.ctx, mpr.mappings, mappings)
+	return agpl.SendCtx(mpr.ctx, mpr.mappings, mappings)
 }

 func (q *querier) bindingsToMappings(bindings []database.GetTailnetTunnelPeerBindingsRow) ([]mapping, error) {
