
Commit f8aff26

[WIP] Cluster resource limits (dask#266)
* [WIP] Cluster resource limits

  This adds resource limits *per cluster*. Currently we support:

  - Max cores per cluster
  - Max memory per cluster
  - Max workers per cluster

  At runtime these are normalized into a single `cluster_max_workers` field,
  which is used to check incoming scale and adapt requests. If a request from
  a user exceeds the limit, it is trimmed to be within bounds, and a warning
  is raised in the user's terminal notifying them of the limit. Still needs
  tests and docs.

* Update, add tests

* Add docs
1 parent 652ed95 commit f8aff26

File tree: 9 files changed, +326 -8 lines
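As a worked example of the normalization described in the commit message, here is a hedged configuration sketch. The worker_* and scheduler_* traits already exist on ClusterConfig; the cluster_max_* traits are the ones added by this commit, and all numbers are hypothetical.

# dask_gateway_config.py -- hypothetical values, for illustration only
c.ClusterConfig.scheduler_cores = 1
c.ClusterConfig.scheduler_memory = "2 G"
c.ClusterConfig.worker_cores = 2
c.ClusterConfig.worker_memory = "8 G"

# Limits added by this commit
c.ClusterConfig.cluster_max_cores = 40        # at most 40 cores per cluster
c.ClusterConfig.cluster_max_memory = "100 G"  # at most 100 GiB per cluster

# Normalization (see _worker_limit_from_resources in base.py below):
#   cores  allow  (40 - 1) // 2  = 19 workers
#   memory allows (100 - 2) // 8 = 12 workers
# so scale and adapt requests are capped at cluster_max_workers = 12.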

dask-gateway-server/dask_gateway_server/backends/base.py (+132 -2)

@@ -1,13 +1,23 @@
 import asyncio
 
 import aiohttp
-from traitlets import Instance, Integer, Float, Dict, Union, Unicode, default
+from traitlets import (
+    Instance,
+    Integer,
+    Float,
+    Dict,
+    Union,
+    Unicode,
+    default,
+    validate,
+    observe,
+)
 from traitlets.config import LoggingConfigurable, Configurable
 
 from .. import models
 from ..options import Options
 from ..traitlets import MemoryLimit, Type, Callable, Command
-from ..utils import awaitable
+from ..utils import awaitable, format_bytes
 
 
 __all__ = ("Backend", "ClusterConfig")
@@ -320,6 +330,126 @@ class ClusterConfig(Configurable):
         config=True,
     )
 
+    cluster_max_memory = MemoryLimit(
+        None,
+        help="""
+        The maximum amount of memory (in bytes) available to this cluster.
+        Allows the following suffixes:
+
+        - K -> Kibibytes
+        - M -> Mebibytes
+        - G -> Gibibytes
+        - T -> Tebibytes
+
+        Set to ``None`` for no memory limit (default).
+        """,
+        min=0,
+        allow_none=True,
+        config=True,
+    )
+
+    cluster_max_cores = Float(
+        None,
+        help="""
+        The maximum number of cores available to this cluster.
+
+        Set to ``None`` for no cores limit (default).
+        """,
+        min=0.0,
+        allow_none=True,
+        config=True,
+    )
+
+    cluster_max_workers = Integer(
+        help="""
+        The maximum number of workers available to this cluster.
+
+        Note that this will be combined with ``cluster_max_cores`` and
+        ``cluster_max_memory`` at runtime to determine the actual maximum
+        number of workers available to this cluster.
+        """,
+        allow_none=True,
+        min=0,
+        config=True,
+    )
+
+    def _check_scheduler_memory(self, scheduler_memory, cluster_max_memory):
+        if cluster_max_memory is not None and scheduler_memory > cluster_max_memory:
+            memory = format_bytes(scheduler_memory)
+            limit = format_bytes(cluster_max_memory)
+            raise ValueError(
+                f"Scheduler memory request of {memory} exceeds cluster memory "
+                f"limit of {limit}"
+            )
+
+    def _check_scheduler_cores(self, scheduler_cores, cluster_max_cores):
+        if cluster_max_cores is not None and scheduler_cores > cluster_max_cores:
+            raise ValueError(
+                f"Scheduler cores request of {scheduler_cores} exceeds cluster "
+                f"cores limit of {cluster_max_cores}"
+            )
+
+    def _worker_limit_from_resources(self):
+        inf = max_workers = float("inf")
+        if self.cluster_max_memory is not None:
+            max_workers = min(
+                (self.cluster_max_memory - self.scheduler_memory) // self.worker_memory,
+                max_workers,
+            )
+        if self.cluster_max_cores is not None:
+            max_workers = min(
+                (self.cluster_max_cores - self.scheduler_cores) // self.worker_cores,
+                max_workers,
+            )
+
+        if max_workers == inf:
+            return None
+        return max(0, int(max_workers))
+
+    @validate("scheduler_memory")
+    def _validate_scheduler_memory(self, proposal):
+        self._check_scheduler_memory(proposal.value, self.cluster_max_memory)
+        return proposal.value
+
+    @validate("scheduler_cores")
+    def _validate_scheduler_cores(self, proposal):
+        self._check_scheduler_cores(proposal.value, self.cluster_max_cores)
+        return proposal.value
+
+    @validate("cluster_max_memory")
+    def _validate_cluster_max_memory(self, proposal):
+        self._check_scheduler_memory(self.scheduler_memory, proposal.value)
+        return proposal.value
+
+    @validate("cluster_max_cores")
+    def _validate_cluster_max_cores(self, proposal):
+        self._check_scheduler_cores(self.scheduler_cores, proposal.value)
+        return proposal.value
+
+    @validate("cluster_max_workers")
+    def _validate_cluster_max_workers(self, proposal):
+        lim = self._worker_limit_from_resources()
+        if lim is None:
+            return proposal.value
+        if proposal.value is None:
+            return lim
+        return min(proposal.value, lim)
+
+    @observe("cluster_max_workers")
+    def _observe_cluster_max_workers(self, change):
+        # This shouldn't be needed, but traitlet validators don't run
+        # if a value is `None` and `allow_none` is true, so we need to
+        # add an observer to handle the event of an *explicit* `None`
+        # set for `cluster_max_workers`
+        if change.new is None:
+            lim = self._worker_limit_from_resources()
+            if lim is not None:
+                self.cluster_max_workers = lim
+
+    @default("cluster_max_workers")
+    def _default_cluster_max_workers(self):
+        return self._worker_limit_from_resources()
+
     def to_dict(self):
         return {
             k: getattr(self, k)
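One consequence of the validators above: a limit lower than the scheduler's own request fails at configuration time rather than at scale time. A minimal sketch, assuming ClusterConfig can be instantiated directly with trait keyword arguments (values hypothetical):

from dask_gateway_server.backends.base import ClusterConfig

# The scheduler alone requests more cores than the cluster-wide limit allows.
cfg = ClusterConfig(scheduler_cores=2, cluster_max_cores=1)
# Expected to raise roughly:
#   ValueError: Scheduler cores request of 2 exceeds cluster cores limit of 1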

dask-gateway-server/dask_gateway_server/backends/db_base.py (+13)

@@ -207,6 +207,7 @@ def to_model(self):
             username=self.username,
             token=self.token,
             options=self.options,
+            config=self.config,
             status=self.model_status,
             scheduler_address=self.scheduler_address,
             dashboard_address=self.dashboard_address,
@@ -956,6 +957,18 @@ async def on_cluster_heartbeat(self, cluster_name, msg):
             len(closed_workers),
         )
 
+        max_workers = cluster.config.get("cluster_max_workers")
+        if max_workers is not None and count > max_workers:
+            # This shouldn't happen under normal operation, but could if the
+            # user does something malicious (or there's a bug).
+            self.log.info(
+                "Cluster %s heartbeat requested %d workers, exceeding limit of %s.",
+                cluster_name,
+                count,
+                max_workers,
+            )
+            count = max_workers
+
         if count != cluster.count:
             cluster_update["count"] = count

dask-gateway-server/dask_gateway_server/backends/kubernetes/backend.py (+16)

@@ -456,6 +456,21 @@ async def on_cluster_heartbeat(self, cluster_name, msg):
             len(msg["closed_workers"]),
         )
 
+        cluster = self.clusters.get(cluster_name)
+        if cluster is None:
+            return
+        max_workers = cluster.config.get("cluster_max_workers")
+        if max_workers is not None and count > max_workers:
+            # This shouldn't happen under normal operation, but could if the
+            # user does something malicious (or there's a bug).
+            self.log.info(
+                "Cluster %s heartbeat requested %d workers, exceeding limit of %s.",
+                cluster_name,
+                count,
+                max_workers,
+            )
+            count = max_workers
+
         try:
             await self.custom_client.patch_namespaced_custom_object(
                 "gateway.dask.org",
@@ -567,6 +582,7 @@ async def sync_cluster(self, cluster_name):
             name=cluster_name,
             username=obj["spec"].get("username", ""),
             options=obj["spec"].get("options") or {},
+            config=obj["spec"].get("config") or {},
             token="",
             scheduler_address=scheduler_address,
             dashboard_address=dashboard_address,

dask-gateway-server/dask_gateway_server/models.py (+4)

@@ -78,6 +78,8 @@ class Cluster(object):
         The normalized set of configuration options provided when starting this
         cluster. These values are user-facing, and don't necessarily correspond
         with the ``ClusterConfig`` options on the backend.
+    config : dict
+        The serialized version of ``ClusterConfig`` for this cluster.
     status : ClusterStatus
         The status of the cluster.
     scheduler_address : str
@@ -103,6 +105,7 @@ def __init__(
         username,
         token,
         options,
+        config,
         status,
         scheduler_address="",
         dashboard_address="",
@@ -116,6 +119,7 @@
         self.username = username
         self.token = token
         self.options = options
+        self.config = config
         self.status = status
         self.scheduler_address = scheduler_address
         self.dashboard_address = dashboard_address

dask-gateway-server/dask_gateway_server/routes.py (+35 -3)

@@ -198,13 +198,23 @@ async def scale_cluster(request):
             reason=f"Scale expects a non-negative integer, got {count}"
         )
 
+    max_workers = cluster.config.get("cluster_max_workers")
+    resp_msg = None
+    if max_workers is not None and count > max_workers:
+        resp_msg = (
+            f"Scale request of {count} workers would exceed resource limit of "
+            f"{max_workers} workers. Scaling to {max_workers} instead."
+        )
+        count = max_workers
+
     try:
         await backend.forward_message_to_scheduler(
             cluster, {"op": "scale", "count": count}
         )
     except PublicException as exc:
         raise web.HTTPConflict(reason=str(exc))
-    return web.Response()
+
+    return web.json_response({"ok": not resp_msg, "msg": resp_msg})
 
 
 @default_routes.post("/api/v1/clusters/{cluster_name}/adapt")
@@ -226,14 +236,33 @@ async def adapt_cluster(request):
     maximum = msg.get("maximum", None)
     active = msg.get("active", True)
 
+    max_workers = cluster.config.get("cluster_max_workers")
+    resp_msg = None
+    if max_workers is not None:
+        if maximum is None:
+            maximum = max_workers
+        if minimum is None:
+            minimum = 0
+        if maximum > max_workers or minimum > max_workers:
+            orig_max = maximum
+            orig_min = minimum
+            maximum = min(max_workers, maximum)
+            minimum = min(max_workers, minimum)
+            resp_msg = (
+                f"Adapt with `maximum={orig_max}, minimum={orig_min}` workers "
+                f"would exceed resource limit of {max_workers} workers. Using "
+                f"`maximum={maximum}, minimum={minimum}` instead."
+            )
+
     try:
         await backend.forward_message_to_scheduler(
             cluster,
             {"op": "adapt", "minimum": minimum, "maximum": maximum, "active": active},
         )
     except PublicException as exc:
         raise web.HTTPConflict(reason=str(exc))
-    return web.Response()
+
+    return web.json_response({"ok": not resp_msg, "msg": resp_msg})
 
 
 @default_routes.post("/api/v1/clusters/{cluster_name}/heartbeat")
@@ -242,7 +271,10 @@ async def handle_heartbeat(request):
     backend = request.app["backend"]
     cluster_name = request.match_info["cluster_name"]
     msg = await request.json()
-    await backend.on_cluster_heartbeat(cluster_name, msg)
+    try:
+        await backend.on_cluster_heartbeat(cluster_name, msg)
+    except PublicException as exc:
+        raise web.HTTPConflict(reason=str(exc))
     return web.Response()
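For reference, a hedged sketch of the JSON bodies these handlers now return. The 10-worker limit and the request counts are hypothetical; requests within the limit come back as {"ok": True, "msg": None}.

# Hypothetical responses against a 10-worker cluster_max_workers limit.

# POST /api/v1/clusters/<name>/scale with {"count": 100}:
scale_response = {
    "ok": False,
    "msg": (
        "Scale request of 100 workers would exceed resource limit of "
        "10 workers. Scaling to 10 instead."
    ),
}

# POST /api/v1/clusters/<name>/adapt with {"minimum": 0, "maximum": 100, "active": True}:
adapt_response = {
    "ok": False,
    "msg": (
        "Adapt with `maximum=100, minimum=0` workers would exceed resource "
        "limit of 10 workers. Using `maximum=10, minimum=0` instead."
    ),
}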

dask-gateway/dask_gateway/client.py (+14 -2)

@@ -652,7 +652,13 @@ def stop_cluster(self, cluster_name, **kwargs):
 
     async def _scale_cluster(self, cluster_name, n):
         url = "%s/api/v1/clusters/%s/scale" % (self.address, cluster_name)
-        await self._request("POST", url, json={"count": n})
+        resp = await self._request("POST", url, json={"count": n})
+        try:
+            msg = await resp.json()
+        except Exception:
+            msg = {}
+        if not msg.get("ok", True) and msg.get("msg"):
+            warnings.warn(GatewayWarning(msg["msg"]))
 
     def scale_cluster(self, cluster_name, n, **kwargs):
         """Scale a cluster to n workers.
@@ -669,11 +675,17 @@ def scale_cluster(self, cluster_name, n, **kwargs):
     async def _adapt_cluster(
         self, cluster_name, minimum=None, maximum=None, active=True
     ):
-        await self._request(
+        resp = await self._request(
            "POST",
            "%s/api/v1/clusters/%s/adapt" % (self.address, cluster_name),
            json={"minimum": minimum, "maximum": maximum, "active": active},
        )
+        try:
+            msg = await resp.json()
+        except Exception:
+            msg = {}
+        if not msg.get("ok", True) and msg.get("msg"):
+            warnings.warn(GatewayWarning(msg["msg"]))
 
     def adapt_cluster(
         self, cluster_name, minimum=None, maximum=None, active=True, **kwargs
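And a hedged end-to-end sketch of the client-side warning path, using the existing dask-gateway client API; the gateway address and the 10-worker limit are hypothetical.

from dask_gateway import Gateway

gateway = Gateway("http://gateway.example.com")  # hypothetical address
cluster = gateway.new_cluster()
cluster.scale(100)
# With a 10-worker limit, the call succeeds but warns:
# GatewayWarning: Scale request of 100 workers would exceed resource limit of
# 10 workers. Scaling to 10 instead.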

docs/source/index.rst (+1)

@@ -90,6 +90,7 @@ both the cluster backend and the authentication protocol are pluggable.
    authentication
    security
    cluster-options
+   resource-limits
 
 .. toctree::
    :maxdepth: 1

docs/source/resource-limits.rst (+28)

@@ -0,0 +1,28 @@
+Cluster Resource Limits
+=======================
+
+By default users can create clusters with as many workers and resources as they
+want. In shared environments this may not always be desirable. To remedy this,
+administrators can set per-cluster resource limits.
+
+A few limits are available:
+
+- :data:`c.ClusterConfig.cluster_max_cores`: Maximum number of cores per cluster
+- :data:`c.ClusterConfig.cluster_max_memory`: Maximum amount of memory per cluster
+- :data:`c.ClusterConfig.cluster_max_workers`: Maximum number of workers per cluster
+
+If a cluster is at capacity for any of these limits, requests for new workers
+will warn with an informative message saying they're at capacity.
+
+Example
+-------
+
+Here we limit each cluster to:
+
+- A max of 80 active cores
+- A max of 1 TiB of RAM
+
+.. code-block:: python
+
+    c.ClusterConfig.cluster_max_cores = 80
+    c.ClusterConfig.cluster_max_memory = "1 T"
