15
15
from .. import models
16
16
from ..proxy import Proxy
17
17
from ..tls import new_keypair
18
- from ..utils import (
19
- FrozenAttrDict ,
20
- TaskPool ,
21
- Flag ,
22
- normalize_address ,
23
- UniqueQueue ,
24
- CancelGroup ,
25
- timestamp ,
26
- )
18
+ from ..workqueue import WorkQueue , Backoff , WorkQueueClosed
19
+ from ..utils import FrozenAttrDict , TaskPool , Flag , normalize_address , timestamp
27
20
28
21
29
22
__all__ = ("DBBackendBase" , "Cluster" , "Worker" )
@@ -778,6 +771,28 @@ def _default_check_timeouts_period(self):
778
771
config = True ,
779
772
)
780
773
774
+ backoff_base_delay = Float (
775
+ 0.1 ,
776
+ help = """
777
+ Base delay (in seconds) for backoff when retrying after failures.
778
+
779
+ If an operation fails, it is retried after a backoff computed as:
780
+
781
+ ```
782
+ min(backoff_max_delay, backoff_base_delay * 2 ** num_failures)
783
+ ```
784
+ """ ,
785
+ config = True ,
786
+ )
787
+
788
+ backoff_max_delay = Float (
789
+ 300 ,
790
+ help = """
791
+ Max delay (in seconds) for backoff policy when retrying after failures.
792
+ """ ,
793
+ config = True ,
794
+ )
795
+
781
796
api_url = Unicode (
782
797
help = """
783
798
The address that internal components (e.g. dask clusters)
@@ -800,11 +815,14 @@ async def setup(self, app):
800
815
await super ().setup (app )
801
816
802
817
# Set up reconciliation queues
803
- self .cg = CancelGroup ()
804
-
805
- self .queues = [UniqueQueue () for _ in range (self .parallelism )]
818
+ self .queue = WorkQueue (
819
+ backoff = Backoff (
820
+ base_delay = self .backoff_base_delay , max_delay = self .backoff_max_delay
821
+ )
822
+ )
806
823
self .reconcilers = [
807
- asyncio .ensure_future (self .reconciler_loop (q )) for q in self .queues
824
+ asyncio .ensure_future (self .reconciler_loop ())
825
+ for _ in range (self .parallelism )
808
826
]
809
827
810
828
# Start the proxy
@@ -826,10 +844,10 @@ async def setup(self, app):
826
844
# Load all active clusters/workers into reconciliation queues
827
845
for cluster in self .db .name_to_cluster .values ():
828
846
if cluster .status < JobStatus .STOPPED :
829
- await self .enqueue (cluster )
847
+ self .queue . put (cluster )
830
848
for worker in cluster .workers .values ():
831
849
if worker .status < JobStatus .STOPPED :
832
- await self .enqueue (worker )
850
+ self .queue . put (worker )
833
851
834
852
# Further backend-specific setup
835
853
await self .do_setup ()
@@ -853,7 +871,7 @@ async def cleanup(self):
853
871
[(c , {"target" : JobStatus .FAILED }) for c in active ]
854
872
)
855
873
for c in active :
856
- await self .enqueue (c )
874
+ self .queue . put (c )
857
875
858
876
# Wait until all clusters are shutdown
859
877
pending_shutdown = [
@@ -864,9 +882,9 @@ async def cleanup(self):
864
882
if pending_shutdown :
865
883
await asyncio .wait ([c .shutdown for c in pending_shutdown ])
866
884
867
- if hasattr ( self , "cg" ):
868
- # Stop reconcilation queues
869
- await self .cg . cancel ()
885
+ # Stop reconciliation queues
886
+ if hasattr ( self , "reconcilers" ):
887
+ self .queue . close ()
870
888
await asyncio .gather (* self .reconcilers , return_exceptions = True )
871
889
872
890
await self .do_cleanup ()
@@ -895,7 +913,7 @@ async def start_cluster(self, user, cluster_options):
895
913
options , config = await self .process_cluster_options (user , cluster_options )
896
914
cluster = self .db .create_cluster (user .name , options , config .to_dict ())
897
915
self .log .info ("Created cluster %s for user %s" , cluster .name , user .name )
898
- await self .enqueue (cluster )
916
+ self .queue . put (cluster )
899
917
return cluster .name
900
918
901
919
async def stop_cluster (self , cluster_name , failed = False ):
@@ -906,7 +924,7 @@ async def stop_cluster(self, cluster_name, failed=False):
906
924
self .log .info ("Stopping cluster %s" , cluster .name )
907
925
target = JobStatus .FAILED if failed else JobStatus .STOPPED
908
926
self .db .update_cluster (cluster , target = target )
909
- await self .enqueue (cluster )
927
+ self .queue . put (cluster )
910
928
911
929
async def on_cluster_heartbeat (self , cluster_name , msg ):
912
930
cluster = self .db .get_cluster (cluster_name )
@@ -976,11 +994,11 @@ async def on_cluster_heartbeat(self, cluster_name, msg):
976
994
977
995
if cluster_update :
978
996
self .db .update_cluster (cluster , ** cluster_update )
979
- await self .enqueue (cluster )
997
+ self .queue . put (cluster )
980
998
981
999
self .db .update_workers (target_updates )
982
1000
for w , u in target_updates :
983
- await self .enqueue (w )
1001
+ self .queue . put (w )
984
1002
985
1003
if newly_running :
986
1004
# At least one worker successfully started, reset failure count
@@ -1037,10 +1055,10 @@ async def _check_timeouts(self):
1037
1055
worker_updates .append ((w , {"target" : JobStatus .FAILED }))
1038
1056
self .db .update_clusters (cluster_updates )
1039
1057
for c , _ in cluster_updates :
1040
- await self .enqueue (c )
1058
+ self .queue . put (c )
1041
1059
self .db .update_workers (worker_updates )
1042
1060
for w , _ in worker_updates :
1043
- await self .enqueue (w )
1061
+ self .queue . put (w )
1044
1062
1045
1063
async def check_clusters_loop (self ):
1046
1064
while True :
@@ -1061,7 +1079,7 @@ async def check_clusters_loop(self):
1061
1079
self .db .update_clusters (updates )
1062
1080
for c , _ in updates :
1063
1081
self .log .info ("Cluster %s failed during startup" , c .name )
1064
- await self .enqueue (c )
1082
+ self .queue . put (c )
1065
1083
except asyncio .CancelledError :
1066
1084
raise
1067
1085
except Exception as exc :
@@ -1095,7 +1113,7 @@ async def check_workers_loop(self):
1095
1113
for w , _ in updates :
1096
1114
self .log .info ("Worker %s failed during startup" , w .name )
1097
1115
w .cluster .worker_start_failure_count += 1
1098
- await self .enqueue (w )
1116
+ self .queue . put (w )
1099
1117
except asyncio .CancelledError :
1100
1118
raise
1101
1119
except Exception as exc :
@@ -1115,14 +1133,13 @@ async def cleanup_db_loop(self):
1115
1133
self .log .debug ("Removed %d expired clusters from the database" , n )
1116
1134
await asyncio .sleep (self .db_cleanup_period )
1117
1135
1118
- async def enqueue (self , obj ):
1119
- ind = hash (obj ) % self .parallelism
1120
- await self .queues [ind ].put (obj )
1121
-
1122
- async def reconciler_loop (self , queue ):
1136
+ async def reconciler_loop (self ):
1123
1137
while True :
1124
- async with self .cg .cancellable ():
1125
- obj = await queue .get ()
1138
+ try :
1139
+ obj = await self .queue .get ()
1140
+ except WorkQueueClosed :
1141
+ return
1142
+
1126
1143
if isinstance (obj , Cluster ):
1127
1144
method = self .reconcile_cluster
1128
1145
kind = "cluster"
@@ -1144,6 +1161,11 @@ async def reconciler_loop(self, queue):
1144
1161
self .log .warning (
1145
1162
"Error while reconciling %s %s" , kind , obj .name , exc_info = True
1146
1163
)
1164
+ self .queue .put_backoff (obj )
1165
+ else :
1166
+ self .queue .reset_backoff (obj )
1167
+ finally :
1168
+ self .queue .task_done (obj )
1147
1169
1148
1170
async def reconcile_cluster (self , cluster ):
1149
1171
if cluster .status >= JobStatus .STOPPED :
@@ -1177,17 +1199,17 @@ async def reconcile_worker(self, worker):
1177
1199
if worker .status != JobStatus .CLOSING :
1178
1200
self .db .update_worker (worker , status = JobStatus .CLOSING )
1179
1201
if self .is_cluster_ready_to_close (worker .cluster ):
1180
- await self .enqueue (worker .cluster )
1202
+ self .queue . put (worker .cluster )
1181
1203
return
1182
1204
1183
1205
if worker .target in (JobStatus .STOPPED , JobStatus .FAILED ):
1184
1206
await self ._worker_to_stopped (worker )
1185
1207
if self .is_cluster_ready_to_close (worker .cluster ):
1186
- await self .enqueue (worker .cluster )
1208
+ self .queue . put (worker .cluster )
1187
1209
elif (
1188
1210
worker .cluster .target == JobStatus .RUNNING and not worker .close_expected
1189
1211
):
1190
- await self .enqueue (worker .cluster )
1212
+ self .queue . put (worker .cluster )
1191
1213
return
1192
1214
1193
1215
if worker .status == JobStatus .CREATED and worker .target == JobStatus .RUNNING :
@@ -1225,20 +1247,20 @@ async def _cluster_to_submitted(self, cluster):
1225
1247
self .db .update_cluster (
1226
1248
cluster , status = JobStatus .SUBMITTED , target = JobStatus .FAILED
1227
1249
)
1228
- await self .enqueue (cluster )
1250
+ self .queue . put (cluster )
1229
1251
1230
1252
async def _cluster_to_closing (self , cluster ):
1231
1253
self .log .debug ("Preparing to stop cluster %s" , cluster .name )
1232
1254
target = JobStatus .CLOSING if self .supports_bulk_shutdown else JobStatus .STOPPED
1233
1255
workers = [w for w in cluster .workers .values () if w .target < target ]
1234
1256
self .db .update_workers ([(w , {"target" : target }) for w in workers ])
1235
1257
for w in workers :
1236
- await self .enqueue (w )
1258
+ self .queue . put (w )
1237
1259
self .db .update_cluster (cluster , status = JobStatus .CLOSING )
1238
1260
if not workers :
1239
1261
# If there are workers, the cluster will be enqueued after the last one has closed
1240
- # If there are no workers, re-enqueue now
1241
- await self .enqueue (cluster )
1262
+ # If there are no workers, requeue now
1263
+ self .queue . put (cluster )
1242
1264
cluster .ready .set ()
1243
1265
1244
1266
async def _cluster_to_stopped (self , cluster ):
@@ -1291,7 +1313,7 @@ async def _check_cluster_scale(self, cluster):
1291
1313
cluster .worker_start_failure_count ,
1292
1314
)
1293
1315
self .db .update_cluster (cluster , target = JobStatus .FAILED )
1294
- await self .enqueue (cluster )
1316
+ self .queue . put (cluster )
1295
1317
return
1296
1318
1297
1319
active = cluster .active_workers ()
@@ -1301,7 +1323,7 @@ async def _check_cluster_scale(self, cluster):
1301
1323
self .log .info (
1302
1324
"Created worker %s for cluster %s" , worker .name , cluster .name
1303
1325
)
1304
- await self .enqueue (worker )
1326
+ self .queue . put (worker )
1305
1327
1306
1328
async def _worker_to_submitted (self , worker ):
1307
1329
self .log .info ("Submitting worker %s..." , worker .name )
@@ -1325,7 +1347,7 @@ async def _worker_to_submitted(self, worker):
1325
1347
worker , status = JobStatus .SUBMITTED , target = JobStatus .FAILED
1326
1348
)
1327
1349
worker .cluster .worker_start_failure_count += 1
1328
- await self .enqueue (worker )
1350
+ self .queue . put (worker )
1329
1351
1330
1352
async def _worker_to_stopped (self , worker ):
1331
1353
self .log .info ("Stopping worker %s..." , worker .name )
0 commit comments