Skip to content

Commit f6e86c6

Browse files
authored
feat: cancel pending prebuilds from non-active template versions (coder#20387)
## Description This PR introduces an optimization to automatically cancel pending prebuild-related jobs from non-active template versions in the reconciliation loop. ## Problem Currently, when a template is configured with more prebuild instances than available provisioners, the provisioner queue can become flooded with pending prebuild jobs. This issue is worsened when provisioning/deprovisioning operations take a long time. When the prebuild reconciliation loop generates jobs faster than provisioners can process them, pending jobs accumulate in the queue. Since prebuilt workspaces should always run the latest active template version, pending prebuild jobs from non-active versions become obsolete once a new version is promoted. ## Solution The reconciliation loop cancels pending prebuild-related jobs from non-active template versions that match the following criteria: * Build number: 1 (initial build created by the reconciliation loop) * Job status: `pending` * Not yet picked up by a provisioner (`worker_id` is `NULL`) * Owned by the prebuilds system user * Workspace transition: `start` This prevents the queue from being cluttered with stale prebuild jobs that would provision workspaces on an outdated template version that would consequently need to be deprovisioned. ## Changes * Added new SQL query `CountPendingNonActivePrebuilds` to identify presets with pending jobs from non-active versions * Added new SQL query `UpdatePrebuildProvisionerJobWithCancel` to cancel jobs for a specific preset * New reconciliation action type `ActionTypeCancelPending` handles the cancellation logic * Cancellation is non-blocking: failures to cancel prebuild jobs are logged as errors and don't prevent other reconciliation actions ## Follow-up PR Canceling pending prebuild jobs leaves workspaces in a Canceled state. While no Terraform resources need to be destroyed (since jobs were canceled before provisioning started), these database records should still be cleaned up. This will be addressed in a follow-up PR. Closes: coder#20242
1 parent c301a0d commit f6e86c6

File tree

13 files changed

+1016
-63
lines changed

13 files changed

+1016
-63
lines changed

coderd/database/dbauthz/dbauthz.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,13 @@ func (q *querier) CountInProgressPrebuilds(ctx context.Context) ([]database.Coun
15121512
return q.db.CountInProgressPrebuilds(ctx)
15131513
}
15141514

1515+
func (q *querier) CountPendingNonActivePrebuilds(ctx context.Context) ([]database.CountPendingNonActivePrebuildsRow, error) {
1516+
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspace.All()); err != nil {
1517+
return nil, err
1518+
}
1519+
return q.db.CountPendingNonActivePrebuilds(ctx)
1520+
}
1521+
15151522
func (q *querier) CountUnreadInboxNotificationsByUserID(ctx context.Context, userID uuid.UUID) (int64, error) {
15161523
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceInboxNotification.WithOwner(userID.String())); err != nil {
15171524
return 0, err
@@ -4875,6 +4882,14 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas
48754882
return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID)
48764883
}
48774884

4885+
func (q *querier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg database.UpdatePrebuildProvisionerJobWithCancelParams) ([]uuid.UUID, error) {
4886+
// Prebuild operation for canceling pending prebuild jobs from non-active template versions
4887+
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourcePrebuiltWorkspace); err != nil {
4888+
return []uuid.UUID{}, err
4889+
}
4890+
return q.db.UpdatePrebuildProvisionerJobWithCancel(ctx, arg)
4891+
}
4892+
48784893
func (q *querier) UpdatePresetPrebuildStatus(ctx context.Context, arg database.UpdatePresetPrebuildStatusParams) error {
48794894
preset, err := q.db.GetPresetByID(ctx, arg.PresetID)
48804895
if err != nil {

coderd/database/dbauthz/dbauthz_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,16 @@ func (s *MethodTestSuite) TestProvisionerJob() {
641641
dbm.EXPECT().UpdateProvisionerJobWithCancelByID(gomock.Any(), arg).Return(nil).AnyTimes()
642642
check.Args(arg).Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns()
643643
}))
644+
s.Run("UpdatePrebuildProvisionerJobWithCancel", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
645+
arg := database.UpdatePrebuildProvisionerJobWithCancelParams{
646+
PresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
647+
Now: dbtime.Now(),
648+
}
649+
jobIDs := []uuid.UUID{uuid.New(), uuid.New()}
650+
651+
dbm.EXPECT().UpdatePrebuildProvisionerJobWithCancel(gomock.Any(), arg).Return(jobIDs, nil).AnyTimes()
652+
check.Args(arg).Asserts(rbac.ResourcePrebuiltWorkspace, policy.ActionUpdate).Returns(jobIDs)
653+
}))
644654
s.Run("GetProvisionerJobsByIDs", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
645655
org := testutil.Fake(s.T(), faker, database.Organization{})
646656
org2 := testutil.Fake(s.T(), faker, database.Organization{})
@@ -3758,6 +3768,10 @@ func (s *MethodTestSuite) TestPrebuilds() {
37583768
dbm.EXPECT().CountInProgressPrebuilds(gomock.Any()).Return([]database.CountInProgressPrebuildsRow{}, nil).AnyTimes()
37593769
check.Args().Asserts(rbac.ResourceWorkspace.All(), policy.ActionRead)
37603770
}))
3771+
s.Run("CountPendingNonActivePrebuilds", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
3772+
dbm.EXPECT().CountPendingNonActivePrebuilds(gomock.Any()).Return([]database.CountPendingNonActivePrebuildsRow{}, nil).AnyTimes()
3773+
check.Args().Asserts(rbac.ResourceWorkspace.All(), policy.ActionRead)
3774+
}))
37613775
s.Run("GetPresetsAtFailureLimit", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
37623776
dbm.EXPECT().GetPresetsAtFailureLimit(gomock.Any(), int64(0)).Return([]database.GetPresetsAtFailureLimitRow{}, nil).AnyTimes()
37633777
check.Args(int64(0)).Asserts(rbac.ResourceTemplate.All(), policy.ActionViewInsights)

coderd/database/dbfake/dbfake.go

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,10 @@ type WorkspaceBuildBuilder struct {
5555
resources []*sdkproto.Resource
5656
params []database.WorkspaceBuildParameter
5757
agentToken string
58-
dispo workspaceBuildDisposition
58+
jobStatus database.ProvisionerJobStatus
5959
taskAppID uuid.UUID
6060
}
6161

62-
type workspaceBuildDisposition struct {
63-
starting bool
64-
}
65-
6662
// WorkspaceBuild generates a workspace build for the provided workspace.
6763
// Pass a database.Workspace{} with a nil ID to also generate a new workspace.
6864
// Omitting the template ID on a workspace will also generate a new template
@@ -145,8 +141,17 @@ func (b WorkspaceBuildBuilder) WithTask(seed *sdkproto.App) WorkspaceBuildBuilde
145141
}
146142

147143
func (b WorkspaceBuildBuilder) Starting() WorkspaceBuildBuilder {
148-
//nolint: revive // returns modified struct
149-
b.dispo.starting = true
144+
b.jobStatus = database.ProvisionerJobStatusRunning
145+
return b
146+
}
147+
148+
func (b WorkspaceBuildBuilder) Pending() WorkspaceBuildBuilder {
149+
b.jobStatus = database.ProvisionerJobStatusPending
150+
return b
151+
}
152+
153+
func (b WorkspaceBuildBuilder) Canceled() WorkspaceBuildBuilder {
154+
b.jobStatus = database.ProvisionerJobStatusCanceled
150155
return b
151156
}
152157

@@ -231,7 +236,11 @@ func (b WorkspaceBuildBuilder) Do() WorkspaceResponse {
231236
require.NoError(b.t, err, "insert job")
232237
b.logger.Debug(context.Background(), "inserted provisioner job", slog.F("job_id", job.ID))
233238

234-
if b.dispo.starting {
239+
switch b.jobStatus {
240+
case database.ProvisionerJobStatusPending:
241+
// Provisioner jobs are created in 'pending' status
242+
b.logger.Debug(context.Background(), "pending the provisioner job")
243+
case database.ProvisionerJobStatusRunning:
235244
// might need to do this multiple times if we got a template version
236245
// import job as well
237246
b.logger.Debug(context.Background(), "looping to acquire provisioner job")
@@ -255,7 +264,23 @@ func (b WorkspaceBuildBuilder) Do() WorkspaceResponse {
255264
break
256265
}
257266
}
258-
} else {
267+
case database.ProvisionerJobStatusCanceled:
268+
// Set provisioner job status to 'canceled'
269+
b.logger.Debug(context.Background(), "canceling the provisioner job")
270+
err = b.db.UpdateProvisionerJobWithCancelByID(ownerCtx, database.UpdateProvisionerJobWithCancelByIDParams{
271+
ID: jobID,
272+
CanceledAt: sql.NullTime{
273+
Time: dbtime.Now(),
274+
Valid: true,
275+
},
276+
CompletedAt: sql.NullTime{
277+
Time: dbtime.Now(),
278+
Valid: true,
279+
},
280+
})
281+
require.NoError(b.t, err, "cancel job")
282+
default:
283+
// By default, consider jobs in 'succeeded' status
259284
b.logger.Debug(context.Background(), "completing the provisioner job")
260285
err = b.db.UpdateProvisionerJobWithCompleteByID(ownerCtx, database.UpdateProvisionerJobWithCompleteByIDParams{
261286
ID: job.ID,
@@ -571,6 +596,12 @@ func (t TemplateVersionBuilder) Do() TemplateVersionResponse {
571596
t.params[i] = dbgen.TemplateVersionParameter(t.t, t.db, param)
572597
}
573598

599+
// Update response with template and version
600+
if resp.Template.ID == uuid.Nil && version.TemplateID.Valid {
601+
template, err := t.db.GetTemplateByID(ownerCtx, version.TemplateID.UUID)
602+
require.NoError(t.t, err)
603+
resp.Template = template
604+
}
574605
resp.TemplateVersion = version
575606
return resp
576607
}

coderd/database/dbmetrics/querymetrics.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 112 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)