Compare commits

...

4 Commits

Author SHA1 Message Date
Callum Styan
0b73a4f24f fix(cli): find prebuild workspaces by owner and name rather than via template
Instead of looking up workspaces per-template, find all prebuild workspaces
directly by owner (the prebuilds system user) and by name substring, merging
the results. Workspaces and templates are then deleted in separate phases so
templates are only attempted after their workspaces are gone.
2026-03-25 00:31:18 +00:00
Callum Styan
386206fe77 fix(cli): teach exp scaletest cleanup about prebuild templates
Prebuild workspaces are owned by the prebuilds system user and not
prefixed with scaletest-, so the cleanup command previously left them
behind. A new phase runs before the existing workspace/user cleanup,
finding templates with the scaletest- prefix and deleting their
workspaces before deleting the templates.
2026-03-25 00:31:18 +00:00
Callum Styan
7f8e6c19e0 fix(scaletest/prebuilds): improve measureDeletion retry and completion tracking
Retries failed or canceled delete builds up to 3 times per workspace and
exits early if all remaining workspaces exhaust retries. Snapshots the
initial workspace count at deletion start as the completion denominator,
since the reconciler may create more workspaces than the configured target.
2026-03-25 00:31:18 +00:00
Callum Styan
d72d5a649f fix(scaletest/prebuilds): fix Runner.Cleanup() to delete workspaces before template
The API rejects template deletion with a 400 while workspaces still exist.
Cleanup() now lists all workspaces for the template, deletes each via
workspacebuild.NewCleanupRunner with up to 3 retries, then deletes the
template. If Run() fails with a 409 conflict (template already exists from a
prior failed run), the existing template is looked up so Cleanup() can remove
it rather than silently skipping it.
2026-03-25 00:31:18 +00:00
2 changed files with 319 additions and 13 deletions

View File

@@ -519,6 +519,91 @@ func (r *userCleanupRunner) Run(ctx context.Context, _ string, _ io.Writer) erro
return nil
}
// prebuildTemplateCleanupRunner is a harness runnable that removes exactly one
// scaletest prebuilds template. It performs no workspace cleanup of its own,
// so the template's prebuild workspaces have to be gone before it is run.
type prebuildTemplateCleanupRunner struct {
	// client is the API client used to issue the delete request.
	client *codersdk.Client
	// template is the template to delete; its ID is sent to the API and its
	// Name is only used to annotate errors.
	template codersdk.Template
}

// Compile-time check that the runner satisfies harness.Runnable.
var _ harness.Runnable = (*prebuildTemplateCleanupRunner)(nil)

// Run implements Runnable. It deletes the configured template inside a tracing
// span and returns a wrapped error naming the template if the delete fails.
// The run ID and log writer arguments are unused.
func (r *prebuildTemplateCleanupRunner) Run(ctx context.Context, _ string, _ io.Writer) error {
	ctx, span := tracing.StartSpan(ctx)
	defer span.End()

	err := r.client.DeleteTemplate(ctx, r.template.ID)
	if err == nil {
		return nil
	}
	return xerrors.Errorf("delete template %q: %w", r.template.Name, err)
}
// getScaletestPrebuildWorkspaces gathers prebuild workspaces by running two
// independent, fully paginated listings and merging them:
//
//  1. workspaces whose owner filter is "prebuilds" (the prebuilds system user),
//  2. workspaces whose name filter is "prebuild".
//
// A workspace may be returned by only one of the two listings depending on how
// it was created, so both are issued. Each workspace appears at most once in
// the result, in the order it was first encountered. The first listing error
// aborts the whole operation and no partial result is returned.
func getScaletestPrebuildWorkspaces(ctx context.Context, client *codersdk.Client) ([]codersdk.Workspace, error) {
	const pageSize = 100

	var (
		found   []codersdk.Workspace
		visited = make(map[uuid.UUID]struct{})
	)

	// collect walks every page of one filtered listing and appends the
	// workspaces that no earlier page or earlier listing has produced yet.
	collect := func(filter codersdk.WorkspaceFilter) error {
		filter.Limit = pageSize
		for page, lastPage := 0, false; !lastPage; page++ {
			filter.Offset = page * pageSize
			resp, err := client.Workspaces(ctx, filter)
			if err != nil {
				return xerrors.Errorf("list prebuild workspaces (page %d): %w", page, err)
			}
			for _, ws := range resp.Workspaces {
				if _, dup := visited[ws.ID]; dup {
					continue
				}
				visited[ws.ID] = struct{}{}
				found = append(found, ws)
			}
			// A short page means the listing is exhausted.
			lastPage = len(resp.Workspaces) < pageSize
		}
		return nil
	}

	// Owner listing goes first, then the name listing picks up anything the
	// owner filter did not return.
	filters := []codersdk.WorkspaceFilter{
		{Owner: "prebuilds"},
		{Name: "prebuild"},
	}
	for _, f := range filters {
		if err := collect(f); err != nil {
			return nil, err
		}
	}
	return found, nil
}
// getScaletestPrebuildsTemplates lists the templates that appear to belong to
// the scaletest prebuilds command. The server is asked for templates fuzzily
// matching "prebuild"; the response is then narrowed client-side to names that
// start with the scaletest prefix followed by "-" and that contain "prebuild".
// A listing failure is returned wrapped, with no partial result.
func getScaletestPrebuildsTemplates(ctx context.Context, client *codersdk.Client) ([]codersdk.Template, error) {
	const marker = "prebuild"
	wantPrefix := loadtestutil.ScaleTestPrefix + "-"

	candidates, err := client.Templates(ctx, codersdk.TemplateFilter{FuzzyName: marker})
	if err != nil {
		return nil, xerrors.Errorf("list templates: %w", err)
	}

	var matched []codersdk.Template
	for _, tpl := range candidates {
		// Both conditions are required; the fuzzy server-side match alone
		// is not treated as sufficient.
		if !strings.HasPrefix(tpl.Name, wantPrefix) {
			continue
		}
		if !strings.Contains(tpl.Name, marker) {
			continue
		}
		matched = append(matched, tpl)
	}
	return matched, nil
}
func (r *RootCmd) scaletestCleanup() *serpent.Command {
var template string
cleanupStrategy := newScaletestCleanupStrategy()
@@ -555,6 +640,71 @@ func (r *RootCmd) scaletestCleanup() *serpent.Command {
}
}
cliui.Infof(inv.Stdout, "Fetching scaletest prebuild workspaces...")
prebuildWorkspaces, err := getScaletestPrebuildWorkspaces(ctx, client)
if err != nil {
return err
}
cliui.Errorf(inv.Stderr, "Found %d scaletest prebuild workspaces\n", len(prebuildWorkspaces))
if len(prebuildWorkspaces) != 0 {
cliui.Infof(inv.Stdout, "Deleting scaletest prebuild workspaces...")
prebuildWsHarness := harness.NewTestHarness(cleanupStrategy.toStrategy(), harness.ConcurrentExecutionStrategy{})
for i, ws := range prebuildWorkspaces {
const testName = "cleanup-prebuild-workspace"
prebuildWsHarness.AddRun(testName, strconv.Itoa(i), workspacebuild.NewCleanupRunner(client, ws.ID))
}
prebuildWsCtx, prebuildWsCancel := cleanupStrategy.toContext(ctx)
defer prebuildWsCancel()
if err := prebuildWsHarness.Run(prebuildWsCtx); err != nil {
return xerrors.Errorf("run test harness to delete prebuild workspaces (harness failure, not a test failure): %w", err)
}
cliui.Infof(inv.Stdout, "Done deleting scaletest prebuild workspaces:")
prebuildWsRes := prebuildWsHarness.Results()
prebuildWsRes.PrintText(inv.Stderr)
if prebuildWsRes.TotalFail > 0 {
return xerrors.Errorf("failed to delete %d scaletest prebuild workspace(s)", prebuildWsRes.TotalFail)
}
}
cliui.Infof(inv.Stdout, "Fetching scaletest prebuilds templates...")
prebuildTemplates, err := getScaletestPrebuildsTemplates(ctx, client)
if err != nil {
return err
}
cliui.Errorf(inv.Stderr, "Found %d scaletest prebuilds templates\n", len(prebuildTemplates))
if len(prebuildTemplates) != 0 {
cliui.Infof(inv.Stdout, "Deleting scaletest prebuilds templates...")
prebuildTplHarness := harness.NewTestHarness(cleanupStrategy.toStrategy(), harness.ConcurrentExecutionStrategy{})
for i, t := range prebuildTemplates {
const testName = "cleanup-prebuilds-template"
prebuildTplHarness.AddRun(testName, strconv.Itoa(i), &prebuildTemplateCleanupRunner{
client: client,
template: t,
})
}
prebuildTplCtx, prebuildTplCancel := cleanupStrategy.toContext(ctx)
defer prebuildTplCancel()
if err := prebuildTplHarness.Run(prebuildTplCtx); err != nil {
return xerrors.Errorf("run test harness to delete prebuilds templates (harness failure, not a test failure): %w", err)
}
cliui.Infof(inv.Stdout, "Done deleting scaletest prebuilds templates:")
prebuildTplRes := prebuildTplHarness.Results()
prebuildTplRes.PrintText(inv.Stderr)
if prebuildTplRes.TotalFail > 0 {
return xerrors.Errorf("failed to delete %d scaletest prebuilds template(s)", prebuildTplRes.TotalFail)
}
}
cliui.Infof(inv.Stdout, "Fetching scaletest workspaces...")
workspaces, _, err := getScaletestWorkspaces(ctx, client, "", template)
if err != nil {

View File

@@ -6,6 +6,7 @@ import (
_ "embed"
"html/template"
"io"
"net/http"
"time"
"github.com/google/uuid"
@@ -17,6 +18,7 @@ import (
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/scaletest/harness"
"github.com/coder/coder/v2/scaletest/loadtestutil"
"github.com/coder/coder/v2/scaletest/workspacebuild"
)
type Runner struct {
@@ -77,6 +79,22 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
}
templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq)
if err != nil {
// If the template already exists from a previous failed run, look it up so
// Cleanup() can delete it and the rerun doesn't leave orphaned resources.
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict {
existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{
OrganizationID: r.cfg.OrganizationID,
ExactName: templateName,
})
if listErr == nil && len(existing) > 0 {
r.template = existing[0]
logger.Warn(ctx, "template already exists from a previous run, will be cleaned up",
slog.F("template_name", r.template.Name),
slog.F("template_id", r.template.ID),
)
}
}
r.cfg.Metrics.AddError(templateName, "create_template")
return xerrors.Errorf("create template: %w", err)
}
@@ -193,13 +211,32 @@ func (r *Runner) measureCreation(ctx context.Context, logger slog.Logger) error
func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error {
deletionStartTime := time.Now().UTC()
const deletionPollInterval = 500 * time.Millisecond
targetNumWorkspaces := r.cfg.NumPresets * r.cfg.NumPresetPrebuilds
const (
deletionPollInterval = 500 * time.Millisecond
maxDeletionRetries = 3
)
deletionCtx, cancel := context.WithTimeout(ctx, r.cfg.PrebuildWorkspaceTimeout)
defer cancel()
// Capture the actual workspace count at the start of the deletion phase.
// The reconciler may have created extra workspaces beyond the configured
// target (e.g. replacements for failed builds), so using targetNumWorkspaces
// as the denominator would undercount completed deletions.
initialWorkspaces, err := r.client.Workspaces(deletionCtx, codersdk.WorkspaceFilter{
Template: r.template.Name,
})
if err != nil {
return xerrors.Errorf("list workspaces at deletion start: %w", err)
}
initialWorkspaceCount := len(initialWorkspaces.Workspaces)
// retryCount tracks how many delete builds we've submitted per workspace.
// lastRetriedBuildID prevents submitting a second retry for the same failed
// build before the API reflects the new build.
retryCount := make(map[uuid.UUID]int)
lastRetriedBuildID := make(map[uuid.UUID]uuid.UUID)
tkr := r.cfg.Clock.TickerFunc(deletionCtx, deletionPollInterval, func() error {
workspaces, err := r.client.Workspaces(deletionCtx, codersdk.WorkspaceFilter{
Template: r.template.Name,
@@ -211,20 +248,52 @@ func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error
createdCount := 0
runningCount := 0
failedCount := 0
exhaustedCount := 0
for _, ws := range workspaces.Workspaces {
if ws.LatestBuild.Transition == codersdk.WorkspaceTransitionDelete {
createdCount++
switch ws.LatestBuild.Job.Status {
case codersdk.ProvisionerJobRunning:
if ws.LatestBuild.Transition != codersdk.WorkspaceTransitionDelete {
// The reconciler hasn't submitted a delete build yet.
continue
}
createdCount++
switch ws.LatestBuild.Job.Status {
case codersdk.ProvisionerJobRunning, codersdk.ProvisionerJobPending:
runningCount++
case codersdk.ProvisionerJobFailed, codersdk.ProvisionerJobCanceled:
// Skip if we've already submitted a retry for this specific
// failed build and are waiting for the new build to appear.
if lastRetriedBuildID[ws.ID] == ws.LatestBuild.ID {
runningCount++
case codersdk.ProvisionerJobFailed, codersdk.ProvisionerJobCanceled:
failedCount++
continue
}
if retryCount[ws.ID] >= maxDeletionRetries {
exhaustedCount++
failedCount++
continue
}
retryCount[ws.ID]++
lastRetriedBuildID[ws.ID] = ws.LatestBuild.ID
logger.Warn(deletionCtx, "retrying failed workspace deletion",
slog.F("workspace_id", ws.ID),
slog.F("workspace_name", ws.Name),
slog.F("attempt", retryCount[ws.ID]),
slog.F("max_attempts", maxDeletionRetries),
)
_, retryErr := r.client.CreateWorkspaceBuild(deletionCtx, ws.ID, codersdk.CreateWorkspaceBuildRequest{
Transition: codersdk.WorkspaceTransitionDelete,
})
if retryErr != nil {
return xerrors.Errorf("retry workspace deletion (attempt %d): %w", retryCount[ws.ID], retryErr)
}
runningCount++
}
}
completedCount := targetNumWorkspaces - len(workspaces.Workspaces)
completedCount := initialWorkspaceCount - len(workspaces.Workspaces)
createdCount += completedCount
r.cfg.Metrics.SetDeletionJobsCreated(createdCount, r.template.Name)
@@ -236,9 +305,15 @@ func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error
return errTickerDone
}
// If every remaining workspace has exhausted all retries, fail
// immediately rather than waiting for the timeout.
if exhaustedCount > 0 && exhaustedCount == len(workspaces.Workspaces) {
return xerrors.Errorf("%d workspace(s) failed to delete after %d attempts", exhaustedCount, maxDeletionRetries+1)
}
return nil
}, "waitForPrebuildWorkspacesDeletion")
err := tkr.Wait()
err = tkr.Wait()
if !xerrors.Is(err, errTickerDone) {
r.cfg.Metrics.AddError(r.template.Name, "wait_for_workspace_deletion")
return xerrors.Errorf("wait for workspace deletion: %w", err)
@@ -305,10 +380,69 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
logs = loadtestutil.NewSyncWriter(logs)
logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
// If Run failed before the template was created, there is nothing to clean up.
if r.template.ID == uuid.Nil {
logger.Info(ctx, "template was never created, skipping cleanup")
return nil
}
err := r.client.DeleteTemplate(ctx, r.template.ID)
// Workspaces must be deleted before the template can be deleted.
workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name)
if err != nil {
return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err)
}
logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name))
// Retry failed workspace deletions up to maxDeletionAttempts times to
// handle transient errors (e.g. a delete build that fails due to a
// provisioner hiccup).
const maxDeletionAttempts = 3
remaining := workspaces
for attempt := range maxDeletionAttempts {
if len(remaining) == 0 {
break
}
if attempt > 0 {
logger.Info(ctx, "retrying workspace deletions",
slog.F("attempt", attempt+1),
slog.F("remaining", len(remaining)),
slog.F("template_name", r.template.Name),
)
}
var failed []codersdk.Workspace
for _, ws := range remaining {
cr := workspacebuild.NewCleanupRunner(r.client, ws.ID)
if err := cr.Run(ctx, ws.ID.String(), logs); err != nil {
logger.Warn(ctx, "failed to delete workspace",
slog.F("workspace_id", ws.ID),
slog.F("workspace_name", ws.Name),
slog.Error(err),
)
failed = append(failed, ws)
}
}
remaining = failed
}
if len(remaining) > 0 {
ids := make([]string, len(remaining))
for i, ws := range remaining {
ids[i] = ws.ID.String()
}
logger.Error(ctx, "CLEANUP INCOMPLETE: could not delete all workspaces after retries; template deletion will likely fail",
slog.F("template_name", r.template.Name),
slog.F("remaining_count", len(remaining)),
slog.F("remaining_workspace_ids", ids),
)
}
// Always attempt template deletion even if some workspaces could not be
// removed. The delete call will fail with a 400 if workspaces remain, but
// the error — combined with the log above — makes the state clear to the
// operator.
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil {
return xerrors.Errorf("delete template: %w", err)
}
@@ -316,6 +450,28 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
return nil
}
// allWorkspacesForTemplate fetches every workspace whose template filter
// matches templateName. Results are requested 100 at a time by advancing the
// offset, and the walk stops at the first page that comes back shorter than a
// full page. Any listing error is returned wrapped with the failing page
// number, and no partial result is returned.
func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) {
	const pageSize = 100

	filter := codersdk.WorkspaceFilter{
		Template: templateName,
		Limit:    pageSize,
	}

	var all []codersdk.Workspace
	for page := 0; ; page++ {
		filter.Offset = page * pageSize
		resp, err := client.Workspaces(ctx, filter)
		if err != nil {
			return nil, xerrors.Errorf("list workspaces page %d: %w", page, err)
		}
		all = append(all, resp.Workspaces...)
		// A short (or empty) page means there is nothing left to fetch.
		if len(resp.Workspaces) < pageSize {
			return all, nil
		}
	}
}
//go:embed tf/main.tf.tpl
var templateContent string