Compare commits
4 Commits
main
...
callum/pre
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0b73a4f24f | ||
|
|
386206fe77 | ||
|
|
7f8e6c19e0 | ||
|
|
d72d5a649f |
@@ -519,6 +519,91 @@ func (r *userCleanupRunner) Run(ctx context.Context, _ string, _ io.Writer) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
// prebuildTemplateCleanupRunner deletes a single scaletest prebuilds template.
|
||||
// All prebuild workspaces must be deleted before this runs.
|
||||
type prebuildTemplateCleanupRunner struct {
|
||||
client *codersdk.Client
|
||||
template codersdk.Template
|
||||
}
|
||||
|
||||
var _ harness.Runnable = &prebuildTemplateCleanupRunner{}
|
||||
|
||||
// Run implements Runnable.
|
||||
func (r *prebuildTemplateCleanupRunner) Run(ctx context.Context, _ string, _ io.Writer) error {
|
||||
ctx, span := tracing.StartSpan(ctx)
|
||||
defer span.End()
|
||||
|
||||
if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil {
|
||||
return xerrors.Errorf("delete template %q: %w", r.template.Name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// getScaletestPrebuildWorkspaces returns all prebuild workspaces by querying
|
||||
// on owner (the prebuilds system user) and on name (substring "prebuild"),
|
||||
// merging and deduplicating the results. Both filters are used because a
|
||||
// workspace might only match one depending on how it was created.
|
||||
func getScaletestPrebuildWorkspaces(ctx context.Context, client *codersdk.Client) ([]codersdk.Workspace, error) {
|
||||
const pageSize = 100
|
||||
|
||||
seen := make(map[uuid.UUID]struct{})
|
||||
var result []codersdk.Workspace
|
||||
|
||||
// paginateWorkspaces appends all pages for the given filter, skipping
|
||||
// workspaces already seen by a previous query.
|
||||
paginateWorkspaces := func(filter codersdk.WorkspaceFilter) error {
|
||||
for page := 0; ; page++ {
|
||||
filter.Offset = page * pageSize
|
||||
filter.Limit = pageSize
|
||||
resp, err := client.Workspaces(ctx, filter)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("list prebuild workspaces (page %d): %w", page, err)
|
||||
}
|
||||
for _, ws := range resp.Workspaces {
|
||||
if _, ok := seen[ws.ID]; !ok {
|
||||
seen[ws.ID] = struct{}{}
|
||||
result = append(result, ws)
|
||||
}
|
||||
}
|
||||
if len(resp.Workspaces) < pageSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Query by owner first (the prebuilds system user), then by name substring
|
||||
// to catch any workspaces that might not match on owner alone.
|
||||
if err := paginateWorkspaces(codersdk.WorkspaceFilter{Owner: "prebuilds"}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := paginateWorkspaces(codersdk.WorkspaceFilter{Name: "prebuild"}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// getScaletestPrebuildsTemplates returns all templates that look like they were
|
||||
// created by the scaletest prebuilds command: they must have the scaletest
|
||||
// prefix and contain "prebuild" anywhere in the name.
|
||||
func getScaletestPrebuildsTemplates(ctx context.Context, client *codersdk.Client) ([]codersdk.Template, error) {
|
||||
templates, err := client.Templates(ctx, codersdk.TemplateFilter{
|
||||
FuzzyName: "prebuild",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("list templates: %w", err)
|
||||
}
|
||||
|
||||
var result []codersdk.Template
|
||||
for _, t := range templates {
|
||||
if strings.HasPrefix(t.Name, loadtestutil.ScaleTestPrefix+"-") && strings.Contains(t.Name, "prebuild") {
|
||||
result = append(result, t)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (r *RootCmd) scaletestCleanup() *serpent.Command {
|
||||
var template string
|
||||
cleanupStrategy := newScaletestCleanupStrategy()
|
||||
@@ -555,6 +640,71 @@ func (r *RootCmd) scaletestCleanup() *serpent.Command {
|
||||
}
|
||||
}
|
||||
|
||||
cliui.Infof(inv.Stdout, "Fetching scaletest prebuild workspaces...")
|
||||
prebuildWorkspaces, err := getScaletestPrebuildWorkspaces(ctx, client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cliui.Errorf(inv.Stderr, "Found %d scaletest prebuild workspaces\n", len(prebuildWorkspaces))
|
||||
if len(prebuildWorkspaces) != 0 {
|
||||
cliui.Infof(inv.Stdout, "Deleting scaletest prebuild workspaces...")
|
||||
prebuildWsHarness := harness.NewTestHarness(cleanupStrategy.toStrategy(), harness.ConcurrentExecutionStrategy{})
|
||||
|
||||
for i, ws := range prebuildWorkspaces {
|
||||
const testName = "cleanup-prebuild-workspace"
|
||||
prebuildWsHarness.AddRun(testName, strconv.Itoa(i), workspacebuild.NewCleanupRunner(client, ws.ID))
|
||||
}
|
||||
|
||||
prebuildWsCtx, prebuildWsCancel := cleanupStrategy.toContext(ctx)
|
||||
defer prebuildWsCancel()
|
||||
if err := prebuildWsHarness.Run(prebuildWsCtx); err != nil {
|
||||
return xerrors.Errorf("run test harness to delete prebuild workspaces (harness failure, not a test failure): %w", err)
|
||||
}
|
||||
|
||||
cliui.Infof(inv.Stdout, "Done deleting scaletest prebuild workspaces:")
|
||||
prebuildWsRes := prebuildWsHarness.Results()
|
||||
prebuildWsRes.PrintText(inv.Stderr)
|
||||
|
||||
if prebuildWsRes.TotalFail > 0 {
|
||||
return xerrors.Errorf("failed to delete %d scaletest prebuild workspace(s)", prebuildWsRes.TotalFail)
|
||||
}
|
||||
}
|
||||
|
||||
cliui.Infof(inv.Stdout, "Fetching scaletest prebuilds templates...")
|
||||
prebuildTemplates, err := getScaletestPrebuildsTemplates(ctx, client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cliui.Errorf(inv.Stderr, "Found %d scaletest prebuilds templates\n", len(prebuildTemplates))
|
||||
if len(prebuildTemplates) != 0 {
|
||||
cliui.Infof(inv.Stdout, "Deleting scaletest prebuilds templates...")
|
||||
prebuildTplHarness := harness.NewTestHarness(cleanupStrategy.toStrategy(), harness.ConcurrentExecutionStrategy{})
|
||||
|
||||
for i, t := range prebuildTemplates {
|
||||
const testName = "cleanup-prebuilds-template"
|
||||
prebuildTplHarness.AddRun(testName, strconv.Itoa(i), &prebuildTemplateCleanupRunner{
|
||||
client: client,
|
||||
template: t,
|
||||
})
|
||||
}
|
||||
|
||||
prebuildTplCtx, prebuildTplCancel := cleanupStrategy.toContext(ctx)
|
||||
defer prebuildTplCancel()
|
||||
if err := prebuildTplHarness.Run(prebuildTplCtx); err != nil {
|
||||
return xerrors.Errorf("run test harness to delete prebuilds templates (harness failure, not a test failure): %w", err)
|
||||
}
|
||||
|
||||
cliui.Infof(inv.Stdout, "Done deleting scaletest prebuilds templates:")
|
||||
prebuildTplRes := prebuildTplHarness.Results()
|
||||
prebuildTplRes.PrintText(inv.Stderr)
|
||||
|
||||
if prebuildTplRes.TotalFail > 0 {
|
||||
return xerrors.Errorf("failed to delete %d scaletest prebuilds template(s)", prebuildTplRes.TotalFail)
|
||||
}
|
||||
}
|
||||
|
||||
cliui.Infof(inv.Stdout, "Fetching scaletest workspaces...")
|
||||
workspaces, _, err := getScaletestWorkspaces(ctx, client, "", template)
|
||||
if err != nil {
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
_ "embed"
|
||||
"html/template"
|
||||
"io"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -17,6 +18,7 @@ import (
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/scaletest/harness"
|
||||
"github.com/coder/coder/v2/scaletest/loadtestutil"
|
||||
"github.com/coder/coder/v2/scaletest/workspacebuild"
|
||||
)
|
||||
|
||||
type Runner struct {
|
||||
@@ -77,6 +79,22 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
|
||||
}
|
||||
templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq)
|
||||
if err != nil {
|
||||
// If the template already exists from a previous failed run, look it up so
|
||||
// Cleanup() can delete it and the rerun doesn't leave orphaned resources.
|
||||
var sdkErr *codersdk.Error
|
||||
if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict {
|
||||
existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{
|
||||
OrganizationID: r.cfg.OrganizationID,
|
||||
ExactName: templateName,
|
||||
})
|
||||
if listErr == nil && len(existing) > 0 {
|
||||
r.template = existing[0]
|
||||
logger.Warn(ctx, "template already exists from a previous run, will be cleaned up",
|
||||
slog.F("template_name", r.template.Name),
|
||||
slog.F("template_id", r.template.ID),
|
||||
)
|
||||
}
|
||||
}
|
||||
r.cfg.Metrics.AddError(templateName, "create_template")
|
||||
return xerrors.Errorf("create template: %w", err)
|
||||
}
|
||||
@@ -193,13 +211,32 @@ func (r *Runner) measureCreation(ctx context.Context, logger slog.Logger) error
|
||||
|
||||
func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error {
|
||||
deletionStartTime := time.Now().UTC()
|
||||
const deletionPollInterval = 500 * time.Millisecond
|
||||
|
||||
targetNumWorkspaces := r.cfg.NumPresets * r.cfg.NumPresetPrebuilds
|
||||
const (
|
||||
deletionPollInterval = 500 * time.Millisecond
|
||||
maxDeletionRetries = 3
|
||||
)
|
||||
|
||||
deletionCtx, cancel := context.WithTimeout(ctx, r.cfg.PrebuildWorkspaceTimeout)
|
||||
defer cancel()
|
||||
|
||||
// Capture the actual workspace count at the start of the deletion phase.
|
||||
// The reconciler may have created extra workspaces beyond the configured
|
||||
// target (e.g. replacements for failed builds), so using targetNumWorkspaces
|
||||
// as the denominator would undercount completed deletions.
|
||||
initialWorkspaces, err := r.client.Workspaces(deletionCtx, codersdk.WorkspaceFilter{
|
||||
Template: r.template.Name,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("list workspaces at deletion start: %w", err)
|
||||
}
|
||||
initialWorkspaceCount := len(initialWorkspaces.Workspaces)
|
||||
|
||||
// retryCount tracks how many delete builds we've submitted per workspace.
|
||||
// lastRetriedBuildID prevents submitting a second retry for the same failed
|
||||
// build before the API reflects the new build.
|
||||
retryCount := make(map[uuid.UUID]int)
|
||||
lastRetriedBuildID := make(map[uuid.UUID]uuid.UUID)
|
||||
|
||||
tkr := r.cfg.Clock.TickerFunc(deletionCtx, deletionPollInterval, func() error {
|
||||
workspaces, err := r.client.Workspaces(deletionCtx, codersdk.WorkspaceFilter{
|
||||
Template: r.template.Name,
|
||||
@@ -211,20 +248,52 @@ func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error
|
||||
createdCount := 0
|
||||
runningCount := 0
|
||||
failedCount := 0
|
||||
exhaustedCount := 0
|
||||
|
||||
for _, ws := range workspaces.Workspaces {
|
||||
if ws.LatestBuild.Transition == codersdk.WorkspaceTransitionDelete {
|
||||
createdCount++
|
||||
switch ws.LatestBuild.Job.Status {
|
||||
case codersdk.ProvisionerJobRunning:
|
||||
if ws.LatestBuild.Transition != codersdk.WorkspaceTransitionDelete {
|
||||
// The reconciler hasn't submitted a delete build yet.
|
||||
continue
|
||||
}
|
||||
createdCount++
|
||||
|
||||
switch ws.LatestBuild.Job.Status {
|
||||
case codersdk.ProvisionerJobRunning, codersdk.ProvisionerJobPending:
|
||||
runningCount++
|
||||
|
||||
case codersdk.ProvisionerJobFailed, codersdk.ProvisionerJobCanceled:
|
||||
// Skip if we've already submitted a retry for this specific
|
||||
// failed build and are waiting for the new build to appear.
|
||||
if lastRetriedBuildID[ws.ID] == ws.LatestBuild.ID {
|
||||
runningCount++
|
||||
case codersdk.ProvisionerJobFailed, codersdk.ProvisionerJobCanceled:
|
||||
failedCount++
|
||||
continue
|
||||
}
|
||||
|
||||
if retryCount[ws.ID] >= maxDeletionRetries {
|
||||
exhaustedCount++
|
||||
failedCount++
|
||||
continue
|
||||
}
|
||||
|
||||
retryCount[ws.ID]++
|
||||
lastRetriedBuildID[ws.ID] = ws.LatestBuild.ID
|
||||
logger.Warn(deletionCtx, "retrying failed workspace deletion",
|
||||
slog.F("workspace_id", ws.ID),
|
||||
slog.F("workspace_name", ws.Name),
|
||||
slog.F("attempt", retryCount[ws.ID]),
|
||||
slog.F("max_attempts", maxDeletionRetries),
|
||||
)
|
||||
_, retryErr := r.client.CreateWorkspaceBuild(deletionCtx, ws.ID, codersdk.CreateWorkspaceBuildRequest{
|
||||
Transition: codersdk.WorkspaceTransitionDelete,
|
||||
})
|
||||
if retryErr != nil {
|
||||
return xerrors.Errorf("retry workspace deletion (attempt %d): %w", retryCount[ws.ID], retryErr)
|
||||
}
|
||||
runningCount++
|
||||
}
|
||||
}
|
||||
|
||||
completedCount := targetNumWorkspaces - len(workspaces.Workspaces)
|
||||
completedCount := initialWorkspaceCount - len(workspaces.Workspaces)
|
||||
createdCount += completedCount
|
||||
|
||||
r.cfg.Metrics.SetDeletionJobsCreated(createdCount, r.template.Name)
|
||||
@@ -236,9 +305,15 @@ func (r *Runner) measureDeletion(ctx context.Context, logger slog.Logger) error
|
||||
return errTickerDone
|
||||
}
|
||||
|
||||
// If every remaining workspace has exhausted all retries, fail
|
||||
// immediately rather than waiting for the timeout.
|
||||
if exhaustedCount > 0 && exhaustedCount == len(workspaces.Workspaces) {
|
||||
return xerrors.Errorf("%d workspace(s) failed to delete after %d attempts", exhaustedCount, maxDeletionRetries+1)
|
||||
}
|
||||
|
||||
return nil
|
||||
}, "waitForPrebuildWorkspacesDeletion")
|
||||
err := tkr.Wait()
|
||||
err = tkr.Wait()
|
||||
if !xerrors.Is(err, errTickerDone) {
|
||||
r.cfg.Metrics.AddError(r.template.Name, "wait_for_workspace_deletion")
|
||||
return xerrors.Errorf("wait for workspace deletion: %w", err)
|
||||
@@ -305,10 +380,69 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
|
||||
logs = loadtestutil.NewSyncWriter(logs)
|
||||
logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)
|
||||
|
||||
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
|
||||
// If Run failed before the template was created, there is nothing to clean up.
|
||||
if r.template.ID == uuid.Nil {
|
||||
logger.Info(ctx, "template was never created, skipping cleanup")
|
||||
return nil
|
||||
}
|
||||
|
||||
err := r.client.DeleteTemplate(ctx, r.template.ID)
|
||||
// Workspaces must be deleted before the template can be deleted.
|
||||
workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err)
|
||||
}
|
||||
|
||||
logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name))
|
||||
|
||||
// Retry failed workspace deletions up to maxDeletionAttempts times to
|
||||
// handle transient errors (e.g. a delete build that fails due to a
|
||||
// provisioner hiccup).
|
||||
const maxDeletionAttempts = 3
|
||||
remaining := workspaces
|
||||
for attempt := range maxDeletionAttempts {
|
||||
if len(remaining) == 0 {
|
||||
break
|
||||
}
|
||||
if attempt > 0 {
|
||||
logger.Info(ctx, "retrying workspace deletions",
|
||||
slog.F("attempt", attempt+1),
|
||||
slog.F("remaining", len(remaining)),
|
||||
slog.F("template_name", r.template.Name),
|
||||
)
|
||||
}
|
||||
var failed []codersdk.Workspace
|
||||
for _, ws := range remaining {
|
||||
cr := workspacebuild.NewCleanupRunner(r.client, ws.ID)
|
||||
if err := cr.Run(ctx, ws.ID.String(), logs); err != nil {
|
||||
logger.Warn(ctx, "failed to delete workspace",
|
||||
slog.F("workspace_id", ws.ID),
|
||||
slog.F("workspace_name", ws.Name),
|
||||
slog.Error(err),
|
||||
)
|
||||
failed = append(failed, ws)
|
||||
}
|
||||
}
|
||||
remaining = failed
|
||||
}
|
||||
|
||||
if len(remaining) > 0 {
|
||||
ids := make([]string, len(remaining))
|
||||
for i, ws := range remaining {
|
||||
ids[i] = ws.ID.String()
|
||||
}
|
||||
logger.Error(ctx, "CLEANUP INCOMPLETE: could not delete all workspaces after retries; template deletion will likely fail",
|
||||
slog.F("template_name", r.template.Name),
|
||||
slog.F("remaining_count", len(remaining)),
|
||||
slog.F("remaining_workspace_ids", ids),
|
||||
)
|
||||
}
|
||||
|
||||
// Always attempt template deletion even if some workspaces could not be
|
||||
// removed. The delete call will fail with a 400 if workspaces remain, but
|
||||
// the error — combined with the log above — makes the state clear to the
|
||||
// operator.
|
||||
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
|
||||
if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil {
|
||||
return xerrors.Errorf("delete template: %w", err)
|
||||
}
|
||||
|
||||
@@ -316,6 +450,28 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// allWorkspacesForTemplate returns all workspaces belonging to templateName,
|
||||
// paginating through results until exhausted.
|
||||
func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) {
|
||||
const pageSize = 100
|
||||
var workspaces []codersdk.Workspace
|
||||
for page := 0; ; page++ {
|
||||
resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{
|
||||
Template: templateName,
|
||||
Offset: page * pageSize,
|
||||
Limit: pageSize,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("list workspaces page %d: %w", page, err)
|
||||
}
|
||||
workspaces = append(workspaces, resp.Workspaces...)
|
||||
if len(resp.Workspaces) < pageSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
return workspaces, nil
|
||||
}
|
||||
|
||||
//go:embed tf/main.tf.tpl
|
||||
var templateContent string
|
||||
|
||||
|
||||
Reference in New Issue
Block a user