Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c525f02370 | |||
| 1b10d94be0 | |||
| 5fd7d403c7 | |||
| fb63db7121 | |||
| 1a57963b86 |
@@ -7,6 +7,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@@ -16,6 +17,7 @@ import (
|
||||
"github.com/hashicorp/go-multierror"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/sqlc-dev/pqtype"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
@@ -37,6 +39,72 @@ import (
|
||||
"github.com/coder/quartz"
|
||||
)
|
||||
|
||||
// presetFailuresTracker tracks creation failures for presets to implement incremental backoff.
|
||||
type presetFailuresTracker struct {
|
||||
failures map[uuid.UUID]*presetCreationFailure
|
||||
mu sync.RWMutex
|
||||
clock quartz.Clock
|
||||
}
|
||||
|
||||
// presetCreationFailure is the failure state kept for a single preset; it is
// the input to the incremental backoff calculation.
type presetCreationFailure struct {
	// lastFailureAt is the time at which the most recent failure was recorded.
	lastFailureAt time.Time
	// consecutiveFailures is the number of failures recorded in a row.
	consecutiveFailures int
}
|
||||
|
||||
func newPresetFailuresTracker(clock quartz.Clock) *presetFailuresTracker {
|
||||
return &presetFailuresTracker{
|
||||
failures: make(map[uuid.UUID]*presetCreationFailure),
|
||||
clock: clock,
|
||||
}
|
||||
}
|
||||
|
||||
// RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
|
||||
func (t *presetFailuresTracker) RecordFailure(presetID uuid.UUID) {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
|
||||
failure, exists := t.failures[presetID]
|
||||
if !exists {
|
||||
failure = &presetCreationFailure{}
|
||||
t.failures[presetID] = failure
|
||||
}
|
||||
|
||||
failure.consecutiveFailures++
|
||||
failure.lastFailureAt = t.clock.Now()
|
||||
}
|
||||
|
||||
// RecordSuccess clears the failure tracking for a preset after a successful creation.
|
||||
func (t *presetFailuresTracker) RecordSuccess(presetID uuid.UUID) {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
|
||||
delete(t.failures, presetID)
|
||||
}
|
||||
|
||||
// ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures.
|
||||
// It returns true and the backoff time if we should delay, false and zero time otherwise.
|
||||
func (t *presetFailuresTracker) ShouldBackoff(presetID uuid.UUID, backoffInterval time.Duration) (bool, time.Time) {
|
||||
t.mu.RLock()
|
||||
defer t.mu.RUnlock()
|
||||
|
||||
failure, exists := t.failures[presetID]
|
||||
if !exists || failure.consecutiveFailures == 0 {
|
||||
return false, time.Time{}
|
||||
}
|
||||
|
||||
// Calculate exponential backoff: backoffInterval * consecutiveFailures
|
||||
// This gives us a linear backoff that increases with each consecutive failure.
|
||||
backoffDuration := backoffInterval * time.Duration(failure.consecutiveFailures)
|
||||
backoffUntil := failure.lastFailureAt.Add(backoffDuration)
|
||||
|
||||
if t.clock.Now().Before(backoffUntil) {
|
||||
return true, backoffUntil
|
||||
}
|
||||
|
||||
return false, time.Time{}
|
||||
}
|
||||
|
||||
type StoreReconciler struct {
|
||||
store database.Store
|
||||
cfg codersdk.PrebuildsConfig
|
||||
@@ -58,6 +126,9 @@ type StoreReconciler struct {
|
||||
metrics *MetricsCollector
|
||||
// Operational metrics
|
||||
reconciliationDuration prometheus.Histogram
|
||||
|
||||
// Per-preset creation failure tracking for incremental backoff
|
||||
failureTracker *presetFailuresTracker
|
||||
}
|
||||
|
||||
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
|
||||
@@ -102,6 +173,7 @@ func NewStoreReconciler(store database.Store,
|
||||
buildUsageChecker: buildUsageChecker,
|
||||
done: make(chan struct{}, 1),
|
||||
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
|
||||
failureTracker: newPresetFailuresTracker(clock),
|
||||
}
|
||||
|
||||
if registerer != nil {
|
||||
@@ -124,6 +196,22 @@ func NewStoreReconciler(store database.Store,
|
||||
return reconciler
|
||||
}
|
||||
|
||||
// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
|
||||
func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) {
|
||||
c.failureTracker.RecordFailure(presetID)
|
||||
}
|
||||
|
||||
// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
|
||||
func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) {
|
||||
c.failureTracker.RecordSuccess(presetID)
|
||||
}
|
||||
|
||||
// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
|
||||
// It returns true and the backoff time if we should delay, false and zero time otherwise.
|
||||
func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
|
||||
return c.failureTracker.ShouldBackoff(presetID, c.cfg.ReconciliationBackoffInterval.Value())
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) Run(ctx context.Context) {
|
||||
reconciliationInterval := c.cfg.ReconciliationInterval.Value()
|
||||
if reconciliationInterval <= 0 { // avoids a panic
|
||||
@@ -643,6 +731,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
|
||||
return nil
|
||||
|
||||
case prebuilds.ActionTypeCreate:
|
||||
// Check if we should backoff on this preset due to recent creation failures
|
||||
if shouldBackoff, backoffUntil := c.failureTracker.ShouldBackoff(ps.Preset.ID, c.cfg.ReconciliationBackoffInterval.Value()); shouldBackoff {
|
||||
logger.Warn(ctx, "backing off prebuild creation due to recent failures",
|
||||
slog.F("preset_id", ps.Preset.ID.String()),
|
||||
slog.F("backoff_until", backoffUntil.Format(time.RFC3339)),
|
||||
slog.F("backoff_secs", math.Round(backoffUntil.Sub(c.clock.Now()).Seconds())),
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
|
||||
// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
|
||||
// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
|
||||
@@ -666,7 +764,18 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
|
||||
for range action.Create {
|
||||
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
||||
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
||||
|
||||
// Only apply backoff for transient errors (500-level).
|
||||
// Config errors (400-level) should fail immediately and count toward the hard limit.
|
||||
var buildErr wsbuilder.BuildError
|
||||
if errors.As(err, &buildErr) && buildErr.Status == http.StatusInternalServerError {
|
||||
c.failureTracker.RecordFailure(ps.Preset.ID)
|
||||
}
|
||||
|
||||
multiErr.Errors = append(multiErr.Errors, err)
|
||||
} else {
|
||||
// Only clear failure tracking if we successfully created at least one prebuild
|
||||
c.failureTracker.RecordSuccess(ps.Preset.ID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -734,7 +843,22 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
|
||||
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
|
||||
|
||||
provisionerJob, err = c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
|
||||
return err
|
||||
if err != nil {
|
||||
// Check if this is a config error (non-transient) from wsbuilder.
|
||||
// If so, create a failed build record so it counts toward the hard limit.
|
||||
var buildErr wsbuilder.BuildError
|
||||
if errors.As(err, &buildErr) && buildErr.Status != http.StatusInternalServerError {
|
||||
// This is a config error (400-level). Create a failed build record
|
||||
// so it counts toward the hard failure limit.
|
||||
if failErr := c.createFailedBuildRecord(ctx, db, workspace, template, presetID, now, buildErr); failErr != nil {
|
||||
c.logger.Warn(ctx, "failed to create failed build record for config error",
|
||||
slog.Error(failErr),
|
||||
slog.F("original_error", err.Error()))
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead,
|
||||
ReadOnly: false,
|
||||
@@ -749,6 +873,105 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
|
||||
return nil
|
||||
}
|
||||
|
||||
// createFailedBuildRecord creates a workspace build and provisioner job record marked as failed.
|
||||
// This allows config errors that fail at wsbuilder.Build() time to count toward the hard failure limit.
|
||||
// The hard limit query checks workspace_latest_builds.job_status, which is derived from the provisioner job.
|
||||
//
|
||||
// IMPORTANT: This function must be called within a database transaction.
|
||||
func (c *StoreReconciler) createFailedBuildRecord(
|
||||
ctx context.Context,
|
||||
db database.Store,
|
||||
workspace database.Workspace,
|
||||
template database.Template,
|
||||
presetID uuid.UUID,
|
||||
now time.Time,
|
||||
buildErr wsbuilder.BuildError,
|
||||
) error {
|
||||
// Get template version job to populate provisioner job fields
|
||||
templateVersion, err := db.GetTemplateVersionByID(ctx, template.ActiveVersionID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get template version: %w", err)
|
||||
}
|
||||
|
||||
templateVersionJob, err := db.GetProvisionerJobByID(ctx, templateVersion.JobID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get template version job: %w", err)
|
||||
}
|
||||
|
||||
// Create a provisioner job marked as failed
|
||||
provisionerJobID := uuid.New()
|
||||
_, err = db.InsertProvisionerJob(ctx, database.InsertProvisionerJobParams{
|
||||
ID: provisionerJobID,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
InitiatorID: database.PrebuildsSystemUserID,
|
||||
OrganizationID: template.OrganizationID,
|
||||
Provisioner: template.Provisioner,
|
||||
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
||||
StorageMethod: templateVersionJob.StorageMethod,
|
||||
FileID: templateVersionJob.FileID,
|
||||
Input: []byte("{}"), // Empty input since we never got to build
|
||||
Tags: database.StringMap{},
|
||||
TraceMetadata: pqtype.NullRawMessage{Valid: false},
|
||||
LogsOverflowed: false,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("insert provisioner job: %w", err)
|
||||
}
|
||||
|
||||
// Mark the job as failed immediately
|
||||
// nolint: gocritic // At this moment, we are pretending to be provisionerd.
|
||||
err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(dbauthz.AsProvisionerd(ctx), database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{
|
||||
ID: provisionerJobID,
|
||||
UpdatedAt: now,
|
||||
CompletedAt: sql.NullTime{Valid: true, Time: now},
|
||||
StartedAt: sql.NullTime{Valid: true, Time: now},
|
||||
Error: sql.NullString{Valid: true, String: buildErr.Message},
|
||||
ErrorCode: sql.NullString{Valid: false},
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("mark provisioner job as failed: %w", err)
|
||||
}
|
||||
|
||||
// Create workspace build linking to the failed job
|
||||
workspaceBuildID := uuid.New()
|
||||
buildNumber := int32(1) // This will be the first build for this workspace
|
||||
if latestBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, workspace.ID); err == nil {
|
||||
buildNumber = latestBuild.BuildNumber + 1
|
||||
}
|
||||
|
||||
err = db.InsertWorkspaceBuild(ctx, database.InsertWorkspaceBuildParams{
|
||||
ID: workspaceBuildID,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
WorkspaceID: workspace.ID,
|
||||
TemplateVersionID: template.ActiveVersionID,
|
||||
BuildNumber: buildNumber,
|
||||
ProvisionerState: []byte("[]"), // Empty state since we never provisioned
|
||||
InitiatorID: database.PrebuildsSystemUserID,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
JobID: provisionerJobID,
|
||||
Reason: database.BuildReasonInitiator,
|
||||
Deadline: time.Time{},
|
||||
MaxDeadline: time.Time{},
|
||||
TemplateVersionPresetID: uuid.NullUUID{
|
||||
UUID: presetID,
|
||||
Valid: true,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("insert workspace build: %w", err)
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "created failed build record for config error",
|
||||
slog.F("workspace_id", workspace.ID.String()),
|
||||
slog.F("build_id", workspaceBuildID.String()),
|
||||
slog.F("preset_id", presetID.String()),
|
||||
slog.F("error", buildErr.Message))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// provisionDelete provisions a delete transition for a prebuilt workspace.
|
||||
//
|
||||
// If mode is DeprovisionModeOrphan, the builder will not send Terraform state to the provisioner.
|
||||
|
||||
@@ -3,6 +3,7 @@ package prebuilds_test
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@@ -21,6 +22,7 @@ import (
|
||||
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/dbfake"
|
||||
"github.com/coder/coder/v2/coderd/database/dbgen"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
@@ -2972,3 +2974,336 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming")
|
||||
}
|
||||
|
||||
func TestIncrementalBackoffOnCreationFailure(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
clock := quartz.NewMock(t)
|
||||
db, ps := dbtestutil.NewDB(t)
|
||||
backoffInterval := 1 * time.Minute
|
||||
cfg := codersdk.PrebuildsConfig{
|
||||
ReconciliationInterval: serpent.Duration(testutil.WaitLong),
|
||||
ReconciliationBackoffInterval: serpent.Duration(backoffInterval),
|
||||
}
|
||||
logger := slogtest.Make(t, nil)
|
||||
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
|
||||
reconciler := prebuilds.NewStoreReconciler(db, ps, cache, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer(), newNoopUsageCheckerPtr())
|
||||
|
||||
// Setup a template with a preset
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
user := dbgen.User(t, db, database.User{})
|
||||
template := dbgen.Template(t, db, database.Template{
|
||||
CreatedBy: user.ID,
|
||||
OrganizationID: org.ID,
|
||||
})
|
||||
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, user.ID, template.ID)
|
||||
presetID := setupTestDBPreset(t, db, templateVersionID, 1, "test").ID
|
||||
|
||||
// Test the backoff mechanism directly by simulating failures
|
||||
// First failure
|
||||
reconciler.RecordCreationFailure(presetID)
|
||||
|
||||
// Check that backoff is active
|
||||
shouldBackoff, backoffUntil := reconciler.ShouldBackoffCreation(presetID)
|
||||
require.True(t, shouldBackoff, "should be in backoff after first failure")
|
||||
expectedBackoff := clock.Now().Add(backoffInterval)
|
||||
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 1x interval after first failure")
|
||||
|
||||
// Advance clock past first backoff
|
||||
clock.Advance(backoffInterval + time.Second)
|
||||
|
||||
// Should no longer be in backoff
|
||||
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.False(t, shouldBackoff, "should not be in backoff after period expires")
|
||||
|
||||
// Second consecutive failure
|
||||
reconciler.RecordCreationFailure(presetID)
|
||||
|
||||
// Check that backoff is longer now (2 * interval)
|
||||
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.True(t, shouldBackoff, "should be in backoff after second failure")
|
||||
expectedBackoff = clock.Now().Add(2 * backoffInterval)
|
||||
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 2x interval after second failure")
|
||||
|
||||
// Advance clock by only 1 interval - should still be in backoff
|
||||
clock.Advance(backoffInterval)
|
||||
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.True(t, shouldBackoff, "should still be in backoff after 1 interval with 2 failures")
|
||||
|
||||
// Advance clock by another interval - backoff should expire
|
||||
clock.Advance(backoffInterval + time.Second)
|
||||
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.False(t, shouldBackoff, "should not be in backoff after 2 intervals expire")
|
||||
|
||||
// Third consecutive failure
|
||||
reconciler.RecordCreationFailure(presetID)
|
||||
|
||||
// Check that backoff is even longer now (3 * interval)
|
||||
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.True(t, shouldBackoff, "should be in backoff after third failure")
|
||||
expectedBackoff = clock.Now().Add(3 * backoffInterval)
|
||||
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 3x interval after third failure")
|
||||
|
||||
// Successful creation should clear the backoff
|
||||
reconciler.RecordCreationSuccess(presetID)
|
||||
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.False(t, shouldBackoff, "should not be in backoff after successful creation")
|
||||
|
||||
// New failure after success should start backoff from 1x interval again
|
||||
reconciler.RecordCreationFailure(presetID)
|
||||
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
|
||||
require.True(t, shouldBackoff, "should be in backoff after failure following success")
|
||||
expectedBackoff = clock.Now().Add(backoffInterval)
|
||||
require.Equal(t, expectedBackoff, backoffUntil, "backoff should reset to 1x interval after success")
|
||||
}
|
||||
|
||||
func TestHardFailureLimitTracking(t *testing.T) {
|
||||
// This test verifies that failed prebuild attempts are correctly tracked
|
||||
// in the database and counted by GetPresetsAtFailureLimit.
|
||||
// Similar to TestIncrementalBackoffOnCreationFailure, this test manually
|
||||
// creates the database state rather than running the full reconciliation.
|
||||
t.Parallel()
|
||||
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
ctx = dbauthz.AsSystemRestricted(ctx)
|
||||
clock := quartz.NewMock(t)
|
||||
db, ps := dbtestutil.NewDB(t)
|
||||
|
||||
// Setup template with preset
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
user := dbgen.User(t, db, database.User{})
|
||||
template := dbgen.Template(t, db, database.Template{
|
||||
CreatedBy: user.ID,
|
||||
OrganizationID: org.ID,
|
||||
})
|
||||
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, user.ID, template.ID)
|
||||
preset := setupTestDBPreset(t, db, templateVersionID, 3, "test-preset")
|
||||
|
||||
// Get the template version for provisioner job setup
|
||||
templateVersion, err := db.GetTemplateVersionByID(ctx, templateVersionID)
|
||||
require.NoError(t, err)
|
||||
templateVersionJob, err := db.GetProvisionerJobByID(ctx, templateVersion.JobID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Helper to create a failed prebuild workspace build
|
||||
createFailedPrebuild := func(buildNum int) {
|
||||
// Create workspace for this prebuild
|
||||
workspace := dbgen.Workspace(t, db, database.Workspace{
|
||||
TemplateID: template.ID,
|
||||
OrganizationID: org.ID,
|
||||
OwnerID: database.PrebuildsSystemUserID,
|
||||
Name: fmt.Sprintf("prebuild-%d-%d", preset.ID, buildNum),
|
||||
})
|
||||
|
||||
// Create failed provisioner job
|
||||
now := clock.Now()
|
||||
job, err := db.InsertProvisionerJob(ctx, database.InsertProvisionerJobParams{
|
||||
ID: uuid.New(),
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
InitiatorID: database.PrebuildsSystemUserID,
|
||||
OrganizationID: org.ID,
|
||||
Provisioner: template.Provisioner,
|
||||
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
||||
StorageMethod: templateVersionJob.StorageMethod,
|
||||
FileID: templateVersionJob.FileID,
|
||||
Input: []byte("{}"),
|
||||
Tags: database.StringMap{},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Mark job as failed - this sets job_status to 'failed' via generated column
|
||||
err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{
|
||||
ID: job.ID,
|
||||
UpdatedAt: now,
|
||||
CompletedAt: sql.NullTime{Valid: true, Time: now},
|
||||
Error: sql.NullString{Valid: true, String: fmt.Sprintf("config error: missing required param (build %d)", buildNum)},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Create workspace build linking to failed job
|
||||
workspaceBuildID := uuid.New()
|
||||
err = db.InsertWorkspaceBuild(ctx, database.InsertWorkspaceBuildParams{
|
||||
ID: workspaceBuildID,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
WorkspaceID: workspace.ID,
|
||||
TemplateVersionID: templateVersionID,
|
||||
BuildNumber: int32(buildNum),
|
||||
ProvisionerState: []byte("[]"),
|
||||
InitiatorID: database.PrebuildsSystemUserID,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
JobID: job.ID,
|
||||
Reason: database.BuildReasonInitiator,
|
||||
TemplateVersionPresetID: uuid.NullUUID{
|
||||
UUID: preset.ID,
|
||||
Valid: true,
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify the job has failed status
|
||||
verifyJob, err := db.GetProvisionerJobByID(ctx, job.ID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.ProvisionerJobStatusFailed, verifyJob.JobStatus, "job_status should be failed")
|
||||
}
|
||||
|
||||
// Test 1: Create one failed build, should NOT hit hard limit
|
||||
createFailedPrebuild(1)
|
||||
|
||||
presetsAtLimit, err := db.GetPresetsAtFailureLimit(ctx, 3)
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, presetsAtLimit, "preset should not hit hard limit after 1 failure (limit is 3)")
|
||||
|
||||
// Test 2: Create second failed build, still should NOT hit limit
|
||||
createFailedPrebuild(2)
|
||||
|
||||
presetsAtLimit, err = db.GetPresetsAtFailureLimit(ctx, 3)
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, presetsAtLimit, "preset should not hit hard limit after 2 failures (limit is 3)")
|
||||
|
||||
// Test 3: Create third failed build, should NOW hit hard limit
|
||||
createFailedPrebuild(3)
|
||||
|
||||
presetsAtLimit, err = db.GetPresetsAtFailureLimit(ctx, 3)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, presetsAtLimit, 1, "preset should hit hard limit after 3 consecutive failures")
|
||||
require.Equal(t, preset.ID, presetsAtLimit[0].PresetID, "correct preset should be at failure limit")
|
||||
|
||||
// Test 4: Verify lower limit also catches it
|
||||
presetsAtLimit, err = db.GetPresetsAtFailureLimit(ctx, 2)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, presetsAtLimit, 1, "preset should also hit limit=2 with 3 failures")
|
||||
|
||||
// This test validates that our database schema correctly tracks failed
|
||||
// builds and the GetPresetsAtFailureLimit query accurately identifies
|
||||
// presets that have hit the failure threshold.
|
||||
}
|
||||
|
||||
func TestConfigErrorCreatesFailedBuildRecord(t *testing.T) {
|
||||
// This test verifies that when createPrebuiltWorkspace encounters a config error
|
||||
// (HTTP 400-level error from wsbuilder.Build), it creates a failed build record
|
||||
// in the database so the error counts toward the hard failure limit.
|
||||
t.Parallel()
|
||||
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
ctx = dbauthz.AsPrebuildsOrchestrator(ctx)
|
||||
clock := quartz.NewMock(t)
|
||||
db, ps := dbtestutil.NewDB(t)
|
||||
cfg := codersdk.PrebuildsConfig{
|
||||
ReconciliationInterval: serpent.Duration(testutil.WaitLong),
|
||||
ReconciliationBackoffInterval: serpent.Duration(1 * time.Minute),
|
||||
}
|
||||
logger := slogtest.Make(t, nil)
|
||||
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
|
||||
reconciler := prebuilds.NewStoreReconciler(db, ps, cache, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer(), newNoopUsageCheckerPtr())
|
||||
|
||||
// Setup template with a preset that has required mutable parameters.
|
||||
// This will cause wsbuilder.Build to fail with a BadRequest error when
|
||||
// the preset doesn't provide values for required mutable parameters.
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
user := dbgen.User(t, db, database.User{})
|
||||
template := dbgen.Template(t, db, database.Template{
|
||||
CreatedBy: user.ID,
|
||||
OrganizationID: org.ID,
|
||||
})
|
||||
|
||||
// Create a template version with a required mutable parameter
|
||||
templateVersionJob := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{
|
||||
CreatedAt: clock.Now().Add(muchEarlier),
|
||||
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
|
||||
OrganizationID: org.ID,
|
||||
InitiatorID: user.ID,
|
||||
})
|
||||
templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{UUID: template.ID, Valid: true},
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: user.ID,
|
||||
JobID: templateVersionJob.ID,
|
||||
CreatedAt: clock.Now().Add(muchEarlier),
|
||||
})
|
||||
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
|
||||
ID: template.ID,
|
||||
ActiveVersionID: templateVersion.ID,
|
||||
}))
|
||||
|
||||
// Add a required mutable parameter - this will cause validation to fail
|
||||
// when the preset doesn't provide a value
|
||||
dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{
|
||||
TemplateVersionID: templateVersion.ID,
|
||||
Name: "required_param",
|
||||
Type: "string",
|
||||
Required: true,
|
||||
Mutable: true,
|
||||
DefaultValue: "",
|
||||
})
|
||||
|
||||
// Create preset without providing the required parameter
|
||||
preset := setupTestDBPreset(t, db, templateVersion.ID, 1, "test-preset")
|
||||
|
||||
// Get initial workspace count
|
||||
workspacesBefore, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
initialWorkspaceCount := len(workspacesBefore)
|
||||
|
||||
// Run reconciliation - this should attempt to create a prebuild, fail with config error,
|
||||
// and create a failed build record
|
||||
_, err = reconciler.ReconcileAll(ctx)
|
||||
require.NoError(t, err, "reconciliation should complete even if prebuild creation fails")
|
||||
|
||||
// Verify a workspace was created (even though build failed)
|
||||
workspacesAfter, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, initialWorkspaceCount+1, len(workspacesAfter), "should have created one workspace")
|
||||
|
||||
// Find the new workspace
|
||||
var newWorkspaceID uuid.UUID
|
||||
for _, ws := range workspacesAfter {
|
||||
found := false
|
||||
for _, oldWs := range workspacesBefore {
|
||||
if ws.ID == oldWs.ID {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
newWorkspaceID = ws.ID
|
||||
break
|
||||
}
|
||||
}
|
||||
require.NotEqual(t, uuid.Nil, newWorkspaceID, "should have found new workspace")
|
||||
|
||||
// Verify a failed build record was created
|
||||
build, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, newWorkspaceID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.WorkspaceTransitionStart, build.Transition, "build should be start transition")
|
||||
require.Equal(t, preset.ID, build.TemplateVersionPresetID.UUID, "build should reference preset")
|
||||
|
||||
// Verify the provisioner job exists and is marked as failed
|
||||
job, err := db.GetProvisionerJobByID(ctx, build.JobID)
|
||||
require.NoError(t, err)
|
||||
require.True(t, job.CompletedAt.Valid, "job should be completed")
|
||||
require.True(t, job.Error.Valid, "job should have error set")
|
||||
require.NotEmpty(t, job.Error.String, "job error message should not be empty")
|
||||
require.Contains(t, job.Error.String, "required_param", "error should mention the missing parameter")
|
||||
|
||||
// Most importantly: verify job_status is 'failed' (this is what counts toward hard limit)
|
||||
// job_status is a generated column that becomes 'failed' when completed_at is set and error is non-empty
|
||||
require.Equal(t, database.ProvisionerJobStatusFailed, job.JobStatus, "job status should be failed")
|
||||
|
||||
// Verify this failure would be counted by GetPresetsAtFailureLimit query
|
||||
// The query looks at workspace_latest_builds view which includes prebuilds with failed job_status
|
||||
presetsAtLimit, err := db.GetPresetsAtFailureLimit(ctx, 1)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Check if our preset appears in the list (it should after 1 failure)
|
||||
foundPreset := false
|
||||
for _, p := range presetsAtLimit {
|
||||
if p.PresetID == preset.ID {
|
||||
foundPreset = true
|
||||
break
|
||||
}
|
||||
}
|
||||
require.True(t, foundPreset, "preset should appear in failure limit list after config error")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user