chore: make corrupted directories non-fatal (#21707)

From https://github.com/coder/coder/pull/20563#discussion_r2513135196
Closes https://github.com/coder/coder/issues/20751
This commit is contained in:
Steven Masley
2026-01-28 11:35:17 -06:00
committed by GitHub
parent 264ae77458
commit 921fad098b
2 changed files with 105 additions and 1 deletions
+16 -1
View File
@@ -236,7 +236,22 @@ func (l Layout) CleanStaleSessions(ctx context.Context, logger slog.Logger, fs a
logger.Info(ctx, "remove stale session directory", slog.F("session_path", sessionDirPath))
err = fs.RemoveAll(sessionDirPath)
if err != nil {
return xerrors.Errorf("can't remove %q directory: %w", sessionDirPath, err)
// This should not be a fatal error. If it is, the provisioner would be rendered
// non-functional until this directory is cleaned up. Ideally there would be a
// way to escalate this to an operator alert in Coder. Until then, the best we
// can do is log it on every cleanup attempt (every build). Eventually the disk
// usage will be noticeable, and hopefully these logs are noticed.
logger.Error(ctx, "failed to remove stale session directory",
slog.F("directory", sessionDirPath),
slog.Error(err),
)
if l.WorkDirectory() == sessionDirPath {
// This should never happen because sessions are uuid's. But if that logic ever
// changes, this would be a bad state to be in. The directory that the
// provisioner is going to use cannot be stale.
return xerrors.Errorf("remove %q directory, will not work inside a stale directory: %w", sessionDirPath, err)
}
}
}
}
+89
View File
@@ -0,0 +1,89 @@
package tfpath_test
import (
"testing"
"time"
"github.com/spf13/afero"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"cdr.dev/slog/v3/sloggers/slogtest"
"github.com/coder/coder/v2/provisionersdk/tfpath"
"github.com/coder/coder/v2/testutil"
)
func TestCleanStaleSessions(t *testing.T) {
t.Parallel()
t.Run("NonFatalRemoveFailure", func(t *testing.T) {
t.Parallel()
const parentDir = "parent"
// Verify RemoveAll failure is not fatal
ctx := testutil.Context(t, testutil.WaitShort)
called := false
mem := afero.NewMemMapFs()
staleSession := tfpath.Session(parentDir, "stale")
err := mem.MkdirAll(staleSession.WorkDirectory(), 0o777)
require.NoError(t, err)
failingFs := &removeFailure{
Fs: mem,
removeAll: func(path string) error {
called = true
return xerrors.New("constant failure")
},
}
future := time.Now().Add(time.Hour * 24 * 120)
l := tfpath.Session(parentDir, "sess1")
err = l.CleanStaleSessions(ctx, slogtest.Make(t, &slogtest.Options{
IgnoreErrors: true,
}), failingFs, future)
require.NoError(t, err)
require.True(t, called)
})
t.Run("FatalRemoveFailure", func(t *testing.T) {
// If the stale directory is the same one we plan to use, that is
// an issue.
t.Parallel()
const parentDir = "parent"
// Verify RemoveAll failure is not fatal
ctx := testutil.Context(t, testutil.WaitShort)
called := false
mem := afero.NewMemMapFs()
staleSession := tfpath.Session(parentDir, "stale")
err := mem.MkdirAll(staleSession.WorkDirectory(), 0o777)
require.NoError(t, err)
failingFs := &removeFailure{
Fs: mem,
removeAll: func(path string) error {
called = true
return xerrors.New("constant failure")
},
}
future := time.Now().Add(time.Hour * 24 * 120)
err = staleSession.CleanStaleSessions(ctx, slogtest.Make(t, &slogtest.Options{
IgnoreErrors: true,
}), failingFs, future)
require.ErrorContains(t, err, "constant failure")
require.True(t, called)
})
}
type removeFailure struct {
afero.Fs
removeAll func(path string) error
}
func (rf *removeFailure) RemoveAll(path string) error {
if rf.removeAll != nil {
return rf.removeAll(path)
}
return rf.Fs.RemoveAll(path)
}