chore: make corrupted directories non-fatal (#21707)
From https://github.com/coder/coder/pull/20563#discussion_r2513135196 Closes https://github.com/coder/coder/issues/20751
This commit is contained in:
@@ -236,7 +236,22 @@ func (l Layout) CleanStaleSessions(ctx context.Context, logger slog.Logger, fs a
|
||||
logger.Info(ctx, "remove stale session directory", slog.F("session_path", sessionDirPath))
|
||||
err = fs.RemoveAll(sessionDirPath)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("can't remove %q directory: %w", sessionDirPath, err)
|
||||
// This should not be a fatal error. If it is, the provisioner would be rendered
|
||||
// non-functional until this directory is cleaned up. Ideally there would be a
|
||||
// way to escalate this to an operator alert in Coder. Until then, the best we
|
||||
// can do is log it on every cleanup attempt (every build). Eventually the disk
|
||||
// usage will be noticeable, and hopefully these logs are noticed.
|
||||
logger.Error(ctx, "failed to remove stale session directory",
|
||||
slog.F("directory", sessionDirPath),
|
||||
slog.Error(err),
|
||||
)
|
||||
|
||||
if l.WorkDirectory() == sessionDirPath {
|
||||
// This should never happen because sessions are uuid's. But if that logic ever
|
||||
// changes, this would be a bad state to be in. The directory that the
|
||||
// provisioner is going to use cannot be stale.
|
||||
return xerrors.Errorf("remove %q directory, will not work inside a stale directory: %w", sessionDirPath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
package tfpath_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/afero"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"cdr.dev/slog/v3/sloggers/slogtest"
|
||||
"github.com/coder/coder/v2/provisionersdk/tfpath"
|
||||
"github.com/coder/coder/v2/testutil"
|
||||
)
|
||||
|
||||
func TestCleanStaleSessions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("NonFatalRemoveFailure", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
const parentDir = "parent"
|
||||
// Verify RemoveAll failure is not fatal
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
|
||||
called := false
|
||||
mem := afero.NewMemMapFs()
|
||||
staleSession := tfpath.Session(parentDir, "stale")
|
||||
err := mem.MkdirAll(staleSession.WorkDirectory(), 0o777)
|
||||
require.NoError(t, err)
|
||||
|
||||
failingFs := &removeFailure{
|
||||
Fs: mem,
|
||||
removeAll: func(path string) error {
|
||||
called = true
|
||||
return xerrors.New("constant failure")
|
||||
},
|
||||
}
|
||||
|
||||
future := time.Now().Add(time.Hour * 24 * 120)
|
||||
l := tfpath.Session(parentDir, "sess1")
|
||||
err = l.CleanStaleSessions(ctx, slogtest.Make(t, &slogtest.Options{
|
||||
IgnoreErrors: true,
|
||||
}), failingFs, future)
|
||||
require.NoError(t, err)
|
||||
require.True(t, called)
|
||||
})
|
||||
|
||||
t.Run("FatalRemoveFailure", func(t *testing.T) {
|
||||
// If the stale directory is the same one we plan to use, that is
|
||||
// an issue.
|
||||
t.Parallel()
|
||||
const parentDir = "parent"
|
||||
// Verify RemoveAll failure is not fatal
|
||||
ctx := testutil.Context(t, testutil.WaitShort)
|
||||
|
||||
called := false
|
||||
mem := afero.NewMemMapFs()
|
||||
staleSession := tfpath.Session(parentDir, "stale")
|
||||
err := mem.MkdirAll(staleSession.WorkDirectory(), 0o777)
|
||||
require.NoError(t, err)
|
||||
|
||||
failingFs := &removeFailure{
|
||||
Fs: mem,
|
||||
removeAll: func(path string) error {
|
||||
called = true
|
||||
return xerrors.New("constant failure")
|
||||
},
|
||||
}
|
||||
|
||||
future := time.Now().Add(time.Hour * 24 * 120)
|
||||
err = staleSession.CleanStaleSessions(ctx, slogtest.Make(t, &slogtest.Options{
|
||||
IgnoreErrors: true,
|
||||
}), failingFs, future)
|
||||
require.ErrorContains(t, err, "constant failure")
|
||||
require.True(t, called)
|
||||
})
|
||||
}
|
||||
|
||||
type removeFailure struct {
|
||||
afero.Fs
|
||||
removeAll func(path string) error
|
||||
}
|
||||
|
||||
func (rf *removeFailure) RemoveAll(path string) error {
|
||||
if rf.removeAll != nil {
|
||||
return rf.removeAll(path)
|
||||
}
|
||||
return rf.Fs.RemoveAll(path)
|
||||
}
|
||||
Reference in New Issue
Block a user