chore: retry TestAgent_Dial subtests (#19387)

Closes https://github.com/coder/internal/issues/595
2025-08-18 23:51:19 +10:00
parent a8c89a120f
commit e2ba9e7d62
4 changed files with 307 additions and 58 deletions
@@ -6,7 +6,7 @@ import (
 	"time"
 )

-func Context(t *testing.T, dur time.Duration) context.Context {
+func Context(t testing.TB, dur time.Duration) context.Context {
 	ctx, cancel := context.WithTimeout(context.Background(), dur)
 	t.Cleanup(cancel)
 	return ctx
@@ -0,0 +1,238 @@
+package testutil
+
+import (
+	"context"
+	"fmt"
+	"runtime"
+	"slices"
+	"sync"
+	"testing"
+	"time"
+)
+
+// RunRetry runs a test function up to `count` times, retrying if it fails. If
+// all attempts fail or the context is canceled, the test will fail. It is safe
+// to use the parent context in the test function, but do note that the context
+// deadline will apply to all attempts.
+//
+// DO NOT USE THIS FUNCTION IN TESTS UNLESS YOU HAVE A GOOD REASON. It should
+// only be used in tests that can flake under high load. It is not a replacement
+// for writing a good test.
+//
+// Note that the `testing.TB` supplied to the function is a fake implementation
+// for all runs. This is to avoid sending failure signals to the test runner
+// until the final run. Unrecovered panics will still always be bubbled up to
+// the test runner.
+//
+// Some functions are not implemented and will panic when using the fake
+// implementation:
+// - Chdir
+// - Setenv
+// - Skip, SkipNow, Skipf, Skipped
+// - TempDir
+//
+// Cleanup functions will be executed after each attempt.
+//
+// A failed attempt is followed by a 2 second pause before the next attempt
+// starts. There is no pause after the final attempt. If `count` is less than 1
+// no attempt is made and the test fails immediately.
+func RunRetry(t *testing.T, count int, fn func(t testing.TB)) {
+	t.Helper()
+
+	for i := 1; i <= count; i++ {
+		// Canceled in the attempt goroutine before running cleanup functions.
+		attemptCtx, attemptCancel := context.WithCancel(t.Context())
+		attemptT := &fakeT{
+			T:    t,
+			ctx:  attemptCtx,
+			name: fmt.Sprintf("%s (attempt %d/%d)", t.Name(), i, count),
+		}
+
+		// Run the test in a goroutine so we can capture runtime.Goexit()
+		// and run cleanup functions. The channel is only ever closed, so it
+		// does not need a buffer.
+		done := make(chan struct{})
+		go func() {
+			defer close(done)
+			defer func() {
+				// As per t.Context(), the context is canceled right before
+				// cleanup functions are executed.
+				attemptCancel()
+				attemptT.runCleanupFns()
+			}()
+
+			t.Logf("testutil.RunRetry: running test: attempt %d/%d", i, count)
+			fn(attemptT)
+		}()
+
+		// We don't wait on the context here, because we want to be sure that
+		// the test function and cleanup functions have finished before
+		// returning from the test.
+		<-done
+		if !attemptT.Failed() {
+			t.Logf("testutil.RunRetry: test passed on attempt %d/%d", i, count)
+			return
+		}
+		t.Logf("testutil.RunRetry: test failed on attempt %d/%d", i, count)
+
+		// Nothing will run after the final attempt, so waiting would only
+		// delay reporting the failure.
+		if i == count {
+			break
+		}
+
+		// Wait a few seconds in case the test failure was due to system load.
+		// There's not really a good way to check for this, so we just do it
+		// every time.
+		// No point waiting on t.Context() here because it doesn't factor in
+		// the test deadline, and only gets canceled when the test function
+		// completes.
+		time.Sleep(2 * time.Second)
+	}
+	t.Fatalf("testutil.RunRetry: all %d attempts failed", count)
+}
+
+// fakeT is a fake implementation of testing.TB that never fails and only logs
+// errors. Fatal errors will cause the goroutine to exit without failing the
+// test.
+//
+// The behavior of the fake implementation should be as close as possible to
+// the real implementation from the test function's perspective (minus
+// intentionally unimplemented methods).
+type fakeT struct {
+	// T is the real test. Methods that fakeT does not override are promoted
+	// from it, and the overrides log through it.
+	*testing.T
+	// ctx is the value returned by Context.
+	ctx context.Context
+	// name is the value returned by Name.
+	name string
+
+	// mu guards the fields below.
+	mu sync.Mutex
+	// failed records whether Fail or FailNow was called on this fake.
+	failed bool
+	// cleanupFns holds the functions registered through Cleanup, in
+	// registration order.
+	cleanupFns []func()
+}
+
+// Compile-time check that *fakeT satisfies testing.TB.
+var _ testing.TB = (*fakeT)(nil)
+
+// runCleanupFns runs every function registered through Cleanup, in last
+// added, first called order to match the behavior of *testing.T.
+//
+// Functions are popped one at a time rather than iterated from a snapshot, so
+// a cleanup function that registers further cleanup functions will have them
+// executed as well. If a cleanup function exits early (for example by calling
+// FailNow, which calls runtime.Goexit, or by panicking), the remaining cleanup
+// functions are still executed before the goroutine unwinds further.
+func (t *fakeT) runCleanupFns() {
+	// Deferred calls run during both runtime.Goexit and panics, so use one to
+	// keep draining the list if a cleanup function did not return normally.
+	defer func() {
+		t.mu.Lock()
+		pending := len(t.cleanupFns) > 0
+		t.mu.Unlock()
+		if pending {
+			t.runCleanupFns()
+		}
+	}()
+
+	for {
+		t.mu.Lock()
+		if len(t.cleanupFns) == 0 {
+			t.mu.Unlock()
+			return
+		}
+		last := len(t.cleanupFns) - 1
+		fn := t.cleanupFns[last]
+		t.cleanupFns = slices.Delete(t.cleanupFns, last, last+1)
+		t.mu.Unlock()
+
+		// Call without holding the lock so the function may use t freely.
+		fn()
+	}
+}
+
+// Chdir implements testing.TB.
+func (*fakeT) Chdir(_ string) {
+	panic("t.Chdir is not implemented in testutil.RunRetry closures")
+}
+
+// Cleanup implements testing.TB. The supplied function is appended to the
+// fake's own list of cleanup functions instead of being registered on the
+// real *testing.T. Cleanup functions will be called in last added, first
+// called order.
+func (t *fakeT) Cleanup(fn func()) {
+	t.mu.Lock()
+	t.cleanupFns = append(t.cleanupFns, fn)
+	t.mu.Unlock()
+}
+
+// Context implements testing.TB. Context returns the context stored on the
+// fake, which RunRetry cancels just before Cleanup-registered functions are
+// called. It overrides the Context method of the embedded *testing.T.
+func (t *fakeT) Context() context.Context {
+	return t.ctx
+}
+
+// Error implements testing.TB. Error is equivalent to Log followed by Fail.
+// The message is logged through the real *testing.T, while the failure is
+// recorded only on the fake.
+func (t *fakeT) Error(args ...any) {
+	t.T.Helper()
+	t.T.Log(args...)
+	t.Fail()
+}
+
+// Errorf implements testing.TB. Errorf is equivalent to Logf followed by Fail.
+// The message is logged through the real *testing.T, while the failure is
+// recorded only on the fake.
+func (t *fakeT) Errorf(format string, args ...any) {
+	t.T.Helper()
+	t.T.Logf(format, args...)
+	t.Fail()
+}
+
+// Fail implements testing.TB. Fail marks the function as having failed but
+// continues execution. Only the fake's failed flag is set; the real
+// *testing.T is used solely to log that Fail was called.
+func (t *fakeT) Fail() {
+	t.T.Helper()
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.failed = true
+	t.T.Log("testutil.RunRetry: t.Fail called in testutil.RunRetry closure")
+}
+
+// FailNow implements testing.TB. FailNow marks the function as having failed
+// and stops its execution by calling runtime.Goexit (which then runs all the
+// deferred calls in the current goroutine). Only the fake's failed flag is
+// set; the real *testing.T is used solely to log that FailNow was called.
+func (t *fakeT) FailNow() {
+	t.T.Helper()
+	t.mu.Lock()
+	// The deferred unlock is one of the deferred calls run by Goexit below, so
+	// the mutex is released even though this function never returns normally.
+	defer t.mu.Unlock()
+	t.failed = true
+	t.T.Log("testutil.RunRetry: t.FailNow called in testutil.RunRetry closure")
+	runtime.Goexit()
+}
+
+// Failed implements testing.TB. Failed reports whether the function has failed,
+// i.e. whether Fail or FailNow has been called on the fake. The flag is read
+// while holding the mutex.
+func (t *fakeT) Failed() bool {
+	t.T.Helper()
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.failed
+}
+
+// Fatal implements testing.TB. Fatal is equivalent to Log followed by FailNow.
+// The message is logged through the real *testing.T; the fake's FailNow then
+// records the failure and exits the calling goroutine.
+func (t *fakeT) Fatal(args ...any) {
+	t.T.Helper()
+	t.T.Log(args...)
+	t.FailNow()
+}
+
+// Fatalf implements testing.TB. Fatalf is equivalent to Logf followed by
+// FailNow. The message is logged through the real *testing.T; the fake's
+// FailNow then records the failure and exits the calling goroutine.
+func (t *fakeT) Fatalf(format string, args ...any) {
+	t.T.Helper()
+	t.T.Logf(format, args...)
+	t.FailNow()
+}
+
+// Helper is proxied to the original *testing.T. This is to avoid the fake
+// method appearing in the call stack.
+
+// Log is proxied to the original *testing.T.
+
+// Logf is proxied to the original *testing.T.
+
+// Name implements testing.TB. Name returns the name stored on the fake, which
+// RunRetry sets to the real test's name followed by the attempt number.
+func (t *fakeT) Name() string {
+	return t.name
+}
+
+// Setenv implements testing.TB.
+func (*fakeT) Setenv(_ string, _ string) {
+	panic("t.Setenv is not implemented in testutil.RunRetry closures")
+}
+
+// Skip implements testing.TB.
+func (*fakeT) Skip(_ ...any) {
+	panic("t.Skip is not implemented in testutil.RunRetry closures")
+}
+
+// SkipNow implements testing.TB.
+func (*fakeT) SkipNow() {
+	panic("t.SkipNow is not implemented in testutil.RunRetry closures")
+}
+
+// Skipf implements testing.TB.
+func (*fakeT) Skipf(_ string, _ ...any) {
+	panic("t.Skipf is not implemented in testutil.RunRetry closures")
+}
+
+// Skipped implements testing.TB.
+func (*fakeT) Skipped() bool {
+	panic("t.Skipped is not implemented in testutil.RunRetry closures")
+}
+
+// TempDir implements testing.TB.
+func (*fakeT) TempDir() string {
+	panic("t.TempDir is not implemented in testutil.RunRetry closures")
+}
+
+// private is proxied to the original *testing.T. It cannot be implemented by
+// our fake implementation since it's a private method.