fix(cli): retry dial timeouts in SSH connection setup (#24199)

Reorder error checks in isRetryableError so IsConnectionError is evaluated before context.DeadlineExceeded. Dial timeouts (*net.OpError wrapping DeadlineExceeded) were incorrectly treated as non-retryable, causing Coder Connect to fail immediately on broken tunnels with valid DNS despite existing retry logic.

Fixes #24201
This commit is contained in:
Ehab Younes
2026-04-10 00:55:16 +03:00
committed by GitHub
parent 95cff8c5fb
commit 1d0653cdab
2 changed files with 23 additions and 4 deletions
+6 -4
View File
@@ -69,15 +69,17 @@ var (
// isRetryableError checks for transient connection errors worth
// retrying: DNS failures, connection refused, dial timeouts, and
// server 5xx.
func isRetryableError(err error) bool {
if err == nil {
return false
}
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
if err == nil || xerrors.Is(err, context.Canceled) {
return false
}
// Check connection errors before context.DeadlineExceeded because a
// dial timeout (net.Dialer.Timeout) produces a *net.OpError that
// satisfies both checks.
if codersdk.IsConnectionError(err) {
return true
}
if xerrors.Is(err, context.DeadlineExceeded) {
return false
}
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) {
return sdkErr.StatusCode() >= 500
+17
View File
@@ -516,6 +516,23 @@ func TestIsRetryableError(t *testing.T) {
assert.Equal(t, tt.retryable, isRetryableError(tt.err))
})
}
// A dial that hits its deadline (net.Dialer.Timeout or, as here, an
// already-expired context) produces a *net.OpError that matches both
// IsConnectionError and context.DeadlineExceeded. Verify it is retryable.
t.Run("DialTimeout", func(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithDeadline(context.Background(), time.Now())
defer cancel()
<-ctx.Done() // ensure deadline has fired
_, err := (&net.Dialer{}).DialContext(ctx, "tcp", "127.0.0.1:1")
require.Error(t, err)
// Proves the ambiguity: this error matches BOTH checks.
require.ErrorIs(t, err, context.DeadlineExceeded)
require.ErrorAs(t, err, new(*net.OpError))
assert.True(t, isRetryableError(err))
// Also when wrapped, as runCoderConnectStdio does.
assert.True(t, isRetryableError(xerrors.Errorf("dial coder connect: %w", err)))
})
}
func TestRetryWithInterval(t *testing.T) {