Compare commits

...

2 Commits

Author SHA1 Message Date
blink-so[bot] 31a77081f3 fix: address review comments on ssh retry logic
- Add clarifying comment for retry param in cli/ping.go
- Stop workspace before retrying to clean up partially provisioned
  resources (per #22043 and review feedback)
- Rebase on latest main
2026-03-02 02:15:26 +00:00
blink-so[bot] 6239cf0618 feat(cli): interactive retry prompt for failed workspace SSH
When SSH-ing into a workspace that is in a failed state, Coder now:

- Interactive mode (TTY): Shows build logs link and prompts to retry
  the build. On confirmation, reuses the full autostart flow.
- Non-interactive mode: Shows error with build logs link and a
  `coder start` command to manually retry.

GetWorkspaceAndAgent gains a `retry` parameter. Only `coder ssh`
passes retry=true; all other callers pass retry=false. The recursive
call passes retry=false to prevent infinite recursion.
2026-03-02 01:54:50 +00:00
10 changed files with 164 additions and 20 deletions
+1 -1
View File
@@ -97,7 +97,7 @@ func handleRPTY(inv *serpent.Invocation, client *codersdk.Client, args handleRPT
reconnectID = uuid.New()
}
ws, agt, _, err := GetWorkspaceAndAgent(ctx, inv, client, true, args.NamedWorkspace)
ws, agt, _, err := GetWorkspaceAndAgent(ctx, inv, client, true, false, args.NamedWorkspace)
if err != nil {
return err
}
+2 -2
View File
@@ -73,7 +73,7 @@ func (r *RootCmd) openVSCode() *serpent.Command {
// need to wait for the agent to start.
workspaceQuery := inv.Args[0]
autostart := true
workspace, workspaceAgent, otherWorkspaceAgents, err := GetWorkspaceAndAgent(ctx, inv, client, autostart, workspaceQuery)
workspace, workspaceAgent, otherWorkspaceAgents, err := GetWorkspaceAndAgent(ctx, inv, client, autostart, false, workspaceQuery)
if err != nil {
return xerrors.Errorf("get workspace and agent: %w", err)
}
@@ -324,7 +324,7 @@ func (r *RootCmd) openApp() *serpent.Command {
}
workspaceName := inv.Args[0]
ws, agt, _, err := GetWorkspaceAndAgent(ctx, inv, client, false, workspaceName)
ws, agt, _, err := GetWorkspaceAndAgent(ctx, inv, client, false, false, workspaceName)
if err != nil {
var sdkErr *codersdk.Error
if errors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusNotFound {
+1
View File
@@ -109,6 +109,7 @@ func (r *RootCmd) ping() *serpent.Command {
_, workspaceAgent, _, err := GetWorkspaceAndAgent(
ctx, inv, client,
false, // Do not autostart for a ping.
false, // Do not retry failed builds for a ping.
workspaceName,
)
if err != nil {
+1 -1
View File
@@ -84,7 +84,7 @@ func (r *RootCmd) portForward() *serpent.Command {
return xerrors.New("no port-forwards requested")
}
workspace, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, !disableAutostart, inv.Args[0])
workspace, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, !disableAutostart, false, inv.Args[0])
if err != nil {
return err
}
+1 -1
View File
@@ -83,7 +83,7 @@ func (r *RootCmd) speedtest() *serpent.Command {
return xerrors.Errorf("--direct (-d) is incompatible with --%s", varDisableDirect)
}
_, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, false, inv.Args[0])
_, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, false, false, inv.Args[0])
if err != nil {
return err
}
+59 -13
View File
@@ -802,7 +802,7 @@ func findWorkspaceAndAgentByHostname(
}
hostname = normalizeWorkspaceInput(hostname)
ws, agent, otherAgents, err := GetWorkspaceAndAgent(ctx, inv, client, !disableAutostart, hostname)
ws, agent, otherAgents, err := GetWorkspaceAndAgent(ctx, inv, client, !disableAutostart, true, hostname)
if err != nil && strings.Contains(err.Error(), "multiple agents found") {
var errorMsg strings.Builder
_, _ = errorMsg.WriteString(fmt.Sprintf("%s\nTry running:\n", err.Error()))
@@ -894,7 +894,9 @@ startWatchLoop:
// `<workspace>[.<agent>]` syntax via `in`. It will also return any other agents
// in the workspace as a slice for use in child->parent lookups.
// If autostart is true, the workspace will be started if it is not already running.
func GetWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *codersdk.Client, autostart bool, input string) (codersdk.Workspace, codersdk.WorkspaceAgent, []codersdk.WorkspaceAgent, error) { //nolint:revive
// If retry is true and the workspace is in a failed state, the user will be
// prompted to retry the build (when running interactively).
func GetWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *codersdk.Client, autostart bool, retry bool, input string) (codersdk.Workspace, codersdk.WorkspaceAgent, []codersdk.WorkspaceAgent, error) { //nolint:revive
var (
workspace codersdk.Workspace
// The input will be `owner/name.agent`
@@ -918,17 +920,15 @@ func GetWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *
// Any sort of deleting status, we should reject with a nicer error.
return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, nil, xerrors.Errorf("workspace %q is deleted", workspace.Name)
}
if workspace.LatestBuild.Job.Status == codersdk.ProvisionerJobFailed {
// The workspace needs to be stopped (or failed) before we can start it.
// It cannot be in any other pending state.
if workspace.LatestBuild.Status != codersdk.WorkspaceStatusStopped &&
workspace.LatestBuild.Status != codersdk.WorkspaceStatusFailed {
return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, nil,
xerrors.Errorf("workspace %q is in failed state, unable to autostart the workspace", workspace.Name)
}
// The workspace needs to be stopped before we can start it.
// It cannot be in any pending or failed state.
if workspace.LatestBuild.Status != codersdk.WorkspaceStatusStopped {
return codersdk.Workspace{}, codersdk.WorkspaceAgent{}, nil,
xerrors.Errorf("workspace must be started; was unable to autostart as the last build job is %q, expected %q",
xerrors.Errorf("workspace must be started; was unable to autostart as the last build job is %q, expected %q or %q",
workspace.LatestBuild.Status,
codersdk.WorkspaceStatusStopped,
codersdk.WorkspaceStatusFailed,
)
}
@@ -943,7 +943,8 @@ func GetWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *
switch cerr.StatusCode() {
case http.StatusConflict:
_, _ = fmt.Fprintln(inv.Stderr, "Unable to start the workspace due to conflict, the workspace may be starting, retrying without autostart...")
return GetWorkspaceAndAgent(ctx, inv, client, false, input)
// Pass retry=false to prevent infinite recursion.
return GetWorkspaceAndAgent(ctx, inv, client, false, false, input)
case http.StatusForbidden:
_, err = startWorkspace(inv, client, workspace, workspaceParameterFlags{}, buildFlags{}, WorkspaceUpdate)
@@ -984,11 +985,51 @@ func GetWorkspaceAndAgent(ctx context.Context, inv *serpent.Invocation, client *
agentName = workspaceParts[1]
}
workspaceAgent, otherWorkspaceAgents, err := getWorkspaceAgent(workspace, agentName)
if err != nil {
if err == nil {
return workspace, workspaceAgent, otherWorkspaceAgents, nil
}
// If the latest build did not fail, return the original error.
if workspace.LatestBuild.Job.Status != codersdk.ProvisionerJobFailed {
return workspace, codersdk.WorkspaceAgent{}, otherWorkspaceAgents, err
}
return workspace, workspaceAgent, otherWorkspaceAgents, nil
// The latest build failed. Show a helpful message and offer to retry.
buildLink := buildWorkspaceBuildLink(client.URL, workspace)
_, _ = fmt.Fprintf(inv.Stderr, "Workspace %q failed its most recent build.\n Build logs: %s\n", workspace.Name, buildLink.String())
if !retry || !isTTYIn(inv) {
return workspace, codersdk.WorkspaceAgent{}, otherWorkspaceAgents,
xerrors.Errorf("workspace %q is in failed state, the last build failed\n See: %s\n Run `coder start %s/%s` to retry starting the workspace",
workspace.Name, buildLink.String(), workspace.OwnerName, workspace.Name)
}
_, promptErr := cliui.Prompt(inv, cliui.PromptOptions{
Text: "Retry the build?",
IsConfirm: true,
Default: cliui.ConfirmYes,
})
if promptErr != nil {
return workspace, codersdk.WorkspaceAgent{}, otherWorkspaceAgents, xerrors.Errorf("workspace %q is in failed state", workspace.Name)
}
// Stop the workspace first to clean up any partially provisioned
// resources before retrying (see #22043).
_, _ = fmt.Fprintf(inv.Stderr, "Stopping workspace %q before retrying...\n", workspace.Name)
stopBuild, err := stopWorkspace(inv, client, workspace, buildFlags{})
if err != nil {
return workspace, codersdk.WorkspaceAgent{}, otherWorkspaceAgents, xerrors.Errorf("stop workspace before retry: %w", err)
}
// Wait for the stop build to complete.
err = cliui.WorkspaceBuild(ctx, inv.Stderr, client, stopBuild.ID)
if err != nil {
return workspace, codersdk.WorkspaceAgent{}, otherWorkspaceAgents, xerrors.Errorf("wait for workspace stop: %w", err)
}
// Now start the workspace. Reuse the full autostart flow which handles
// conflicts, forbidden errors, template version updates, etc.
// Pass retry=false to prevent infinite recursion.
return GetWorkspaceAndAgent(ctx, inv, client, true, false, input)
}
func getWorkspaceAgent(workspace codersdk.Workspace, agentName string) (workspaceAgent codersdk.WorkspaceAgent, otherAgents []codersdk.WorkspaceAgent, err error) {
@@ -1092,6 +1133,11 @@ func buildWorkspaceLink(serverURL *url.URL, workspace codersdk.Workspace) *url.U
return serverURL.ResolveReference(&url.URL{Path: fmt.Sprintf("@%s/%s", workspace.OwnerName, workspace.Name)})
}
// buildWorkspaceBuildLink resolves, against serverURL, the relative path
// "@<owner>/<workspace>/builds/<build number>" for the workspace's latest
// build and returns the resulting URL.
func buildWorkspaceBuildLink(serverURL *url.URL, workspace codersdk.Workspace) *url.URL {
	buildPath := fmt.Sprintf(
		"@%s/%s/builds/%d",
		workspace.OwnerName,
		workspace.Name,
		workspace.LatestBuild.BuildNumber,
	)
	relative := &url.URL{Path: buildPath}
	return serverURL.ResolveReference(relative)
}
// runLocal runs a command on the local machine.
func runLocal(ctx context.Context, stdin io.Reader, name string, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, name, args...)
+72
View File
@@ -145,6 +145,78 @@ func TestSSH(t *testing.T) {
})
}
})
// FailedWorkspaceShowsBuildLink runs `coder ssh <workspace>` with stderr
// redirected to an in-memory buffer (no pty, no --force-tty) and asserts that
// the command fails with an error mentioning the failed state and a
// `coder start` hint, and that stderr carries the build-logs link.
t.Run("FailedWorkspaceShowsBuildLink", func(t *testing.T) {
t.Parallel()
client, store := coderdtest.NewWithDatabase(t, nil)
client.SetLogger(testutil.Logger(t).Named("client"))
first := coderdtest.CreateFirstUser(t, client)
// Create a second user with a fixed username; that user owns the workspace
// and its client is used to run the CLI below.
userClient, user := coderdtest.CreateAnotherUserMutators(t, client, first.OrganizationID, nil, func(r *codersdk.CreateUserRequestWithOrgs) {
r.Username = "myuser"
})
// Seed the workspace directly in the database.
// NOTE(review): presumably .Failed() marks the latest build's job as failed —
// confirm against the dbfake builder.
r := dbfake.WorkspaceBuild(t, store, database.WorkspaceTable{
Name: "myworkspace",
OrganizationID: first.OrganizationID,
OwnerID: user.ID,
}).WithAgent().Failed().Do()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
inv, root := clitest.New(t, "ssh", r.Workspace.Name)
clitest.SetupConfig(t, userClient, root)
// Capture stderr so the build-link message can be inspected once Run returns.
var stderr bytes.Buffer
inv.Stderr = &stderr
// Run synchronously; the command is expected to return an error rather than
// block on a prompt.
err := inv.WithContext(ctx).Run()
require.Error(t, err)
// Non-interactive mode: the error should contain a helpful message,
// including the manual retry command.
require.Contains(t, err.Error(), "is in failed state")
require.Contains(t, err.Error(), "coder start")
// Stderr should show the build logs link.
require.Contains(t, stderr.String(), "failed its most recent build")
require.Contains(t, stderr.String(), "/builds/")
})
// FailedWorkspaceInteractiveRetryPrompt runs `coder ssh --force-tty <workspace>`
// with a pty wired to stdin/stdout/stderr, waits for the failure message and
// the "Retry the build?" prompt, answers "no", and asserts that the command
// exits with an error mentioning the failed state.
t.Run("FailedWorkspaceInteractiveRetryPrompt", func(t *testing.T) {
t.Parallel()
client, store := coderdtest.NewWithDatabase(t, nil)
client.SetLogger(testutil.Logger(t).Named("client"))
first := coderdtest.CreateFirstUser(t, client)
// Create a second user with a fixed username; that user owns the workspace
// and its client is used to run the CLI below.
userClient, user := coderdtest.CreateAnotherUserMutators(t, client, first.OrganizationID, nil, func(r *codersdk.CreateUserRequestWithOrgs) {
r.Username = "myuser"
})
// Seed the workspace directly in the database.
// NOTE(review): presumably .Failed() marks the latest build's job as failed —
// confirm against the dbfake builder.
r := dbfake.WorkspaceBuild(t, store, database.WorkspaceTable{
Name: "myworkspace",
OrganizationID: first.OrganizationID,
OwnerID: user.ID,
}).WithAgent().Failed().Do()
// NOTE(review): --force-tty presumably makes the CLI treat the session as
// interactive so the retry prompt is shown — confirm against the ssh
// command's flag definition.
inv, root := clitest.New(t, "ssh", "--force-tty", r.Workspace.Name)
clitest.SetupConfig(t, userClient, root)
// Route all three standard streams through the pty so the test can both read
// the prompt and type the answer.
pty := ptytest.New(t)
inv.Stdin = pty.Input()
inv.Stderr = pty.Output()
inv.Stdout = pty.Output()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
// Run the command in the background so the test can drive the prompt. The
// channel has capacity 1, so the single send below never blocks the goroutine.
cmdDone := make(chan error, 1)
go func() {
cmdDone <- inv.WithContext(ctx).Run()
}()
// Should see the failure message and prompt.
pty.ExpectMatchContext(ctx, "failed its most recent build")
pty.ExpectMatchContext(ctx, "Retry the build?")
// Decline the retry.
pty.WriteLine("no")
// Wait for the command's result and check it failed with the failed-state
// error.
err := testutil.TryReceive(ctx, t, cmdDone)
require.Error(t, err)
require.Contains(t, err.Error(), "is in failed state")
})
t.Run("StartStoppedWorkspace", func(t *testing.T) {
t.Parallel()
+1 -1
View File
@@ -102,7 +102,7 @@ func (r *RootCmd) vscodeSSH() *serpent.Command {
// will call this command after the workspace is started.
autostart := false
workspace, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, autostart, fmt.Sprintf("%s/%s", owner, name))
workspace, workspaceAgent, _, err := GetWorkspaceAndAgent(ctx, inv, client, autostart, false, fmt.Sprintf("%s/%s", owner, name))
if err != nil {
return xerrors.Errorf("find workspace and agent: %w", err)
}
+25
View File
@@ -111,6 +111,31 @@ your template's Terraform file and the target resources on your infrastructure.
Unhealthy workspaces are usually caused by a misconfiguration in the agent or
workspace startup scripts.
### Connecting to a failed workspace via SSH
When you attempt to SSH into a workspace that is in a failed state, Coder will
show you a link to the build logs and offer to retry the build:
```console
$ coder ssh myworkspace
Workspace "myworkspace" failed its most recent build.
Build logs: https://coder.example.com/@user/myworkspace/builds/1
Retry the build? (yes/no) yes
```
If you confirm, Coder first stops the workspace to clean up any partially provisioned resources, then starts a new build and connects you once it succeeds.
In non-interactive mode (e.g. piped commands or scripts), the error message
includes the build log link and a `coder start` command you can use to manually
retry:
```console
$ coder ssh myworkspace 2>&1
error: workspace "myworkspace" is in failed state, the last build failed
See: https://coder.example.com/@user/myworkspace/builds/1
Run `coder start user/myworkspace` to retry starting the workspace
```
## Workspace build times
After a successful build, you can see a timing breakdown of the workspace
+1 -1
View File
@@ -117,7 +117,7 @@ func (r *RootCmd) externalWorkspaceAgentInstructions() *serpent.Command {
return err
}
workspace, workspaceAgent, _, err := agpl.GetWorkspaceAndAgent(inv.Context(), inv, client, false, inv.Args[0])
workspace, workspaceAgent, _, err := agpl.GetWorkspaceAndAgent(inv.Context(), inv, client, false, false, inv.Args[0])
if err != nil {
return xerrors.Errorf("find workspace and agent: %w", err)
}