fix(cli): bypass access URL redirect for inter-replica chat relay (#22635)
## Summary
Fixes cross-replica chat relay failing with:
```
failed to open initial relay for chat stream
error= dial relay stream: - failed to WebSocket dial: expected handshake response status code 101 but got 200
failed to open relay for message parts
error= dial relay stream: - failed to WebSocket dial: expected handshake response status code 101 but got 200
```
Subscribers see accurate `status=running` (delivered via pubsub) but
miss all in-progress `message_part` events (delivered only via the relay
WebSocket that never connects).
## Root cause
`redirectToAccessURL` in `cli/server.go` redirects any request whose
`Host` header doesn't match the access URL. The enterprise chat relay
dials another replica directly via its DERP relay address (e.g.
`http://10.0.0.2:8080`), so the `Host` header is the pod IP — not the
access URL.
This triggers a **307 redirect** to the access URL. The WebSocket
library follows the redirect, but the second request is a plain GET —
`Connection: Upgrade` and `Upgrade: websocket` headers are **not carried
over** by HTTP redirect semantics. The load-balanced access URL routes
the plain GET to any replica, which serves the SPA catch-all handler and
returns **HTTP 200 with `index.html`**.
The WebSocket library then fails: `expected handshake response status
code 101 but got 200`.
DERP mesh already has an exemption for this exact scenario
(`isDERPPath`). Chat relay was added later and didn't get one.
## Fix
Bypass `redirectToAccessURL` for requests that carry the
`X-Coder-Relay-Source-Replica` header, which the enterprise relay
already sets on every request (`enterprise/coderd/chatd/chatd.go:573`).
## Sequence diagram
**Before (broken):**
```
Replica A (subscriber) Replica B (worker) Load Balancer
| | |
|--- WS dial pod-ip:8080 ----->| |
| |-- 307 redirect to LB --->|
| | |
|<----------- plain GET (no Upgrade headers) ------------->|
| | |-- routes to any replica
|<----------- 200 index.html -------------------------------|
| |
X 'expected 101 but got 200' |
```
**After (fixed):**
```
Replica A (subscriber) Replica B (worker)
| |
|--- WS dial pod-ip:8080 ----->|
| (X-Coder-Relay-Source- |
| Replica header set) |
| |-- bypass redirect
|<--------- 101 Upgrade ------|
|<==== message_part events ====|
```
This commit is contained in:
@@ -2376,6 +2376,19 @@ func redirectToAccessURL(handler http.Handler, accessURL *url.URL, tunnel bool,
|
||||
return
|
||||
}
|
||||
|
||||
// Exception: inter-replica relay.
|
||||
// Enterprise chat streaming relays message_part events
|
||||
// between replicas by dialing the worker replica's
|
||||
// DERP relay address directly. Redirecting these
|
||||
// requests to the access URL breaks the WebSocket
|
||||
// handshake because the redirect strips the Upgrade
|
||||
// headers, causing the load-balanced access URL to
|
||||
// return HTTP 200 (SPA catch-all) instead of 101.
|
||||
if isReplicaRelayRequest(r) {
|
||||
handler.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
// Only do this if we aren't tunneling.
|
||||
// If we are tunneling, we want to allow the request to go through
|
||||
// because the tunnel doesn't proxy with TLS.
|
||||
@@ -2411,6 +2424,14 @@ func isDERPPath(p string) bool {
|
||||
return segments[1] == "derp"
|
||||
}
|
||||
|
||||
// isReplicaRelayRequest returns true when the request was sent by
|
||||
// another coderd replica as part of cross-replica streaming. The
|
||||
// enterprise chat relay sets X-Coder-Relay-Source-Replica on every
|
||||
// request to identify itself.
|
||||
func isReplicaRelayRequest(r *http.Request) bool {
|
||||
return r.Header.Get("X-Coder-Relay-Source-Replica") != ""
|
||||
}
|
||||
|
||||
// IsLocalhost returns true if the host points to the local machine. Intended to
|
||||
// be called with `u.Hostname()`.
|
||||
func IsLocalhost(host string) bool {
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
@@ -314,6 +315,30 @@ func TestIsDERPPath(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsReplicaRelayRequest(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("WithHeader", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
r, _ := http.NewRequestWithContext(context.Background(), "GET", "/api/experimental/chats/abc/stream", nil)
|
||||
r.Header.Set("X-Coder-Relay-Source-Replica", "some-uuid")
|
||||
require.True(t, isReplicaRelayRequest(r))
|
||||
})
|
||||
|
||||
t.Run("WithoutHeader", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
r, _ := http.NewRequestWithContext(context.Background(), "GET", "/api/experimental/chats/abc/stream", nil)
|
||||
require.False(t, isReplicaRelayRequest(r))
|
||||
})
|
||||
|
||||
t.Run("EmptyHeader", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
r, _ := http.NewRequestWithContext(context.Background(), "GET", "/api/experimental/chats/abc/stream", nil)
|
||||
r.Header.Set("X-Coder-Relay-Source-Replica", "")
|
||||
require.False(t, isReplicaRelayRequest(r))
|
||||
})
|
||||
}
|
||||
|
||||
func TestEscapePostgresURLUserInfo(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user