Files
coder-server/coderd/debug.go
Kacper Sawicki df2360f56a feat(coderd): add consolidated /debug/profile endpoint for pprof collection (#22892)
## Summary

Adds a new `GET /api/v2/debug/profile` endpoint that collects multiple
pprof profiles in a single request and returns them as a tar.gz archive.
This allows collecting profiles (including block and mutex) without
requiring `CODER_PPROF_ENABLE` to be set, and without restarting
`coderd`.

Closes #21679

## What it does

The endpoint:
- Temporarily enables block and mutex profiling (normally disabled at
runtime)
- Runs CPU profile and/or trace for a configurable duration (default
10s, max 60s)
- Collects snapshot profiles (heap, allocs, block, mutex, goroutine,
threadcreate)
- Returns a tar.gz archive containing all requested `.prof` files
- Uses an atomic bool to prevent concurrent collections (returns 409
Conflict)
- Is protected by the existing debug endpoint RBAC (owner-only)

**Supported profile types:** cpu, heap, allocs, block, mutex, goroutine,
threadcreate, trace

**Query parameters:**
- `duration`: How long to run timed profiles (default: `10s`, max:
`60s`)
- `profiles`: Comma-separated list of profile types (default:
`cpu,heap,allocs,block,mutex,goroutine`)

## Additional changes

- **SDK client method** (`codersdk.Client.DebugCollectProfile`) for easy
programmatic access
- **`coder support bundle --pprof` integration**: tries the consolidated
endpoint first, falls back to individual `/debug/pprof/*` endpoints for
older servers
- **8 new tests** covering defaults, custom profiles, trace+CPU,
validation errors, authorization, and conflict detection
2026-03-13 14:09:39 +00:00

685 lines
20 KiB
Go

package coderd
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"database/sql"
"encoding/json"
"fmt"
"io"
"net/http"
"runtime"
"runtime/pprof"
"runtime/trace"
"slices"
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/audit"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/coderd/httpmw"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
"github.com/coder/coder/v2/coderd/util/slice"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/healthsdk"
)
// debugCoordinator hands the request to the ServeHTTPDebug method of the
// tailnet coordinator currently stored in api.TailnetCoordinator.
//
// @Summary Debug Info Wireguard Coordinator
// @ID debug-info-wireguard-coordinator
// @Security CoderSessionToken
// @Produce text/html
// @Tags Debug
// @Success 200
// @Router /debug/coordinator [get]
func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
// Load returns a pointer; dereference it to reach the coordinator itself.
(*api.TailnetCoordinator.Load()).ServeHTTPDebug(rw, r)
}
// debugTailnet hands the request to the ServeHTTPDebug method of
// api.agentProvider.
//
// @Summary Debug Info Tailnet
// @ID debug-info-tailnet
// @Security CoderSessionToken
// @Produce text/html
// @Tags Debug
// @Success 200
// @Router /debug/tailnet [get]
func (api *API) debugTailnet(rw http.ResponseWriter, r *http.Request) {
api.agentProvider.ServeHTTPDebug(rw, r)
}
// debugDeploymentHealth responds with a deployment healthcheck report. A
// cached report is reused while it is younger than
// api.Options.HealthcheckRefresh, unless the caller passes ?force=true.
// Otherwise a healthcheck is run through api.healthCheckGroup and the handler
// waits for it until the request context (bounded by
// api.Options.HealthcheckTimeout) is done.
//
// @Summary Debug Info Deployment Health
// @ID debug-info-deployment-health
// @Security CoderSessionToken
// @Produce json
// @Tags Debug
// @Success 200 {object} healthsdk.HealthcheckReport
// @Router /debug/health [get]
// @Param force query boolean false "Force a healthcheck to run"
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
	sessionToken := httpmw.APITokenFromRequest(r)
	ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
	defer cancel()

	// Dismissed sections are resolved per request instead of inside the
	// healthcheck itself: the report is cached, so hydrating there could
	// serve stale dismissals.
	dismissed := loadDismissedHealthchecks(ctx, api.Database, api.Logger)

	// Serve from the cache unless the caller explicitly forces a refresh.
	mayUseCache := r.URL.Query().Get("force") != "true"
	if mayUseCache {
		cached := api.healthCheckCache.Load()
		if cached != nil && time.Since(cached.Time) < api.Options.HealthcheckRefresh {
			formatHealthcheck(ctx, rw, r, *cached, dismissed...)
			return
		}
	}

	runHealthcheck := func() (*healthsdk.HealthcheckReport, error) {
		// Deliberately detached from the request context.
		bgCtx, bgCancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
		defer bgCancel()

		// The progress tracker is handed to the healthcheck so a timeout
		// can report how far it got.
		fresh := api.HealthcheckFunc(bgCtx, sessionToken, &api.healthCheckProgress)
		if fresh != nil {
			// Nil reports are never cached.
			api.healthCheckCache.Store(fresh)
		}
		api.healthCheckProgress.Reset()
		return fresh, nil
	}
	results := api.healthCheckGroup.DoChan("", runHealthcheck)

	select {
	case <-ctx.Done():
		httpapi.Write(ctx, rw, http.StatusServiceUnavailable, codersdk.Response{
			Message: "Healthcheck timed out.",
			Detail:  api.healthCheckProgress.Summary(),
		})
	case result := <-results:
		if result.Val == nil {
			httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
				Message: "There was an unknown error completing the healthcheck.",
				Detail:  "nil report from healthcheck result channel",
			})
			return
		}
		formatHealthcheck(ctx, rw, r, *result.Val, dismissed...)
	}
}
// formatHealthcheck writes hc to rw in the representation selected by the
// ?format query parameter.
//
// Every section named in dismissed has its Dismissed flag set on hc before
// rendering. Supported formats:
//   - "" or "json": the report as indented JSON.
//   - "text": a plain-text summary with the time, overall health, and the
//     health of the DERP, access URL, websocket and database sections.
//
// Any other value produces a 400 Bad Request.
func formatHealthcheck(ctx context.Context, rw http.ResponseWriter, r *http.Request, hc healthsdk.HealthcheckReport, dismissed ...healthsdk.HealthSection) {
	// Mark any sections previously marked as dismissed.
	for _, d := range dismissed {
		switch d {
		case healthsdk.HealthSectionAccessURL:
			hc.AccessURL.Dismissed = true
		case healthsdk.HealthSectionDERP:
			hc.DERP.Dismissed = true
		case healthsdk.HealthSectionDatabase:
			hc.Database.Dismissed = true
		case healthsdk.HealthSectionWebsocket:
			hc.Websocket.Dismissed = true
		case healthsdk.HealthSectionWorkspaceProxy:
			hc.WorkspaceProxy.Dismissed = true
		}
	}

	format := r.URL.Query().Get("format")
	switch format {
	case "text":
		rw.Header().Set("Content-Type", "text/plain; charset=utf-8")
		rw.WriteHeader(http.StatusOK)

		// Write errors are ignored: the status line is already sent, so
		// there is nothing left to report to the client.
		_, _ = fmt.Fprintln(rw, "time:", hc.Time.Format(time.RFC3339))
		_, _ = fmt.Fprintln(rw, "healthy:", hc.Healthy)
		_, _ = fmt.Fprintln(rw, "derp:", hc.DERP.Healthy)
		_, _ = fmt.Fprintln(rw, "access_url:", hc.AccessURL.Healthy)
		_, _ = fmt.Fprintln(rw, "websocket:", hc.Websocket.Healthy)
		_, _ = fmt.Fprintln(rw, "database:", hc.Database.Healthy)

	case "", "json":
		httpapi.WriteIndent(ctx, rw, http.StatusOK, hc)

	default:
		// The advertised values must match the cases handled above.
		httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
			Message: fmt.Sprintf("Invalid format option %q.", format),
			Detail:  "Allowed values are: \"json\", \"text\".",
		})
	}
}
// deploymentHealthSettings reads the stored health settings JSON from the
// database, decodes it, and writes the result to the client.
//
// @Summary Get health settings
// @ID get-health-settings
// @Security CoderSessionToken
// @Produce json
// @Tags Debug
// @Success 200 {object} healthsdk.HealthSettings
// @Router /debug/health/settings [get]
func (api *API) deploymentHealthSettings(rw http.ResponseWriter, r *http.Request) {
	raw, err := api.Database.GetHealthSettings(r.Context())
	if err != nil {
		httpapi.Write(r.Context(), rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to fetch health settings.",
			Detail:  err.Error(),
		})
		return
	}

	var decoded healthsdk.HealthSettings
	if err := json.Unmarshal([]byte(raw), &decoded); err != nil {
		httpapi.Write(r.Context(), rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to unmarshal health settings.",
			Detail:  err.Error(),
		})
		return
	}

	// Replace a nil/empty list with an explicit empty slice; presumably so
	// the response carries an empty list rather than null.
	if len(decoded.DismissedHealthchecks) == 0 {
		decoded.DismissedHealthchecks = []healthsdk.HealthSection{}
	}
	httpapi.Write(r.Context(), rw, http.StatusOK, decoded)
}
// putDeploymentHealthSettings validates and persists new health settings.
//
// Responses:
//   - 403 when the caller may not update the deployment config.
//   - 400 when the settings name an unknown healthcheck section.
//   - 204 when the submitted settings equal the stored ones (nothing written).
//   - 200 with the settings once they have been upserted.
//   - 500 when marshaling or a database call fails.
//
// @Summary Update health settings
// @ID update-health-settings
// @Security CoderSessionToken
// @Accept json
// @Produce json
// @Tags Debug
// @Param request body healthsdk.UpdateHealthSettings true "Update health settings"
// @Success 200 {object} healthsdk.UpdateHealthSettings
// @Router /debug/health/settings [put]
func (api *API) putDeploymentHealthSettings(rw http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	if !api.Authorize(r, policy.ActionUpdate, rbac.ResourceDeploymentConfig) {
		httpapi.Write(ctx, rw, http.StatusForbidden, codersdk.Response{
			Message: "Insufficient permissions to update health settings.",
		})
		return
	}

	var settings healthsdk.HealthSettings
	if !httpapi.Read(ctx, rw, r, &settings) {
		return
	}

	// An unknown section name is a mistake in the request body, so it is
	// reported as a client error rather than a server failure.
	if err := validateHealthSettings(settings); err != nil {
		httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
			Message: "Failed to validate health settings.",
			Detail:  err.Error(),
		})
		return
	}

	settingsJSON, err := json.Marshal(&settings)
	if err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to marshal health settings.",
			Detail:  err.Error(),
		})
		return
	}

	currentSettingsJSON, err := api.Database.GetHealthSettings(ctx)
	if err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to fetch current health settings.",
			Detail:  err.Error(),
		})
		return
	}

	// Nothing changed: skip both the write and the audit entry.
	if bytes.Equal(settingsJSON, []byte(currentSettingsJSON)) {
		// See: https://www.rfc-editor.org/rfc/rfc7231#section-6.3.5
		rw.WriteHeader(http.StatusNoContent)
		return
	}

	auditor := api.Auditor.Load()
	aReq, commitAudit := audit.InitRequest[database.HealthSettings](rw, &audit.RequestParams{
		Audit:   *auditor,
		Log:     api.Logger,
		Request: r,
		Action:  database.AuditActionWrite,
	})
	defer commitAudit()

	aReq.New = database.HealthSettings{
		ID:                    uuid.New(),
		DismissedHealthchecks: slice.ToStrings(settings.DismissedHealthchecks),
	}

	err = api.Database.UpsertHealthSettings(ctx, string(settingsJSON))
	if err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to update health settings.",
			Detail:  err.Error(),
		})
		return
	}

	httpapi.Write(ctx, rw, http.StatusOK, settings)
}
// validateHealthSettings returns an error naming the first dismissed
// healthcheck that is not listed in healthsdk.HealthSections, or nil when
// every dismissed entry is known.
func validateHealthSettings(settings healthsdk.HealthSettings) error {
	for _, section := range settings.DismissedHealthchecks {
		if slices.Contains(healthsdk.HealthSections, section) {
			continue
		}
		return xerrors.Errorf("unknown healthcheck section: %s", section)
	}
	return nil
}
// For some reason the swagger docs need to be attached to a function.
// _debugws is that function for /debug/ws: it has an empty body and exists
// only to carry the annotations below.
//
// @Summary Debug Info Websocket Test
// @ID debug-info-websocket-test
// @Security CoderSessionToken
// @Produce json
// @Tags Debug
// @Success 201 {object} codersdk.Response
// @Router /debug/ws [get]
// @x-apidocgen {"skip": true}
func _debugws(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugDERPTraffic is an empty placeholder that exists only to carry the
// swagger annotations for /debug/derp/traffic.
//
// @Summary Debug DERP traffic
// @ID debug-derp-traffic
// @Security CoderSessionToken
// @Produce json
// @Success 200 {array} derp.BytesSentRecv
// @Tags Debug
// @Router /debug/derp/traffic [get]
// @x-apidocgen {"skip": true}
func _debugDERPTraffic(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugExpVar is an empty placeholder that exists only to carry the swagger
// annotations for /debug/expvar.
//
// @Summary Debug expvar
// @ID debug-expvar
// @Security CoderSessionToken
// @Produce json
// @Tags Debug
// @Success 200 {object} map[string]any
// @Router /debug/expvar [get]
// @x-apidocgen {"skip": true}
func _debugExpVar(http.ResponseWriter, *http.Request) {} //nolint:unused
// loadDismissedHealthchecks returns the healthcheck sections recorded as
// dismissed in the stored health settings. It never fails: when the settings
// cannot be fetched or decoded it returns whatever was decoded (an empty,
// non-nil slice by default) and logs the error, except for sql.ErrNoRows,
// which is not logged.
func loadDismissedHealthchecks(ctx context.Context, db database.Store, logger slog.Logger) []healthsdk.HealthSection {
	sections := []healthsdk.HealthSection{}

	raw, err := db.GetHealthSettings(ctx)
	if err == nil {
		var stored healthsdk.HealthSettings
		// err is reused so that a decode failure is logged below as well.
		err = json.Unmarshal([]byte(raw), &stored)
		if len(stored.DismissedHealthchecks) > 0 {
			sections = stored.DismissedHealthchecks
		}
	}

	if err == nil || xerrors.Is(err, sql.ErrNoRows) {
		return sections
	}
	logger.Error(ctx, "unable to fetch health settings", slog.Error(err))
	return sections
}
// ProfileCollector abstracts the mechanics of collecting pprof/trace
// data from the Go runtime. Production code uses defaultProfileCollector;
// tests can substitute a stub to avoid process-global side-effects.
type ProfileCollector interface {
// StartCPUProfile begins CPU profiling, writing to w. It returns
// a stop function that must be called to finish profiling.
// On error the handler does not call the returned stop function.
StartCPUProfile(w io.Writer) (stop func(), err error)
// StartTrace begins execution tracing, writing to w. It returns
// a stop function that must be called to finish tracing.
// On error the handler does not call the returned stop function.
StartTrace(w io.Writer) (stop func(), err error)
// LookupProfile writes the named snapshot profile to w.
// The handler calls it for every requested profile other than "cpu" and
// "trace" (heap, allocs, block, mutex, goroutine, threadcreate).
LookupProfile(name string, w io.Writer) error
// SetBlockProfileRate enables/disables block profiling.
// The handler passes 1 before collecting and 0 afterwards.
SetBlockProfileRate(rate int)
// SetMutexProfileFraction enables/disables mutex profiling.
// Returns the previous fraction.
// The handler passes 1 before collecting and restores the returned
// value afterwards.
SetMutexProfileFraction(rate int) int
}
// defaultProfileCollector delegates to the real runtime/pprof and
// runtime/trace packages. It holds no state; every method acts on the
// process-global profiler.
type defaultProfileCollector struct{}

// StartCPUProfile starts the process-wide CPU profiler writing to w and
// returns pprof.StopCPUProfile as the stop function. On error the returned
// stop function is nil.
func (defaultProfileCollector) StartCPUProfile(w io.Writer) (func(), error) {
	if err := pprof.StartCPUProfile(w); err != nil {
		return nil, err
	}
	return pprof.StopCPUProfile, nil
}

// StartTrace starts the execution tracer writing to w and returns trace.Stop
// as the stop function. On error the returned stop function is nil.
func (defaultProfileCollector) StartTrace(w io.Writer) (func(), error) {
	if err := trace.Start(w); err != nil {
		return nil, err
	}
	return trace.Stop, nil
}

// LookupProfile writes the named runtime profile to w in the compressed
// protobuf format (debug level 0). It returns an error when no profile with
// that name is registered, instead of silently writing nothing, so callers
// never package an empty file as if it were a valid profile.
func (defaultProfileCollector) LookupProfile(name string, w io.Writer) error {
	p := pprof.Lookup(name)
	if p == nil {
		return fmt.Errorf("unknown profile %q", name)
	}
	return p.WriteTo(w, 0)
}

// SetBlockProfileRate forwards rate to runtime.SetBlockProfileRate.
func (defaultProfileCollector) SetBlockProfileRate(rate int) {
	runtime.SetBlockProfileRate(rate)
}

// SetMutexProfileFraction forwards rate to runtime.SetMutexProfileFraction
// and returns the previous fraction.
func (defaultProfileCollector) SetMutexProfileFraction(rate int) int {
	return runtime.SetMutexProfileFraction(rate)
}
// defaultProfiles is the set of profiles collected when none are specified.
// "threadcreate" and "trace" are accepted by the endpoint but are not part of
// this default set.
var defaultProfiles = []string{"cpu", "heap", "allocs", "block", "mutex", "goroutine"}
// allValidProfiles enumerates every profile name accepted by the endpoint.
// A requested name that is missing from this map is rejected with
// 400 Bad Request.
var allValidProfiles = map[string]bool{
"cpu": true,
"heap": true,
"allocs": true,
"block": true,
"mutex": true,
"goroutine": true,
"threadcreate": true,
"trace": true,
}
const (
// profileDurationDefault is used when no ?duration is supplied.
profileDurationDefault = 10 * time.Second
// profileDurationMax prevents callers from asking for arbitrarily long
// collections that tie up the runtime-global CPU profiler.
// Requests above this value are rejected rather than clamped.
profileDurationMax = 60 * time.Second
)
// debugCollectProfile collects the requested profiles and responds with them
// bundled in one tar.gz archive.
//
// Query parameters:
//   - duration: how long the timed collections (cpu, trace) run. Defaults to
//     profileDurationDefault; must be positive and at most profileDurationMax.
//   - profiles: comma-separated profile names, each a key of
//     allValidProfiles. Surrounding whitespace is ignored and repeated names
//     are collected once. Defaults to defaultProfiles.
//
// Responses:
//   - 400 for an invalid duration or an unknown profile name.
//   - 409 when another collection is already running.
//   - 500 when starting, collecting or archiving a profile fails.
//   - 200 with the archive otherwise. Nothing is written when the client
//     disconnects during the timed collection.
//
// @Summary Collect debug profiles
// @ID collect-debug-profiles
// @Security CoderSessionToken
// @Tags Debug
// @Success 200
// @Router /debug/profile [post]
// @x-apidocgen {"skip": true}
func (api *API) debugCollectProfile(rw http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	// Parse duration.
	duration := profileDurationDefault
	if v := r.URL.Query().Get("duration"); v != "" {
		d, err := time.ParseDuration(v)
		if err != nil {
			httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
				Message: "Invalid duration parameter.",
				Detail:  err.Error(),
			})
			return
		}
		if d <= 0 {
			httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
				Message: "Duration must be positive.",
			})
			return
		}
		if d > profileDurationMax {
			httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
				Message: fmt.Sprintf("Duration cannot exceed %s.", profileDurationMax),
			})
			return
		}
		duration = d
	}

	// Parse requested profiles.
	profiles := defaultProfiles
	if v := r.URL.Query().Get("profiles"); v != "" {
		requested, invalid, ok := parseRequestedProfiles(v)
		if !ok {
			httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
				Message: fmt.Sprintf("Unknown profile type: %q.", invalid),
				Detail:  "Valid types: cpu, heap, allocs, block, mutex, goroutine, threadcreate, trace",
			})
			return
		}
		profiles = requested
	}

	// Only one profile collection can run at a time because the CPU
	// profiler is process-global.
	if !api.ProfileCollecting.CompareAndSwap(false, true) {
		httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{
			Message: "A profile collection is already in progress. Try again later.",
		})
		return
	}
	// Registered first so it runs last: the flag is released only after the
	// profiling rates below have been restored.
	defer api.ProfileCollecting.Store(false)

	// Temporarily enable block and mutex profiling so those profiles are
	// actually populated. Restore previous values when we are done.
	// SetBlockProfileRate does not return the previous value, so we
	// simply disable it again after collection (the default is 0).
	pc := api.ProfileCollector
	pc.SetBlockProfileRate(1)
	prevMutexFraction := pc.SetMutexProfileFraction(1)
	defer pc.SetBlockProfileRate(0)
	defer pc.SetMutexProfileFraction(prevMutexFraction)

	// Determine which profiles need the timed collection (cpu, trace) vs
	// instant snapshots.
	wantCPU := slices.Contains(profiles, "cpu")
	wantTrace := slices.Contains(profiles, "trace")

	// Collect timed profiles (cpu and/or trace) for the requested
	// duration. StartCPUProfile and StartTrace each return a stop
	// function that must be called to finish collection.
	var cpuBuf, traceBuf bytes.Buffer
	var stopCPU, stopTrace func()
	// stopTimed finishes whichever timed collections were started.
	stopTimed := func() {
		if stopCPU != nil {
			stopCPU()
		}
		if stopTrace != nil {
			stopTrace()
		}
	}

	if wantCPU {
		var err error
		stopCPU, err = pc.StartCPUProfile(&cpuBuf)
		if err != nil {
			httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
				Message: "Failed to start CPU profile.",
				Detail:  err.Error(),
			})
			return
		}
	}
	if wantTrace {
		var err error
		stopTrace, err = pc.StartTrace(&traceBuf)
		if err != nil {
			// The trace never started; only a running CPU profile needs
			// to be stopped.
			if stopCPU != nil {
				stopCPU()
			}
			httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
				Message: "Failed to start trace.",
				Detail:  err.Error(),
			})
			return
		}
	}

	if wantCPU || wantTrace {
		timer := api.Clock.NewTimer(duration, "debugCollectProfile")
		defer timer.Stop()

		select {
		case <-ctx.Done():
			stopTimed()
			// Client disconnected; nothing to write.
			return
		case <-timer.C:
		}
		stopTimed()
	}

	// Build the tar.gz archive. It is assembled fully in memory so that any
	// failure can still be reported with a proper error status.
	var archive bytes.Buffer
	gzw := gzip.NewWriter(&archive)
	tw := tar.NewWriter(gzw)

	addFile := func(name string, data []byte) error {
		hdr := &tar.Header{
			Name: name,
			Mode: 0o644,
			Size: int64(len(data)),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return xerrors.Errorf("write tar header for %s: %w", name, err)
		}
		if _, err := tw.Write(data); err != nil {
			return xerrors.Errorf("write tar data for %s: %w", name, err)
		}
		return nil
	}

	for _, p := range profiles {
		switch p {
		case "cpu":
			if err := addFile("cpu.prof", cpuBuf.Bytes()); err != nil {
				httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
					Message: "Failed to write CPU profile to archive.",
					Detail:  err.Error(),
				})
				return
			}
		case "trace":
			if err := addFile("trace.out", traceBuf.Bytes()); err != nil {
				httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
					Message: "Failed to write trace to archive.",
					Detail:  err.Error(),
				})
				return
			}
		default:
			// Snapshot profiles: heap, allocs, block, mutex, goroutine,
			// threadcreate.
			var buf bytes.Buffer
			if err := pc.LookupProfile(p, &buf); err != nil {
				httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
					Message: fmt.Sprintf("Failed to collect %s profile.", p),
					Detail:  err.Error(),
				})
				return
			}
			if err := addFile(p+".prof", buf.Bytes()); err != nil {
				httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
					Message: fmt.Sprintf("Failed to write %s profile to archive.", p),
					Detail:  err.Error(),
				})
				return
			}
		}
	}

	// Close the tar writer before the gzip writer so the tar trailer is
	// flushed into the compressed stream.
	if err := tw.Close(); err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to finalize tar archive.",
			Detail:  err.Error(),
		})
		return
	}
	if err := gzw.Close(); err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Failed to finalize gzip archive.",
			Detail:  err.Error(),
		})
		return
	}

	filename := fmt.Sprintf("coderd-profile-%d.tar.gz", time.Now().Unix())
	rw.Header().Set("Content-Type", "application/gzip")
	rw.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", filename))
	rw.WriteHeader(http.StatusOK)
	// The status is already sent; a failed body write cannot be reported.
	_, _ = rw.Write(archive.Bytes())
}

// parseRequestedProfiles splits the comma-separated value of the ?profiles
// query parameter into profile names. Whitespace around each entry is
// trimmed and repeated names are dropped, keeping first-seen order, so the
// archive never contains two entries with the same file name. When an entry
// is not a key of allValidProfiles (this includes empty entries) it returns
// that entry and ok=false.
func parseRequestedProfiles(raw string) (names []string, invalid string, ok bool) {
	seen := make(map[string]struct{})
	for _, entry := range strings.Split(raw, ",") {
		name := strings.TrimSpace(entry)
		if !allValidProfiles[name] {
			return nil, name, false
		}
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		names = append(names, name)
	}
	return names, "", true
}
// _debugPprofIndex is an empty placeholder that exists only to carry the
// swagger annotations for /debug/pprof.
//
// @Summary Debug pprof index
// @ID debug-pprof-index
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/pprof [get]
// @x-apidocgen {"skip": true}
func _debugPprofIndex(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugPprofCmdline is an empty placeholder that exists only to carry the
// swagger annotations for /debug/pprof/cmdline.
//
// @Summary Debug pprof cmdline
// @ID debug-pprof-cmdline
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/pprof/cmdline [get]
// @x-apidocgen {"skip": true}
func _debugPprofCmdline(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugPprofProfile is an empty placeholder that exists only to carry the
// swagger annotations for /debug/pprof/profile.
//
// @Summary Debug pprof profile
// @ID debug-pprof-profile
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/pprof/profile [get]
// @x-apidocgen {"skip": true}
func _debugPprofProfile(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugPprofSymbol is an empty placeholder that exists only to carry the
// swagger annotations for /debug/pprof/symbol.
//
// @Summary Debug pprof symbol
// @ID debug-pprof-symbol
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/pprof/symbol [get]
// @x-apidocgen {"skip": true}
func _debugPprofSymbol(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugPprofTrace is an empty placeholder that exists only to carry the
// swagger annotations for /debug/pprof/trace.
//
// @Summary Debug pprof trace
// @ID debug-pprof-trace
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/pprof/trace [get]
// @x-apidocgen {"skip": true}
func _debugPprofTrace(http.ResponseWriter, *http.Request) {} //nolint:unused
// _debugMetrics is an empty placeholder that exists only to carry the
// swagger annotations for /debug/metrics.
//
// @Summary Debug metrics
// @ID debug-metrics
// @Security CoderSessionToken
// @Success 200
// @Tags Debug
// @Router /debug/metrics [get]
// @x-apidocgen {"skip": true}
func _debugMetrics(http.ResponseWriter, *http.Request) {} //nolint:unused