Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/root/api.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package root

import (
"cmp"
"errors"
"fmt"
"log/slog"
Expand All @@ -13,6 +14,7 @@ import (
"github.com/docker/docker-agent/pkg/cli"
"github.com/docker/docker-agent/pkg/config"
pathx "github.com/docker/docker-agent/pkg/path"
"github.com/docker/docker-agent/pkg/profiling"
"github.com/docker/docker-agent/pkg/server"
"github.com/docker/docker-agent/pkg/session"
"github.com/docker/docker-agent/pkg/telemetry"
Expand All @@ -25,6 +27,7 @@ type apiFlags struct {
fakeResponses string
recordPath string
authToken string
pprofAddr string
runConfig config.RuntimeConfig
}

Expand All @@ -44,6 +47,8 @@ func newAPICmd() *cobra.Command {
cmd.PersistentFlags().StringVar(&flags.fakeResponses, "fake", "", "Replay AI responses from cassette file (for testing)")
cmd.PersistentFlags().StringVar(&flags.recordPath, "record", "", "Record AI API interactions to cassette file")
cmd.PersistentFlags().StringVar(&flags.authToken, "auth-token", "", "Bearer token required for API requests (empty = no authentication)")
cmd.PersistentFlags().StringVar(&flags.pprofAddr, "pprof-addr", "", "TCP host:port to expose Go pprof endpoints at /debug/pprof/ (e.g. 127.0.0.1:6060); also set via CAGENT_PPROF_ADDR")
_ = cmd.PersistentFlags().MarkHidden("pprof-addr")
cmd.MarkFlagsMutuallyExclusive("fake", "record")
addRuntimeConfigFlags(cmd, &flags.runConfig)

Expand Down Expand Up @@ -89,6 +94,12 @@ func (f *apiFlags) runAPICommand(cmd *cobra.Command, args []string) (commandErr
return errors.New("--pull-interval flag can only be used with OCI or URL references, not local files")
}

if pprofAddr := cmp.Or(f.pprofAddr, os.Getenv("CAGENT_PPROF_ADDR")); pprofAddr != "" {
if err := profiling.StartPprofServer(ctx, pprofAddr); err != nil {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] pprof bind failure aborts the entire serve api command

If the address set via --pprof-addr / CAGENT_PPROF_ADDR is unavailable (e.g. port already in use, address already bound by another process, or a typo), StartPprofServer returns an error that propagates here — preventing the main API listener from ever starting.

This is a disproportionate failure mode for a debug-only facility: a misconfigured CAGENT_PPROF_ADDR environment variable in a production deployment would silently take the entire agent down. Consider logging the error and continuing rather than returning it fatally, or at minimum documenting that the env-var can cause a hard service outage:

if err := profiling.StartPprofServer(ctx, pprofAddr); err != nil {
    // OPTION A – degrade gracefully
    slog.WarnContext(ctx, "pprof server failed to start; continuing without profiling", "error", err)
    // OPTION B – keep fatal, but make the impact explicit in the flag help text
}

return err
}
}

ln, lnCleanup, err := newListener(ctx, f.listenAddr)
if err != nil {
return err
Expand Down
69 changes: 69 additions & 0 deletions pkg/profiling/pprof_server.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package profiling

import (
"context"
"errors"
"fmt"
"log/slog"
"net"
"net/http"
"net/http/pprof"
"time"
)

// StartPprofServer starts an HTTP server exposing Go runtime profiling endpoints
// at /debug/pprof/ on the given addr. It binds the listener synchronously and
// returns an error if the address is unavailable. The server runs in a background
// goroutine and shuts down when ctx is cancelled.
// addr must be a TCP host:port address (e.g. "127.0.0.1:6060"); unix://, npipe://,
// and fd:// schemes are not supported. Prefer a loopback address over a bare port
// (":6060") — the latter binds all interfaces, exposing process memory and arguments
// to the network.
func StartPprofServer(ctx context.Context, addr string) error {
mux := http.NewServeMux()
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)

ln, err := (&net.ListenConfig{}).Listen(ctx, "tcp", addr)
if err != nil {
return fmt.Errorf("pprof: listen on %s: %w", addr, err)
}

// ReadHeaderTimeout guards against slow-loris connections on the debug port.
// WriteTimeout is intentionally omitted: profile/trace captures legitimately
// run for tens of seconds and would be truncated by a short write deadline.
srv := &http.Server{Handler: mux, ReadHeaderTimeout: 10 * time.Second}

slog.InfoContext(ctx, "pprof server listening", "addr", ln.Addr().String())
if tcpAddr, ok := ln.Addr().(*net.TCPAddr); ok && !tcpAddr.IP.IsLoopback() {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Non-loopback pprof binding warns but does not refuse — unauthenticated memory access over the network

When a non-loopback address (e.g. 0.0.0.0:6060) is used, the warning is logged and the server starts anyway. Unlike the main API listener — which requires bearer-token authentication — pprof requires no authentication and exposes:

  • /debug/pprof/heap — full heap dump (may contain in-memory secrets, tokens, TLS keys)
  • /debug/pprof/cmdline — process command-line arguments (may contain passwords)
  • /debug/pprof/goroutine — goroutine stacks (code paths, internal state)

The existing warnIfNotLoopback for the main API listener is a reasonable pattern because that listener is authenticated. For an unauthenticated endpoint, a warn-and-proceed policy means any actor who can set CAGENT_PPROF_ADDR=0.0.0.0:6060 in the environment (e.g. via a Docker Compose override or a K8s config map) can read process memory from the network with no further credentials.

Consider either rejecting non-loopback addresses outright, or making the security implication explicit in user-facing documentation beyond the hidden flag help text.

slog.WarnContext(ctx, "pprof server is listening on a non-loopback address — "+
"/debug/pprof/cmdline and heap profiles are network-reachable without authentication",
"addr", tcpAddr.String())
}

go func() {
if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
slog.Warn("pprof server error", "error", err)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[LOW] slog.Warn used without context — loses trace IDs and structured context values

The serve goroutine calls slog.Warn(...) (context-free variant) while every other log statement in this file uses a context-aware variant (slog.InfoContext, slog.WarnContext). The goroutine closes over ctx, so the context is available and should be passed.

// Current (loses ctx values):
slog.Warn("pprof server error", "error", err)

// Should be:
slog.WarnContext(ctx, "pprof server error", "error", err)

This loses any trace IDs or structured context fields attached to ctx at the time of the error, making it harder to correlate this log entry with the request/session that triggered the failure.

}
}()

go func() {
<-ctx.Done()
// 5s grace: favors prompt process exit over draining in-flight profile
// captures. CPU/trace profiles run up to 30s by default; callers should
// cancel their requests before the process exits rather than relying on
// this timeout to drain them.
// context.WithoutCancel preserves ctx values (e.g. trace IDs) without
// inheriting the cancellation, so the shutdown timeout is not pre-expired.
shutdownCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
defer cancel()
if err := srv.Shutdown(shutdownCtx); err != nil {
slog.WarnContext(shutdownCtx, "pprof server shutdown error", "error", err)
}
}()

return nil
}
Loading