docker · aheritier · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -127,6 +127,8 @@ MCP and LSP toolsets are managed by a supervisor that auto-restarts them when th
 - `/tools` — the unified tools dialog. Its top section lists every toolset with its current state (`Stopped`, `Starting`, `Ready`, `Degraded`, `Restarting`, `Failed`), restart count, and last error; the bottom section lists every tool the agent can call. Start here whenever a tool seems missing or stuck.
 - `/toolset-restart <name>` — force a supervisor-driven reconnect of the named toolset. Useful after completing OAuth, when a remote MCP server has been redeployed, or when a language server like `gopls` is unresponsive.
 
+Remote MCP toolsets whose server returns `401 invalid_token` (e.g. because the stored OAuth token was revoked or rotated) now recover on their own: docker-agent silently exchanges the refresh token for a new access token when possible, or surfaces an OAuth re-authentication prompt on your next message when a refresh is not possible. Toolsets no longer get stuck until the process is restarted; if you want to trigger re-auth immediately instead of waiting for your next message, run `/toolset-restart <name>`.
+
 MCP tools using stdio transport must complete the initialization handshake before becoming available. If tools fail silently:
 
 1. Run `/tools` to see whether the toolset is `Failed` or stuck in `Restarting`, and what the last error was.

@@ -64,6 +64,17 @@ Set `allow_private_ips: true` on a remote MCP toolset only when the MCP server o
   <p>Remote MCP connections (Streamable HTTP / SSE) automatically reconnect after the server closes an idle connection — no configuration needed. Services like Notion and Linear close idle connections periodically; docker-agent detects the clean close and reconnects with exponential backoff. To tune reconnect behaviour or disable reconnection entirely, use the <a href="{{ '/configuration/tools/#toolset-lifecycle' | relative_url }}"><code>lifecycle</code> block</a>.</p>
 </div>
 
+<div class="callout callout-info" markdown="1">
+<div class="callout-title">Automatic recovery from revoked or rotated OAuth tokens
+</div>
+  <p>If a remote MCP server rejects the cached token with a <code>401 invalid_token</code> error (for example, because the token was revoked or rotated server-side), docker-agent handles the failure automatically:</p>
+  <ul>
+    <li><strong>Silent refresh:</strong> when a refresh token is available, docker-agent silently exchanges it for a new access token and replays the request — no user interaction required.</li>
+    <li><strong>Re-authentication prompt:</strong> when the refresh token is absent or has also expired, the toolset transitions to a "needs re-auth" state and surfaces an OAuth prompt on your next message (exactly like the first-time flow).</li>
+  </ul>
+  <p>Either way, the agent never burns 5 reconnect attempts on an auth failure — it fails fast and either refreshes silently or defers to interactive re-auth. If you want to trigger re-auth immediately without waiting for the next message, run <code>/toolset-restart &lt;name&gt;</code> from the TUI.</p>
+</div>
+
 ### OAuth for servers without Dynamic Client Registration
 
 Most remote MCP servers that require OAuth support [Dynamic Client Registration (RFC 7591)]({{ 'https://datatracker.ietf.org/doc/html/rfc7591' }}) — no configuration is needed, docker-agent handles the flow for you.

@@ -79,7 +79,7 @@ toolsets:
 
 ## Remote MCP (Streamable HTTP / SSE)
 
-Connect to MCP servers over the network. OAuth flows (including [Dynamic Client Registration](https://datatracker.ietf.org/doc/html/rfc7591)) are handled automatically — docker-agent opens your browser when authentication is required and caches tokens for subsequent sessions.
+Connect to MCP servers over the network. OAuth flows (including [Dynamic Client Registration](https://datatracker.ietf.org/doc/html/rfc7591)) are handled automatically — docker-agent opens your browser when authentication is required and caches tokens for subsequent sessions. Tokens are refreshed silently when they expire or are revoked server-side; if a silent refresh is not possible, the OAuth prompt reappears on the next message.
 
 ```yaml
 toolsets:

@@ -15,6 +15,7 @@ import (
 	"github.com/docker/docker-agent/pkg/config/types"
 	"github.com/docker/docker-agent/pkg/model/provider"
 	"github.com/docker/docker-agent/pkg/tools"
+	mcptools "github.com/docker/docker-agent/pkg/tools/mcp"
 )
 
 // Agent represents an AI agent
@@ -409,6 +410,18 @@ func (a *Agent) ensureToolSetsAreStarted(ctx context.Context) {
 			continue
 		}
 		desc := tools.DescribeToolSet(toolSet)
+		if mcptools.IsAuthorizationRequired(err) {
+			// Recovery: previously-working toolset lost its OAuth token in the
+			// background. Emit the targeted re-auth notice once per streak so the
+			// user knows a dialog will appear on their next message.
+			// Initial-startup auth deferral (ShouldReportRecoveryFailure==false)
+			// stays silent — the dialog appears naturally on the first turn.
+			if toolSet.ShouldReportRecoveryFailure() {
+				slog.WarnContext(ctx, "Toolset needs re-authentication after background token rejection", "agent", a.Name(), "toolset", desc)
+				a.AddToolWarning(desc + " needs re-authentication — it will prompt on your next message, or use /toolset-restart")
+			}
+			continue
+		}
 		if toolSet.ShouldReportFailure() {
 			slog.WarnContext(ctx, "Toolset start failed; will retry on next turn", "agent", a.Name(), "toolset", desc, "error", err)
 			a.AddToolWarning(fmt.Sprintf("%s start failed: %v", desc, err))

@@ -1243,41 +1243,53 @@ func (r *LocalRuntime) emitToolsProgressively(ctx context.Context, a *agent.Agen
 
 		isLast := i == totalToolsets-1
 
-		// Start the toolset if needed
+		// Start the toolset if needed, including recovery: a previously-started
+		// toolset whose inner connection died (e.g. background invalid_token)
+		// must have its recovery Start() called here so ShouldReportRecoveryFailure
+		// can fire the targeted re-auth notice. Start() is a no-op when the
+		// toolset is already healthy, so calling it unconditionally is safe.
 		if startable, ok := toolset.(*tools.StartableToolSet); ok {
-			if !startable.IsStarted() {
-				if err := startable.Start(ctx); err != nil {
-					desc := tools.DescribeToolSet(startable.ToolSet)
-					// IsAuthorizationRequired must be checked BEFORE
-					// ShouldReportFailure: this is the first — expected —
-					// failure of a deferred-OAuth toolset, and consuming the
-					// failure-reported flag here would suppress the *real*
-					// failure (e.g. server 4xx on the eventual interactive
-					// retry) that the user actually needs to see.
-					if mcptools.IsAuthorizationRequired(err) {
-						// The toolset just needs an OAuth approval that we
-						// deliberately deferred until the user is interacting
-						// with the agent. The dialog will appear naturally on
-						// the first RunStream — no need to pre-announce it.
+			if err := startable.Start(ctx); err != nil {
+				desc := tools.DescribeToolSet(startable.ToolSet)
+				// IsAuthorizationRequired must be checked BEFORE
+				// ShouldReportFailure: this is the first — expected —
+				// failure of a deferred-OAuth toolset, and consuming the
+				// failure-reported flag here would suppress the *real*
+				// failure (e.g. server 4xx on the eventual interactive
+				// retry) that the user actually needs to see.
+				if mcptools.IsAuthorizationRequired(err) {
+					// Two cases:
+					// 1. Initial startup deferral (toolset never ran): the
+					//    OAuth dialog will appear naturally on the first user
+					//    message — no need to pre-announce it.
+					// 2. Recovery: the toolset was previously working but the
+					//    background watcher detected a server-side invalid_token
+					//    (fixes #3198). Surface a deduped re-auth notice so the
+					//    user knows what is about to prompt on their next message.
+					if startable.ShouldReportRecoveryFailure() {
+						slog.WarnContext(ctx, "Toolset needs re-authentication after background token rejection",
+							"agent", a.Name(), "toolset", desc)
+						a.AddToolWarning(desc + " needs re-authentication — it will prompt on your next message, or use /toolset-restart")
+					} else {
 						slog.DebugContext(ctx, "Toolset deferred until first message", "agent", a.Name(), "toolset", desc, "reason", err)
-						continue
 					}
-					// Route real failures through the agent's warning
-					// channel so the TUI surfaces a persistent,
-					// user-visible notice that includes the actual
-					// server-side cause (threaded through by
-					// remoteMCPClient.Initialize). Use the same
-					// once-per-streak guard as ensureToolSetsAreStarted
-					// so a failing toolset doesn't flood the UI with a
-					// new warning every time the agent is restarted.
-					if !startable.ShouldReportFailure() {
-						slog.DebugContext(ctx, "Toolset still unavailable; skipping", "agent", a.Name(), "toolset", desc, "error", err)
-						continue
-					}
-					slog.WarnContext(ctx, "Toolset start failed; skipping", "agent", a.Name(), "toolset", desc, "error", err)
-					a.AddToolWarning(fmt.Sprintf("%s start failed: %v", desc, err))
 					continue
 				}
+				// Route real failures through the agent's warning
+				// channel so the TUI surfaces a persistent,
+				// user-visible notice that includes the actual
+				// server-side cause (threaded through by
+				// remoteMCPClient.Initialize). Use the same
+				// once-per-streak guard as ensureToolSetsAreStarted
+				// so a failing toolset doesn't flood the UI with a
+				// new warning every time the agent is restarted.
+				if !startable.ShouldReportFailure() {
+					slog.DebugContext(ctx, "Toolset still unavailable; skipping", "agent", a.Name(), "toolset", desc, "error", err)
+					continue
+				}
+				slog.WarnContext(ctx, "Toolset start failed; skipping", "agent", a.Name(), "toolset", desc, "error", err)
+				a.AddToolWarning(fmt.Sprintf("%s start failed: %v", desc, err))
+				continue
 			}
 		}
 

@@ -1233,6 +1233,99 @@ func TestEmitStartupInfo_DeferredAuthDoesNotConsumeFailureGate(t *testing.T) {
 			"and the user sees zero tools with no explanation")
 }
 
+// recoveryAuthToolSet simulates a toolset whose first Start() always succeeds,
+// and whose Restart() returns a configurable error (used to simulate a
+// background invalid_token loss after a prior successful start).
+// IsStarted() reflects live connection state so StartableToolSet.Start() can
+// detect the "inner went dead" recovery scenario.
+type recoveryAuthToolSet struct {
+	started    bool
+	restartErr error
+}
+
+func (r *recoveryAuthToolSet) Tools(context.Context) ([]tools.Tool, error) { return nil, nil }
+func (r *recoveryAuthToolSet) Start(context.Context) error                 { r.started = true; return nil }
+func (r *recoveryAuthToolSet) Stop(context.Context) error                  { r.started = false; return nil }
+func (r *recoveryAuthToolSet) IsStarted() bool                             { return r.started }
+func (r *recoveryAuthToolSet) Restart(context.Context) error               { return r.restartErr }
+
+// TestEmitStartupInfo_RecoveryAuthNoticeEmittedOnce is the regression test for
+// blocking issue 3: when a toolset was previously started and working but the
+// background watcher detected a server-side invalid_token, the next call to
+// emitToolsProgressively must attempt a recovery Start() and emit exactly one
+// targeted re-auth notice. Initial-startup auth deferral (toolset never worked
+// before) must remain silent. The streak resets on success so a subsequent
+// background failure produces a fresh notice.
+func TestEmitStartupInfo_RecoveryAuthNoticeEmittedOnce(t *testing.T) {
+	prov := &mockProvider{id: "test/startup-model", stream: &mockStream{}}
+	authErr := &mcptools.AuthorizationRequiredError{URL: "https://example.test/mcp"}
+
+	inner := &recoveryAuthToolSet{restartErr: authErr}
+	root := agent.New("root", "agent",
+		agent.WithModel(prov),
+		agent.WithToolSets(inner),
+	)
+	tm := team.New(team.WithAgents(root))
+	rt, err := NewLocalRuntime(tm, WithCurrentAgent("root"), WithModelStore(mockModelStore{}))
+	require.NoError(t, err)
+
+	var wrapped *tools.StartableToolSet
+	for _, ts := range root.ToolSets() {
+		if s, ok := ts.(*tools.StartableToolSet); ok {
+			wrapped = s
+			break
+		}
+	}
+	require.NotNil(t, wrapped, "agent.ToolSets() must wrap the inner toolset in a *tools.StartableToolSet")
+
+	// nopSend discards sidebar events; we inspect agent.DrainWarnings() instead.
+	nopSend := func(Event) bool { return true }
+	// Mirror EmitStartupInfo's non-interactive context so toolsets with OAuth
+	// fail fast rather than blocking on a prompt.
+	ctx := mcptools.WithoutInteractivePrompts(t.Context())
+
+	// Phase 1: initial startup — inner.Start() succeeds (first call); no recovery
+	// notice because the toolset was never previously working.
+	rt.emitToolsProgressively(ctx, root, nopSend)
+	_ = root.DrainWarnings() // clear any unrelated warnings
+	require.True(t, wrapped.IsStarted(), "toolset must be started after initial success")
+
+	// Phase 2: background failure — inner loses its connection (e.g. server-side
+	// invalid_token eviction set the live started flag to false).
+	inner.started = false
+
+	// First emitToolsProgressively after the background failure: recovery Start()
+	// is attempted (Restart returns authErr), and exactly one targeted notice is
+	// added to the agent's warning queue.
+	rt.emitToolsProgressively(ctx, root, nopSend)
+	noticesPhase2 := root.DrainWarnings()
+	require.Len(t, noticesPhase2, 1,
+		"exactly one targeted re-auth notice must be emitted on the first recovery failure")
+	assert.Contains(t, noticesPhase2[0], "needs re-authentication",
+		"recovery notice must use the targeted re-auth framing, not the generic start-failed message")
+
+	// Dedup: the notice was already consumed by emitToolsProgressively, so a
+	// direct call must return false (streak still active, nothing pending).
+	assert.False(t, wrapped.ShouldReportRecoveryFailure(),
+		"ShouldReportRecoveryFailure must return false after the first notice was emitted (dedup)")
+
+	// Phase 3: inner recovers — successful Start() (via inner.Start() since
+	// wrapped.started==false after failed Restart) resets the recovery streak.
+	inner.started = true
+	rt.emitToolsProgressively(ctx, root, nopSend)
+	_ = root.DrainWarnings()
+	require.True(t, wrapped.IsStarted(), "toolset must be re-started after recovery")
+	assert.False(t, wrapped.ShouldReportRecoveryFailure(),
+		"recovery streak must be reset after a successful Start")
+
+	// Phase 4: background failure again — streak was reset, so a fresh notice
+	// is expected (verifies reset-on-success behavior).
+	inner.started = false
+	rt.emitToolsProgressively(ctx, root, nopSend)
+	noticesPhase4 := root.DrainWarnings()
+	require.Len(t, noticesPhase4, 1, "fresh failure after streak reset must emit a new notice")
+}
+
 // TestEmitAgentWarnings_OnlyEmitsFailures verifies that emitAgentWarnings
 // only surfaces real failures to the user. Recovery is intentionally
 // silent: a previously-failed toolset becoming available again does NOT

@@ -76,6 +76,15 @@ func classifyByMessage(err error) error {
 		strings.Contains(lower, "broken pipe"),
 		strings.Contains(msg, "EOF"):
 		return wrap(ErrTransport, err)
+	// Map server-side OAuth token rejection to ErrAuthRequired. We match
+	// "invalid_token" (the RFC 6750 §3.1 canonical error code) and its
+	// space-separated variant. We deliberately do NOT match bare
+	// "unauthorized" here, to avoid classifying application-level 401s
+	// (unrelated to OAuth) as permanent auth failures; the token-was-attached
+	// gating in oauthTransport.roundTrip is the correct place for that check.
+	case strings.Contains(lower, "invalid_token"),
+		strings.Contains(lower, "invalid token"):
+		return wrap(ErrAuthRequired, err)
 	}
 	return err
 }

@@ -81,6 +81,43 @@ func TestClassify_AlreadyClassifiedPasses(t *testing.T) {
 	assert.Check(t, errors.Is(got, lifecycle.ErrAuthRequired))
 }
 
+func TestClassify_InvalidToken(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		name string
+		msg  string
+	}{
+		{"rfc6750_error_code", `401 Unauthorized: {"error":"invalid_token","error_description":"Invalid access token"}`},
+		{"space_variant", "server rejected token: invalid token"},
+		{"upper_case", "INVALID_TOKEN: token expired"},
+	}
+	for _, tc := range cases {
+		got := lifecycle.Classify(errors.New(tc.msg))
+		assert.Check(t, errors.Is(got, lifecycle.ErrAuthRequired), "msg=%q", tc.msg)
+		assert.Check(t, lifecycle.IsPermanent(got), "msg=%q: must be permanent", tc.msg)
+	}
+}
+
+func TestClassify_BareUnauthorizedIsNotAuth(t *testing.T) {
+	t.Parallel()
+	// A bare "unauthorized" without "invalid_token" must NOT be classified as
+	// ErrAuthRequired to avoid misreading application-level 401s as permanent
+	// auth failures (see human decision Q3 in the implementation plan).
+	got := lifecycle.Classify(errors.New("401 Unauthorized"))
+	assert.Check(t, !errors.Is(got, lifecycle.ErrAuthRequired), "bare unauthorized must not map to ErrAuthRequired")
+}
+
+func TestClassify_InvalidToken_Idempotent(t *testing.T) {
+	t.Parallel()
+	// Classify must be idempotent: an already-wrapped ErrAuthRequired that
+	// also contains "invalid_token" in its message must not be double-wrapped.
+	inner := errors.New("invalid_token: expired")
+	first := lifecycle.Classify(inner)
+	second := lifecycle.Classify(first)
+	assert.Check(t, errors.Is(second, lifecycle.ErrAuthRequired))
+	assert.Check(t, errors.Is(second, inner))
+}
+
 func TestClassify_UnknownPassthrough(t *testing.T) {
 	t.Parallel()
 	in := errors.New("totally unrelated")

@@ -10,6 +10,31 @@ import (
 	"time"
 )
 
+// backgroundReconnectKey is a context key that the supervisor attaches to
+// connector.Connect calls made during background watcher reconnect attempts,
+// distinguishing them from the initial interactive Start. Connector
+// implementations (e.g. the MCP clientConnector) use this to apply
+// non-interactive constraints on background reconnects so a 401 defers
+// cleanly rather than blocking on a dead elicitation bridge.
+type backgroundReconnectKey struct{}
+
+// withBackgroundReconnect returns a copy of ctx marked as a background
+// reconnect attempt. It is set by tryRestart before calling
+// connector.Connect so the connector can distinguish watcher reconnects
+// from the initial interactive Start.
+func withBackgroundReconnect(ctx context.Context) context.Context {
+	return context.WithValue(ctx, backgroundReconnectKey{}, true)
+}
+
+// IsBackgroundReconnect reports whether ctx was created by the supervisor
+// for a background reconnect attempt. Connector.Connect implementations can
+// use this to disable interactive operations (e.g. OAuth prompts) that
+// should not run in the background.
+func IsBackgroundReconnect(ctx context.Context) bool {
+	v, _ := ctx.Value(backgroundReconnectKey{}).(bool)
+	return v
+}
+
 // Connector creates new sessions for a Supervisor. Implementations are
 // transport-specific: stdio MCP, remote MCP, LSP stdio.
 type Connector interface {
@@ -462,8 +487,21 @@ func (s *Supervisor) tryRestart(ctx context.Context) bool {
 		}
 		s.mu.Unlock()
 
-		sess, err := s.connector.Connect(ctx)
+		sess, err := s.connector.Connect(withBackgroundReconnect(ctx))
 		if err != nil {
+			// A permanent error on reconnect (e.g. ErrAuthRequired from a
+			// server-side invalid_token) must not be retried: doing so would
+			// burn through the budget and mask the real failure. Symmetric
+			// with the shouldRestart check on the Wait() path.
+			if IsPermanent(err) {
+				log.Warn("supervisor: permanent error on reconnect; not retrying", "name", s.name, "error", err)
+				s.tracker.Fail(StateFailed, err)
+				if cb := s.policy.OnFailed; cb != nil {
+					cb(err)
+				}
+				s.signalDone()
+				return false
+			}
 			s.tracker.Fail(StateRestarting, err)
 			s.tracker.IncRestarts()
 			log.Warn("supervisor: restart failed", "name", s.name, "attempt", attempt+1, "error", err)