From ef5d57685cf6e79e641c2c7140bb07fbb33e8ae8 Mon Sep 17 00:00:00 2001 From: Charles Vien Date: Fri, 12 Jun 2026 22:08:41 -0700 Subject: [PATCH 1/4] surface real agent init errors and kill orphans --- .../claude/claude-agent.refresh.test.ts | 18 ++++++--- .../agent/src/adapters/claude/claude-agent.ts | 37 ++++++++++++++++--- packages/agent/src/gateway-models.test.ts | 29 ++++++++++++++- packages/agent/src/gateway-models.ts | 13 ++++++- .../src/services/agent/agent.ts | 8 +++- 5 files changed, 90 insertions(+), 15 deletions(-) diff --git a/packages/agent/src/adapters/claude/claude-agent.refresh.test.ts b/packages/agent/src/adapters/claude/claude-agent.refresh.test.ts index 3f6b26002..a70aab87d 100644 --- a/packages/agent/src/adapters/claude/claude-agent.refresh.test.ts +++ b/packages/agent/src/adapters/claude/claude-agent.refresh.test.ts @@ -1,4 +1,7 @@ -import type { AgentSideConnection } from "@agentclientprotocol/sdk"; +import { + type AgentSideConnection, + RequestError, +} from "@agentclientprotocol/sdk"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { POSTHOG_METHODS } from "../../acp-extensions"; import { Pushable } from "../../utils/streams"; @@ -15,6 +18,7 @@ type SdkQueryHandle = { setMcpServers: ReturnType; supportedCommands: ReturnType; initializationResult: ReturnType; + close: ReturnType; [Symbol.asyncIterator]: () => AsyncIterator; }; @@ -31,6 +35,7 @@ function makeQueryHandle(): SdkQueryHandle { setMcpServers: vi.fn().mockResolvedValue(undefined), supportedCommands: vi.fn().mockResolvedValue([]), initializationResult: vi.fn().mockImplementation(() => nextInitPromise), + close: vi.fn(), [Symbol.asyncIterator]: async function* () { /* never yields */ } as never, @@ -192,7 +197,7 @@ describe("ClaudeAcpAgent.extMethod refresh_session", () => { ).rejects.toThrow(/does not support MCP injection/); }); - it("throws when initialization of the new query times out", async () => { + it("throws a RequestError and closes the timed-out query so it cannot leak", async () => { vi.useFakeTimers(); try { const agent = makeAgent(); @@ -209,9 +214,12 @@ describe("ClaudeAcpAgent.extMethod refresh_session", () => { await vi.advanceTimersByTimeAsync(30_001); - await expect(promise).rejects.toThrow( - /Session refresh timed out for s-timeout/, - ); + // A RequestError (not a plain Error) is what survives the ACP layer + // instead of being collapsed into a generic "Internal error". + await expect(promise).rejects.toBeInstanceOf(RequestError); + await expect(promise).rejects.toThrow(/Session refresh timed out after/); + // The new query is closed so its CLI subprocess does not leak. + expect(createdQueries[0]?.close).toHaveBeenCalledTimes(1); } finally { vi.useRealTimers(); } diff --git a/packages/agent/src/adapters/claude/claude-agent.ts b/packages/agent/src/adapters/claude/claude-agent.ts index 6b1a5291b..b0a1a9453 100644 --- a/packages/agent/src/adapters/claude/claude-agent.ts +++ b/packages/agent/src/adapters/claude/claude-agent.ts @@ -1286,7 +1286,12 @@ export class ClaudeAcpAgent extends BaseAcpAgent { SESSION_VALIDATION_TIMEOUT_MS, ); if (result.result === "timeout") { - throw new Error(`Session refresh timed out for ${this.sessionId}`); + this.terminateQuery(newQuery, newAbortController); + throw new RequestError( + -32603, + `Session refresh timed out after ${SESSION_VALIDATION_TIMEOUT_MS}ms`, + { sessionId: this.sessionId }, + ); } // Re-fetch MCP tool metadata + slash commands — the server list changed. @@ -1467,6 +1472,21 @@ export class ClaudeAcpAgent extends BaseAcpAgent { } } + /** + * Terminate a query whose initialization failed or timed out: abort the + * controller (killing the CLI subprocess via the spawn signal) and close the + * SDK query so no further messages are read. Without this a timed-out session + * leaks an orphaned `claude` process that the retry loop then multiplies. + */ + private terminateQuery(sdkQuery: Query, controller: AbortController): void { + controller.abort(); + try { + sdkQuery.close(); + } catch { + // Query may already be closed. + } + } + private async createSession( params: { cwd: string; @@ -1655,8 +1675,10 @@ export class ClaudeAcpAgent extends BaseAcpAgent { SESSION_VALIDATION_TIMEOUT_MS, ); if (result.result === "timeout") { - throw new Error( - `Session ${forkSession ? "fork" : "resumption"} timed out for sessionId=${sessionId}`, + throw new RequestError( + -32603, + `Session ${forkSession ? "fork" : "resumption"} timed out after ${SESSION_VALIDATION_TIMEOUT_MS}ms`, + { sessionId, taskId, taskRunId: meta?.taskRunId }, ); } session.knownSlashCommands = collectKnownSlashCommands( @@ -1664,6 +1686,7 @@ export class ClaudeAcpAgent extends BaseAcpAgent { ); } catch (err) { settingsManager.dispose(); + this.terminateQuery(q, abortController); if ( err instanceof Error && err.message === "Query closed before response received" @@ -1718,9 +1741,10 @@ export class ClaudeAcpAgent extends BaseAcpAgent { try { const initResult = await initPromise; if (initResult.result === "timeout") { - settingsManager.dispose(); - throw new Error( - `Session initialization timed out for sessionId=${sessionId}`, + throw new RequestError( + -32603, + `Session initialization timed out after ${SESSION_VALIDATION_TIMEOUT_MS}ms`, + { sessionId, taskId, taskRunId: meta?.taskRunId }, ); } session.knownSlashCommands = collectKnownSlashCommands( @@ -1728,6 +1752,7 @@ export class ClaudeAcpAgent extends BaseAcpAgent { ); } catch (err) { settingsManager.dispose(); + this.terminateQuery(q, abortController); this.logger.error("Session initialization failed", { sessionId, taskId, diff --git a/packages/agent/src/gateway-models.test.ts b/packages/agent/src/gateway-models.test.ts index 37f354807..9b5d343f1 100644 --- a/packages/agent/src/gateway-models.test.ts +++ b/packages/agent/src/gateway-models.test.ts @@ -1,5 +1,6 @@ -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; import { + fetchGatewayModels, formatGatewayModelName, getClaudeModelRecency, isBlockedModelId, @@ -109,3 +110,29 @@ describe("getClaudeModelRecency", () => { ]); }); }); + +describe("fetchGatewayModels", () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("bounds the request with an abort signal and returns [] when it times out", async () => { + // Simulate a stalled gateway: the request rejects the way + // AbortSignal.timeout would once the deadline passes. fetchGatewayModels + // runs inside the Promise.all that gates session-init, so it must degrade + // to "no models" rather than hang. + const fetchMock = vi + .spyOn(globalThis, "fetch") + .mockRejectedValue( + new DOMException("The operation was aborted.", "TimeoutError"), + ); + + await expect( + fetchGatewayModels({ gatewayUrl: "https://gateway.timeout-test" }), + ).resolves.toEqual([]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const init = fetchMock.mock.calls[0]?.[1] as RequestInit | undefined; + expect(init?.signal).toBeInstanceOf(AbortSignal); + }); +}); diff --git a/packages/agent/src/gateway-models.ts b/packages/agent/src/gateway-models.ts index 82accd258..dfd9de016 100644 --- a/packages/agent/src/gateway-models.ts +++ b/packages/agent/src/gateway-models.ts @@ -51,6 +51,11 @@ type ModelsListResponse = const CACHE_TTL = 10 * 60 * 1000; // 10 minutes +// Bound the gateway /v1/models request so a stalled connection cannot hold up +// session init: this fetch runs inside the Promise.all that gates the SDK +// initialization timeout. On abort the callers fall through to `return []`. +const GATEWAY_FETCH_TIMEOUT_MS = 10_000; + let gatewayModelsCache: { models: GatewayModel[]; expiry: number; @@ -76,7 +81,9 @@ export async function fetchGatewayModels( const modelsUrl = `${gatewayUrl}/v1/models`; try { - const response = await fetch(modelsUrl); + const response = await fetch(modelsUrl, { + signal: AbortSignal.timeout(GATEWAY_FETCH_TIMEOUT_MS), + }); if (!response.ok) { return []; @@ -138,7 +145,9 @@ export async function fetchModelsList( try { const modelsUrl = `${gatewayUrl}/v1/models`; - const response = await fetch(modelsUrl); + const response = await fetch(modelsUrl, { + signal: AbortSignal.timeout(GATEWAY_FETCH_TIMEOUT_MS), + }); if (!response.ok) { return []; } diff --git a/packages/workspace-server/src/services/agent/agent.ts b/packages/workspace-server/src/services/agent/agent.ts index 7b1b59468..c1fe9a887 100644 --- a/packages/workspace-server/src/services/agent/agent.ts +++ b/packages/workspace-server/src/services/agent/agent.ts @@ -921,10 +921,16 @@ When creating pull requests, add the following footer at the end of the PR descr ); return this.getOrCreateSession(config, isReconnect, true); } + // When the in-process ACP layer masks a thrown error as a generic + // "Internal error", the real text survives in `data.details`. Surface it + // here (host-side, before the tRPC boundary drops `data`) so the exported + // log names the actual cause. + const maskedDetail = (err as { data?: { details?: unknown } })?.data + ?.details; this.log.error( `Failed to ${isReconnect ? "reconnect" : "create"} session${ isRetry ? " after retry" : "" - }`, + }${typeof maskedDetail === "string" && maskedDetail ? `: ${maskedDetail}` : ""}`, err, ); // Non-auth reconnect failure on first attempt: fall back to a fresh session. From 17644d619b103f25e8e8b15bc8063638fc35164e Mon Sep 17 00:00:00 2001 From: Charles Vien Date: Fri, 12 Jun 2026 23:22:58 -0700 Subject: [PATCH 2/4] add init-failure tests and address review nits --- .../claude/claude-agent.resume-model.test.ts | 58 +++++++++++++++++-- .../agent/src/adapters/claude/claude-agent.ts | 7 +-- packages/agent/src/gateway-models.test.ts | 42 ++++++++------ packages/agent/src/gateway-models.ts | 5 +- .../src/services/agent/agent.ts | 9 ++- 5 files changed, 89 insertions(+), 32 deletions(-) diff --git a/packages/agent/src/adapters/claude/claude-agent.resume-model.test.ts b/packages/agent/src/adapters/claude/claude-agent.resume-model.test.ts index 65597c4b7..bec1885f8 100644 --- a/packages/agent/src/adapters/claude/claude-agent.resume-model.test.ts +++ b/packages/agent/src/adapters/claude/claude-agent.resume-model.test.ts @@ -10,20 +10,24 @@ type SdkQueryHandle = { setMcpServers: ReturnType; supportedCommands: ReturnType; initializationResult: ReturnType; + close: ReturnType; [Symbol.asyncIterator]: () => AsyncIterator; }; +let nextInitPromise: Promise = Promise.resolve({ + result: "success", + commands: [], + models: [], +}); + function makeQueryHandle(): SdkQueryHandle { return { interrupt: vi.fn().mockResolvedValue(undefined), setModel: vi.fn().mockResolvedValue(undefined), setMcpServers: vi.fn().mockResolvedValue(undefined), supportedCommands: vi.fn().mockResolvedValue([]), - initializationResult: vi.fn().mockResolvedValue({ - result: "success", - commands: [], - models: [], - }), + initializationResult: vi.fn().mockImplementation(() => nextInitPromise), + close: vi.fn(), [Symbol.asyncIterator]: async function* () { /* never yields */ } as never, @@ -101,6 +105,11 @@ afterAll(() => { describe("ClaudeAcpAgent session model on resume", () => { beforeEach(() => { createdQueries.length = 0; + nextInitPromise = Promise.resolve({ + result: "success", + commands: [], + models: [], + }); // No gateway: fetchGatewayModels returns [] and the requested model is // kept as a custom option — mirrors the gateway-outage failure mode. delete process.env.ANTHROPIC_BASE_URL; @@ -157,4 +166,43 @@ describe("ClaudeAcpAgent session model on resume", () => { expect(createdQueries).toHaveLength(1); expect(createdQueries[0].setModel).not.toHaveBeenCalled(); }); + + // The timeout *message* (RequestError "... timed out after ...") is covered + // by claude-agent.refresh.test.ts. Here we cover the leak fix on the + // new-session and resume paths: any init failure must close the query so the + // CLI subprocess can't leak and be multiplied by the retry loop. + it("closes the query and rethrows when new-session init fails", async () => { + const failedInit = Promise.reject(new Error("init boom")); + failedInit.catch(() => {}); + nextInitPromise = failedInit; + const agent = makeAgent(); + + await expect( + agent.newSession({ + cwd, + mcpServers: [], + _meta: { taskRunId: "run-init-fail-new" }, + }), + ).rejects.toThrow(/init boom/); + + expect(createdQueries[0]?.close).toHaveBeenCalledTimes(1); + }); + + it("closes the query and rethrows when resume init fails", async () => { + const failedInit = Promise.reject(new Error("resume boom")); + failedInit.catch(() => {}); + nextInitPromise = failedInit; + const agent = makeAgent(); + + await expect( + agent.resumeSession({ + sessionId: "0197a000-0000-7000-8000-0000000000ff", + cwd, + mcpServers: [], + _meta: { taskRunId: "run-init-fail-resume" }, + }), + ).rejects.toThrow(/resume boom/); + + expect(createdQueries[0]?.close).toHaveBeenCalledTimes(1); + }); }); diff --git a/packages/agent/src/adapters/claude/claude-agent.ts b/packages/agent/src/adapters/claude/claude-agent.ts index b0a1a9453..8df465ae2 100644 --- a/packages/agent/src/adapters/claude/claude-agent.ts +++ b/packages/agent/src/adapters/claude/claude-agent.ts @@ -1473,10 +1473,9 @@ export class ClaudeAcpAgent extends BaseAcpAgent { } /** - * Terminate a query whose initialization failed or timed out: abort the - * controller (killing the CLI subprocess via the spawn signal) and close the - * SDK query so no further messages are read. Without this a timed-out session - * leaks an orphaned `claude` process that the retry loop then multiplies. + * Without this, a timed-out session leaks an orphaned `claude` process that + * the retry loop then multiplies. Aborting the controller kills the + * subprocess via the spawn signal; closing the query stops further reads. */ private terminateQuery(sdkQuery: Query, controller: AbortController): void { controller.abort(); diff --git a/packages/agent/src/gateway-models.test.ts b/packages/agent/src/gateway-models.test.ts index 9b5d343f1..ee83e138b 100644 --- a/packages/agent/src/gateway-models.test.ts +++ b/packages/agent/src/gateway-models.test.ts @@ -1,6 +1,7 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import { fetchGatewayModels, + fetchModelsList, formatGatewayModelName, getClaudeModelRecency, isBlockedModelId, @@ -111,28 +112,33 @@ describe("getClaudeModelRecency", () => { }); }); -describe("fetchGatewayModels", () => { +describe("gateway model fetch timeout", () => { afterEach(() => { vi.restoreAllMocks(); }); - it("bounds the request with an abort signal and returns [] when it times out", async () => { - // Simulate a stalled gateway: the request rejects the way - // AbortSignal.timeout would once the deadline passes. fetchGatewayModels - // runs inside the Promise.all that gates session-init, so it must degrade - // to "no models" rather than hang. - const fetchMock = vi - .spyOn(globalThis, "fetch") - .mockRejectedValue( - new DOMException("The operation was aborted.", "TimeoutError"), - ); + // Both fetches run inside the Promise.all that gates session-init, so a + // stalled gateway must degrade to "no models" rather than hang. + it.each([ + { name: "fetchGatewayModels", fn: fetchGatewayModels }, + { name: "fetchModelsList", fn: fetchModelsList }, + ])( + "$name bounds the request and returns [] when it times out", + async ({ fn }) => { + // Reject the way AbortSignal.timeout would once the deadline passes. + const fetchMock = vi + .spyOn(globalThis, "fetch") + .mockRejectedValue( + new DOMException("The operation was aborted.", "TimeoutError"), + ); - await expect( - fetchGatewayModels({ gatewayUrl: "https://gateway.timeout-test" }), - ).resolves.toEqual([]); + await expect( + fn({ gatewayUrl: "https://gateway.timeout-test" }), + ).resolves.toEqual([]); - expect(fetchMock).toHaveBeenCalledTimes(1); - const init = fetchMock.mock.calls[0]?.[1] as RequestInit | undefined; - expect(init?.signal).toBeInstanceOf(AbortSignal); - }); + expect(fetchMock).toHaveBeenCalledTimes(1); + const init = fetchMock.mock.calls[0]?.[1] as RequestInit | undefined; + expect(init?.signal).toBeInstanceOf(AbortSignal); + }, + ); }); diff --git a/packages/agent/src/gateway-models.ts b/packages/agent/src/gateway-models.ts index dfd9de016..cacc55e51 100644 --- a/packages/agent/src/gateway-models.ts +++ b/packages/agent/src/gateway-models.ts @@ -52,8 +52,9 @@ type ModelsListResponse = const CACHE_TTL = 10 * 60 * 1000; // 10 minutes // Bound the gateway /v1/models request so a stalled connection cannot hold up -// session init: this fetch runs inside the Promise.all that gates the SDK -// initialization timeout. On abort the callers fall through to `return []`. +// session init: this fetch runs inside the Promise.all that gates the 30s SDK +// initialization timeout, so it must resolve well within that window. On abort +// the callers fall through to `return []`. const GATEWAY_FETCH_TIMEOUT_MS = 10_000; let gatewayModelsCache: { diff --git a/packages/workspace-server/src/services/agent/agent.ts b/packages/workspace-server/src/services/agent/agent.ts index c1fe9a887..dc8f8d66c 100644 --- a/packages/workspace-server/src/services/agent/agent.ts +++ b/packages/workspace-server/src/services/agent/agent.ts @@ -927,10 +927,13 @@ When creating pull requests, add the following footer at the end of the PR descr // log names the actual cause. const maskedDetail = (err as { data?: { details?: unknown } })?.data ?.details; + const detailSuffix = + typeof maskedDetail === "string" && maskedDetail + ? `: ${maskedDetail}` + : ""; + const action = isReconnect ? "reconnect" : "create"; this.log.error( - `Failed to ${isReconnect ? "reconnect" : "create"} session${ - isRetry ? " after retry" : "" - }${typeof maskedDetail === "string" && maskedDetail ? `: ${maskedDetail}` : ""}`, + `Failed to ${action} session${isRetry ? " after retry" : ""}${detailSuffix}`, err, ); // Non-auth reconnect failure on first attempt: fall back to a fresh session. From 975837d61afbbeb5fe104a3da560f84dc77fec1f Mon Sep 17 00:00:00 2001 From: Charles Vien Date: Fri, 12 Jun 2026 22:31:14 -0700 Subject: [PATCH 3/4] auto-recover org projects after failed auth fetch --- packages/core/src/auth/auth.test.ts | 130 ++++++++++++++++++++++++ packages/core/src/auth/auth.ts | 149 +++++++++++++++++++++++----- 2 files changed, 255 insertions(+), 24 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index bff2fc391..1c850850e 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -837,6 +837,136 @@ describe("AuthService", () => { }); }); + describe("project-less recovery", () => { + function stubOrgFetch(state: { succeeds: boolean; orgCalls: number }) { + vi.stubGlobal( + "fetch", + vi.fn(async (input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + + if (url.includes("/api/users/@me/")) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + uuid: "user-1", + organization: { id: "org-1" }, + }), + } as unknown as Response; + } + + if (/\/api\/organizations\/[^/]+\/$/.test(url)) { + state.orgCalls++; + if (!state.succeeds) { + throw new TypeError("fetch failed"); + } + return { + ok: true, + json: vi.fn().mockResolvedValue({ + name: "Org 1", + teams: [ + { id: 42, name: "Project 42" }, + { id: 84, name: "Project 84" }, + ], + }), + } as unknown as Response; + } + + return { + ok: true, + json: vi.fn().mockResolvedValue({ has_access: true }), + } as unknown as Response; + }) as unknown as typeof fetch, + ); + } + + it("authenticates without a project when a scoped org fails to load, then recovers and restores the previous project on reconnect", async () => { + seedStoredSession({ + selectedProjectId: 84, + scopeVersion: OAUTH_SCOPE_VERSION - 1, + }); + await service.initialize(); + expect(service.getState()).toMatchObject({ + status: "anonymous", + needsScopeReauth: true, + currentProjectId: 84, + }); + + const fetchState = { succeeds: false, orgCalls: 0 }; + stubOrgFetch(fetchState); + oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); + + await service.login("us"); + + expect(service.getState()).toMatchObject({ + status: "authenticated", + currentProjectId: null, + orgProjectsMap: { "org-1": { projects: [] } }, + }); + + await new Promise((r) => setTimeout(r, 0)); + + fetchState.succeeds = true; + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); + emitStatus(true); + + await vi.waitFor(() => { + expect(service.getState().currentProjectId).toBe(84); + }); + expect(service.getState().orgProjectsMap).toMatchObject({ + "org-1": { + orgName: "Org 1", + projects: [ + { id: 42, name: "Project 42" }, + { id: 84, name: "Project 84" }, + ], + }, + }); + }); + + it("recovers org/projects on power-monitor resume", async () => { + const fetchState = { succeeds: false, orgCalls: 0 }; + stubOrgFetch(fetchState); + oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); + + await service.login("us"); + expect(service.getState().currentProjectId).toBeNull(); + + await new Promise((r) => setTimeout(r, 0)); + + fetchState.succeeds = true; + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); + getResumeHandler()(); + + await vi.waitFor(() => { + expect(service.getState().currentProjectId).toBe(42); + }); + }); + + it("does not attempt recovery when the token grants no scoped organizations", async () => { + const fetchState = { succeeds: true, orgCalls: 0 }; + stubOrgFetch(fetchState); + oauthFlow.startFlow.mockResolvedValue( + mockTokenResponse({ scopedOrgs: [] }), + ); + + await service.login("us"); + + expect(service.getState()).toMatchObject({ + status: "authenticated", + currentProjectId: null, + orgProjectsMap: {}, + }); + + emitStatus(true); + await new Promise((r) => setTimeout(r, 10)); + + expect(fetchState.orgCalls).toBe(0); + expect(service.getState().currentProjectId).toBeNull(); + }); + }); + describe("switchOrg", () => { const twoOrgs = { "org-1": { diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 8d023d9fd..349d57a05 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -57,6 +57,7 @@ interface InMemorySession { orgProjectsMap: OrgProjectsMap; currentOrgId: string | null; currentProjectId: number | null; + orgProjectsIncomplete: boolean; } interface StoredSessionInput { @@ -338,6 +339,7 @@ export class AuthService extends TypedEventEmitter { orgProjectsMap: next.orgProjectsMap, currentOrgId: next.currentOrgId, currentProjectId: next.currentProjectId, + orgProjectsIncomplete: false, }; this.persistProjectPreference(this.session); @@ -557,12 +559,13 @@ export class AuthService extends TypedEventEmitter { tokenResponse.access_token, options.cloudRegion, ); - const orgProjectsMap = await this.buildOrgProjectsMap( - tokenResponse.access_token, - options.cloudRegion, - scopedOrgIds, - this.session?.orgProjectsMap ?? {}, - ); + const { map: orgProjectsMap, incomplete: orgProjectsIncomplete } = + await this.buildOrgProjectsMap( + tokenResponse.access_token, + options.cloudRegion, + scopedOrgIds, + this.session?.orgProjectsMap ?? {}, + ); const lastPrefs = accountKey ? this.authPreference.get(accountKey, options.cloudRegion) : null; @@ -583,6 +586,7 @@ export class AuthService extends TypedEventEmitter { orgProjectsMap, currentOrgId, currentProjectId, + orgProjectsIncomplete, }; return session; @@ -592,43 +596,48 @@ export class AuthService extends TypedEventEmitter { cloudRegion: CloudRegion, orgIds: string[], previousMap: OrgProjectsMap, - ): Promise { + ): Promise<{ map: OrgProjectsMap; incomplete: boolean }> { + let incomplete = false; const entries = await Promise.all( orgIds.map(async (orgId): Promise<[string, OrgProjects]> => { - const result = await this.fetchOrgWithProjects( + const { org, transient } = await this.fetchOrgWithProjects( accessToken, cloudRegion, orgId, ); - if (result) { - return [orgId, result]; + if (org) { + return [orgId, org]; } - return [ - orgId, - previousMap[orgId] ?? { orgName: "(unknown)", projects: [] }, - ]; + const fallback = previousMap[orgId] ?? { + orgName: "(unknown)", + projects: [], + }; + if (transient && fallback.projects.length === 0) { + incomplete = true; + } + return [orgId, fallback]; }), ); - return Object.fromEntries(entries); + return { map: Object.fromEntries(entries), incomplete }; } private async fetchOrgProjects( accessToken: string, cloudRegion: CloudRegion, orgId: string, ): Promise<{ id: number; name: string }[] | null> { - const result = await this.fetchOrgWithProjects( + const { org } = await this.fetchOrgWithProjects( accessToken, cloudRegion, orgId, ); - return result?.projects ?? null; + return org?.projects ?? null; } private async fetchOrgWithProjects( accessToken: string, cloudRegion: CloudRegion, orgId: string, - ): Promise { + ): Promise<{ org: OrgProjects | null; transient: boolean }> { for ( let attempt = 0; attempt < AuthService.ORG_FETCH_MAX_ATTEMPTS; @@ -640,10 +649,10 @@ export class AuthService extends TypedEventEmitter { orgId, ); if (result.ok) { - return result.data; + return { org: result.data, transient: false }; } if (!result.retryable) { - break; + return { org: null, transient: false }; } const isLastAttempt = attempt === AuthService.ORG_FETCH_MAX_ATTEMPTS - 1; @@ -658,7 +667,7 @@ export class AuthService extends TypedEventEmitter { await sleepWithBackoff(attempt, AuthService.REFRESH_BACKOFF); } - return null; + return { org: null, transient: true }; } private async fetchOrgWithProjectsOnce( accessToken: string, @@ -744,21 +753,27 @@ export class AuthService extends TypedEventEmitter { needsScopeReauth: false, }); await this.updateCodeAccessFromSession(); + + if (session.orgProjectsIncomplete) { + void this.refreshOrgProjects(); + } } private persistSession(input: { refreshToken: string; cloudRegion: CloudRegion; selectedProjectId: number | null; }): void { + const priorSelected = + this.authSession.getCurrent()?.selectedProjectId ?? null; this.authSession.saveCurrent({ refreshTokenEncrypted: this.cipher.encrypt(input.refreshToken), cloudRegion: input.cloudRegion, - selectedProjectId: input.selectedProjectId, + selectedProjectId: input.selectedProjectId ?? priorSelected, scopeVersion: OAUTH_SCOPE_VERSION, }); } private persistProjectPreference(session: InMemorySession): void { - if (!session.accountKey) { + if (!session.accountKey || session.currentProjectId === null) { return; } @@ -885,12 +900,14 @@ export class AuthService extends TypedEventEmitter { } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; + private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; private static readonly REFRESH_BACKOFF: BackoffOptions = { initialDelayMs: 1_000, maxDelayMs: 5_000, multiplier: 2, }; private recoveryPromise: Promise | null = null; + private orgProjectsRefreshPromise: Promise | null = null; private connectivityUnsubscribe: (() => void) | null = null; private resumeUnsubscribe: (() => void) | null = null; @postConstruct() @@ -929,7 +946,12 @@ export class AuthService extends TypedEventEmitter { }; } private attemptSessionRecovery(): void { - if (this.session) return; + if (this.session) { + if (this.session.orgProjectsIncomplete) { + void this.refreshOrgProjects(); + } + return; + } if (this.recoveryPromise) return; const stored = this.authSession.getCurrent(); @@ -948,6 +970,85 @@ export class AuthService extends TypedEventEmitter { }); } + private refreshOrgProjects(): Promise { + if (this.orgProjectsRefreshPromise) { + return this.orgProjectsRefreshPromise; + } + + this.orgProjectsRefreshPromise = this.doRefreshOrgProjects() + .catch((error) => { + this.logger.warn("Org/projects recovery failed", { error }); + }) + .finally(() => { + this.orgProjectsRefreshPromise = null; + }); + return this.orgProjectsRefreshPromise; + } + + private async doRefreshOrgProjects(): Promise { + for ( + let attempt = 0; + attempt < AuthService.ORG_RECOVERY_MAX_ATTEMPTS; + attempt++ + ) { + if (!this.session?.orgProjectsIncomplete) return; + if (!this.connectivity.getStatus().isOnline) return; + + let session: InMemorySession; + try { + session = await this.ensureValidSession(); + } catch (error) { + this.logger.warn("Org/projects recovery aborted: session invalid", { + error, + }); + return; + } + + if (!session.orgProjectsIncomplete) return; + + const orgIds = Object.keys(session.orgProjectsMap); + const { map, incomplete } = await this.buildOrgProjectsMap( + session.accessToken, + session.cloudRegion, + orgIds, + session.orgProjectsMap, + ); + + if (!incomplete) { + const lastPrefs = session.accountKey + ? this.authPreference.get(session.accountKey, session.cloudRegion) + : null; + const storedSelected = + this.authSession.getCurrent()?.selectedProjectId ?? null; + const currentProjectId = pickInitialProjectId({ + orgProjectsMap: map, + currentOrgId: session.currentOrgId, + preferredProjectId: + session.currentProjectId ?? + storedSelected ?? + lastPrefs?.lastSelectedProjectId ?? + null, + lastSelectedOrgId: lastPrefs?.lastSelectedOrgId ?? null, + }); + this.commitSessionState(session, { + orgProjectsMap: map, + currentOrgId: session.currentOrgId, + currentProjectId, + }); + this.logger.info( + "Recovered organizations/projects after incomplete sync", + ); + return; + } + + const isLastAttempt = + attempt === AuthService.ORG_RECOVERY_MAX_ATTEMPTS - 1; + if (isLastAttempt) break; + + await sleepWithBackoff(attempt, AuthService.REFRESH_BACKOFF); + } + } + private updateState(partial: Partial): void { this.state = { ...this.state, From 210a0aed29388b573f3339db09aff5c4cd5c2d6b Mon Sep 17 00:00:00 2001 From: Charles Vien Date: Fri, 12 Jun 2026 23:37:32 -0700 Subject: [PATCH 4/4] harden org/projects recovery and cover edge cases --- packages/core/src/auth/auth.test.ts | 230 ++++++++++++++++++++++++---- packages/core/src/auth/auth.ts | 6 + 2 files changed, 203 insertions(+), 33 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 1c850850e..769db90c0 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -879,69 +879,233 @@ describe("AuthService", () => { ); } - it("authenticates without a project when a scoped org fails to load, then recovers and restores the previous project on reconnect", async () => { - seedStoredSession({ - selectedProjectId: 84, - scopeVersion: OAUTH_SCOPE_VERSION - 1, - }); - await service.initialize(); - expect(service.getState()).toMatchObject({ - status: "anonymous", - needsScopeReauth: true, - currentProjectId: 84, + // Let the fire-and-forget refreshOrgProjects() that syncAuthenticatedSession + // kicks settle (it no-ops while offline) so orgProjectsRefreshPromise is cleared + // before we trigger recovery explicitly. + const flushPostSyncKick = () => new Promise((r) => setTimeout(r, 0)); + + it.each([ + { + trigger: "connectivity online", + seedProjectId: 84, + expectProjectId: 84, + }, + { + trigger: "power-monitor resume", + seedProjectId: null, + expectProjectId: 42, + }, + ])( + "authenticates without a project on transient org failure, then recovers via $trigger", + async ({ trigger, seedProjectId, expectProjectId }) => { + if (seedProjectId !== null) { + seedStoredSession({ + selectedProjectId: seedProjectId, + scopeVersion: OAUTH_SCOPE_VERSION - 1, + }); + await service.initialize(); + } + + const fetchState = { succeeds: false, orgCalls: 0 }; + stubOrgFetch(fetchState); + oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); + + await service.login("us"); + + expect(service.getState()).toMatchObject({ + status: "authenticated", + currentProjectId: null, + orgProjectsMap: { "org-1": { projects: [] } }, + }); + + await flushPostSyncKick(); + + fetchState.succeeds = true; + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); + if (trigger === "connectivity online") { + emitStatus(true); + } else { + getResumeHandler()(); + } + + await vi.waitFor(() => { + expect(service.getState().currentProjectId).toBe(expectProjectId); + }); + expect(service.getState().orgProjectsMap).toMatchObject({ + "org-1": { + orgName: "Org 1", + projects: [ + { id: 42, name: "Project 42" }, + { id: 84, name: "Project 84" }, + ], + }, + }); + }, + ); + + it("retries across multiple recovery passes before succeeding", async () => { + let orgCalls = 0; + let succeedFromCall = Number.POSITIVE_INFINITY; + vi.stubGlobal( + "fetch", + vi.fn(async (input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + if (url.includes("/api/users/@me/")) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + uuid: "user-1", + organization: { id: "org-1" }, + }), + } as unknown as Response; + } + if (/\/api\/organizations\/[^/]+\/$/.test(url)) { + orgCalls++; + if (orgCalls < succeedFromCall) { + throw new TypeError("fetch failed"); + } + return { + ok: true, + json: vi.fn().mockResolvedValue({ + name: "Org 1", + teams: [{ id: 42, name: "Project 42" }], + }), + } as unknown as Response; + } + return { + ok: true, + json: vi.fn().mockResolvedValue({ has_access: true }), + } as unknown as Response; + }) as unknown as typeof fetch, + ); + oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); + + await service.login("us"); + expect(service.getState().currentProjectId).toBeNull(); + await flushPostSyncKick(); + + // Two recovery passes fail (3 org-fetch attempts each); the third succeeds. + orgCalls = 0; + succeedFromCall = 7; + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); + emitStatus(true); + + await vi.waitFor(() => { + expect(service.getState().currentProjectId).toBe(42); }); + expect(orgCalls).toBe(7); + }); + it("stays project-less without crashing when recovery exhausts every attempt", async () => { const fetchState = { succeeds: false, orgCalls: 0 }; stubOrgFetch(fetchState); oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); await service.login("us"); + await flushPostSyncKick(); + fetchState.orgCalls = 0; + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); + emitStatus(true); + + // 5 recovery passes x 3 org-fetch attempts each. + await vi.waitFor(() => { + expect(fetchState.orgCalls).toBe(15); + }); expect(service.getState()).toMatchObject({ status: "authenticated", currentProjectId: null, - orgProjectsMap: { "org-1": { projects: [] } }, }); + expect(mockLogger.warn).toHaveBeenCalledWith( + "Org/projects recovery exhausted retries", + ); + }); + + it("collapses concurrent recovery triggers into a single run", async () => { + let orgCalls = 0; + let hangRecovery = false; + let releaseOrg!: () => void; + const orgGate = new Promise((resolve) => { + releaseOrg = resolve; + }); + vi.stubGlobal( + "fetch", + vi.fn(async (input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + if (url.includes("/api/users/@me/")) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + uuid: "user-1", + organization: { id: "org-1" }, + }), + } as unknown as Response; + } + if (/\/api\/organizations\/[^/]+\/$/.test(url)) { + orgCalls++; + if (!hangRecovery) { + throw new TypeError("fetch failed"); + } + await orgGate; + return { + ok: true, + json: vi.fn().mockResolvedValue({ + name: "Org 1", + teams: [{ id: 42, name: "Project 42" }], + }), + } as unknown as Response; + } + return { + ok: true, + json: vi.fn().mockResolvedValue({ has_access: true }), + } as unknown as Response; + }) as unknown as typeof fetch, + ); + oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); + vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); - await new Promise((r) => setTimeout(r, 0)); + await service.login("us"); + expect(service.getState().currentProjectId).toBeNull(); + await flushPostSyncKick(); - fetchState.succeeds = true; + orgCalls = 0; + hangRecovery = true; vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); emitStatus(true); + emitStatus(true); + getResumeHandler()(); await vi.waitFor(() => { - expect(service.getState().currentProjectId).toBe(84); + expect(orgCalls).toBe(1); }); - expect(service.getState().orgProjectsMap).toMatchObject({ - "org-1": { - orgName: "Org 1", - projects: [ - { id: 42, name: "Project 42" }, - { id: 84, name: "Project 84" }, - ], - }, + await flushPostSyncKick(); + expect(orgCalls).toBe(1); + + releaseOrg(); + await vi.waitFor(() => { + expect(service.getState().currentProjectId).toBe(42); }); }); - it("recovers org/projects on power-monitor resume", async () => { + it("preserves the stored project while project-less so recovery can restore it", async () => { + seedStoredSession({ + selectedProjectId: 84, + scopeVersion: OAUTH_SCOPE_VERSION - 1, + }); + await service.initialize(); + const fetchState = { succeeds: false, orgCalls: 0 }; stubOrgFetch(fetchState); oauthFlow.startFlow.mockResolvedValue(mockTokenResponse()); vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: false }); await service.login("us"); - expect(service.getState().currentProjectId).toBeNull(); - await new Promise((r) => setTimeout(r, 0)); - - fetchState.succeeds = true; - vi.mocked(connectivity.getStatus).mockReturnValue({ isOnline: true }); - getResumeHandler()(); - - await vi.waitFor(() => { - expect(service.getState().currentProjectId).toBe(42); - }); + expect(service.getState().currentProjectId).toBeNull(); + expect(sessionPort.getCurrent()?.selectedProjectId).toBe(84); }); it("does not attempt recovery when the token grants no scoped organizations", async () => { diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 349d57a05..28045f2ef 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -1014,6 +1014,10 @@ export class AuthService extends TypedEventEmitter { session.orgProjectsMap, ); + // The session may have been replaced (logout, re-login) while the fetch + // was in flight; committing the stale one would resurrect it. + if (this.session !== session) return; + if (!incomplete) { const lastPrefs = session.accountKey ? this.authPreference.get(session.accountKey, session.cloudRegion) @@ -1047,6 +1051,8 @@ export class AuthService extends TypedEventEmitter { await sleepWithBackoff(attempt, AuthService.REFRESH_BACKOFF); } + + this.logger.warn("Org/projects recovery exhausted retries"); } private updateState(partial: Partial): void {