From b73bb154150378f5c62474eafaf7220e49fa182c Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 19 Dec 2024 12:34:18 -0800 Subject: [PATCH] Attempt to re-establish session connection after unexpected disconnect (#5832) This change adds resiliency to the WebSocket connections that Positron uses to send and receive kernel messages. We were already detecting unexpected connection drops and using them to recover from server crashes. However, we've seen a few reports in the wild of these connection drops happening even when everything else is healthy. The improvement here is as follows: - If the session appears to be running, and the server is also running, we try to reestablish the connection. If we can, it is seamlessly resumed. Because messages that aren't delivered are queued on the supervisor side, we shouldn't miss any. - If the connection cannot be re-established, the user is shown a message and the session is marked offline. This prevents the session from looking inexplicably frozen. To help test this change without leaving Positron running for a day or more, I've added a development-only reconnect command that disconnects the socket for the active console session and lets the reconnect logic kick in to bring it back online. Addresses https://github.com/posit-dev/positron/issues/5788. ### QA Notes If you can't reproduce the original issue, you can use a development build and this command. image Since reconnect is meant to be seamless, check that the session can still run code after reconnecting, and check the logs to ensure that the reconnect sequence happened as expected. You should see something like this: ``` Session 'R 4.3.3' disconnected while in state 'idle'. This is unexpected; checking server status. Kallichore server PID 75482 is still running The server is still running; attempting to reconnect to session r-e6137faa Successfully restored connection to r-e6137faa ``` --- extensions/positron-supervisor/package.json | 7 ++ .../positron-supervisor/package.nls.json | 3 +- .../src/KallichoreAdapterApi.ts | 80 ++++++++++++++++--- .../src/KallichoreSession.ts | 20 ++++- 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/extensions/positron-supervisor/package.json b/extensions/positron-supervisor/package.json index 1e55ea4d228..41e3b39fdb8 100644 --- a/extensions/positron-supervisor/package.json +++ b/extensions/positron-supervisor/package.json @@ -83,6 +83,13 @@ "category": "%command.positron.supervisor.category%", "title": "%command.showKernelSupervisorLog.title%", "shortTitle": "%command.showKernelSupervisorLog.title%" + }, + { + "command": "positron.supervisor.reconnectSession", + "category": "%command.positron.supervisor.category%", + "title": "%command.reconnectSession.title%", + "shortTitle": "%command.reconnectSession.title%", + "enablement": "isDevelopment" } ] }, diff --git a/extensions/positron-supervisor/package.nls.json b/extensions/positron-supervisor/package.nls.json index 8f4b97ae176..b37e03e37a8 100644 --- a/extensions/positron-supervisor/package.nls.json +++ b/extensions/positron-supervisor/package.nls.json @@ -13,5 +13,6 @@ "configuration.attachOnStartup.description": "Run before starting up Jupyter kernel (when supported)", "configuration.sleepOnStartup.description": "Sleep for n seconds before starting up Jupyter kernel (when supported)", "command.positron.supervisor.category": "Kernel Supervisor", - "command.showKernelSupervisorLog.title": "Show the Kernel Supervisor Log" + "command.showKernelSupervisorLog.title": "Show the Kernel Supervisor Log", + "command.reconnectSession.title": "Reconnect the current session" } diff --git a/extensions/positron-supervisor/src/KallichoreAdapterApi.ts b/extensions/positron-supervisor/src/KallichoreAdapterApi.ts index 358f8d3f0ba..a9f88c46143 100644 --- a/extensions/positron-supervisor/src/KallichoreAdapterApi.ts +++ b/extensions/positron-supervisor/src/KallichoreAdapterApi.ts @@ -13,7 +13,7 @@ import { findAvailablePort } from './PortFinder'; import { KallichoreAdapterApi } from './positron-supervisor'; import { JupyterKernelExtra, JupyterKernelSpec, JupyterLanguageRuntimeSession } from './jupyter-adapter'; import { KallichoreSession } from './KallichoreSession'; -import { Barrier, PromiseHandles } from './async'; +import { Barrier, PromiseHandles, withTimeout } from './async'; import { LogStreamer } from './LogStreamer'; import { createUniqueId, summarizeHttpError } from './util'; @@ -93,6 +93,10 @@ export class KCApi implements KallichoreAdapterApi { this._log.appendLine(`Failed to start Kallichore server: ${err}`); }); } + + _context.subscriptions.push(vscode.commands.registerCommand('positron.supervisor.reconnectSession', () => { + this.reconnectActiveSession(); + })); } /** @@ -216,7 +220,7 @@ export class KCApi implements KallichoreAdapterApi { // Start the server in a new terminal this._log.appendLine(`Starting Kallichore server ${shellPath} on port ${port}`); - const terminal = vscode.window.createTerminal({ + const terminal = vscode.window.createTerminal({ name: 'Kallichore', shellPath: wrapperPath, shellArgs: [ @@ -231,7 +235,7 @@ export class KCApi implements KallichoreAdapterApi { message: `*** Kallichore Server (${shellPath}) ***`, hideFromUser: !showTerminal, isTransient: false - }); + } satisfies vscode.TerminalOptions); // Flag to track if the terminal exited before the start barrier opened let exited = false; @@ -489,15 +493,31 @@ export class KCApi implements KallichoreAdapterApi { * @param session The session to add the disconnect handler to */ private addDisconnectHandler(session: KallichoreSession) { - session.disconnected.event(async (state: positron.RuntimeState) => { + this._disposables.push(session.disconnected.event(async (state: positron.RuntimeState) => { if (state !== positron.RuntimeState.Exited) { // The websocket disconnected while the session was still // running. This could signal a problem with the supervisor; we // should see if it's still running. this._log.appendLine(`Session '${session.metadata.sessionName}' disconnected while in state '${state}'. This is unexpected; checking server status.`); - await this.testServerExited(); + + // If the server did not exit, and the session also appears to + // still be running, try to reconnect the websocket. It's + // possible the connection just got dropped or interrupted. + const exited = await this.testServerExited(); + if (!exited) { + this._log.appendLine(`The server is still running; attempting to reconnect to session ${session.metadata.sessionId}`); + try { + await withTimeout(session.connect(), 2000, `Timed out reconnecting to session ${session.metadata.sessionId}`); + this._log.appendLine(`Successfully restored connection to ${session.metadata.sessionId}`); + } catch (err) { + // The session could not be reconnected; mark it as + // offline and explain to the user what happened. + session.markOffline('Lost connection to the session WebSocket event stream and could not restore it: ' + err); + vscode.window.showErrorMessage(vscode.l10n.t('Unable to re-establish connection to {0}: {1}', session.metadata.sessionName, err)); + } + } } - }); + })); } /** @@ -510,14 +530,16 @@ export class KCApi implements KallichoreAdapterApi { * handle the case where the server process is running but it's become * unresponsive. * - * @returns A promise that resolves when the server has been confirmed to be - * running or has been restarted. + * @returns A promise that resolves when the server has been confirmed to + * be running or has been restarted. Resolves with `true` if the server did + * in fact exit, `false` otherwise. */ - private async testServerExited() { + private async testServerExited(): Promise { // If we're currently starting, it doesn't make sense to test the // server status since we're already in the process of starting it. if (this._starting) { - return this._starting.promise; + await this._starting.promise; + return false; } // Load the server state so we can check the process ID @@ -527,7 +549,7 @@ export class KCApi implements KallichoreAdapterApi { // If there's no server state, return as we can't check its status if (!serverState) { this._log.appendLine(`No Kallichore server state found; cannot test server process`); - return; + return false; } // Test the process ID to see if the server is still running. @@ -546,7 +568,7 @@ export class KCApi implements KallichoreAdapterApi { // The server is still running; nothing to do if (serverRunning) { - return; + return false; } // Clean up the state so we don't try to reconnect to a server that @@ -578,6 +600,9 @@ export class KCApi implements KallichoreAdapterApi { vscode.window.showInformationMessage( vscode.l10n.t('The process supervising the interpreters has exited unexpectedly and could not automatically restarted: ' + err)); } + + // The server did exit. + return true; } /** @@ -715,4 +740,35 @@ export class KCApi implements KallichoreAdapterApi { throw new Error(`Kallichore server not found (expected at ${embeddedBinary})`); } + + /** + * Reconnects to the active session, if one exists. Primarily useful as a + * troubleshooting tool. + */ + async reconnectActiveSession() { + // Get the foreground session from the Positron API + const session = await positron.runtime.getForegroundSession(); + if (!session) { + vscode.window.showInformationMessage(vscode.l10n.t('No active session to reconnect to')); + return; + } + + // Find the session in our list + const kallichoreSession = this._sessions.find(s => s.metadata.sessionId === session.metadata.sessionId); + if (!kallichoreSession) { + vscode.window.showInformationMessage(vscode.l10n.t('Active session {0} not managed by the kernel supervisor', session.metadata.sessionName)); + return; + } + + // Ensure the session is still active + if (kallichoreSession.runtimeState === positron.RuntimeState.Exited) { + vscode.window.showInformationMessage(vscode.l10n.t('Session {0} is not running', session.metadata.sessionName)); + return; + } + + // Disconnect the session; since the session is active, this should + // trigger a reconnect. + kallichoreSession.log('Disconnecting by user request', vscode.LogLevel.Info); + kallichoreSession.disconnect(); + } } diff --git a/extensions/positron-supervisor/src/KallichoreSession.ts b/extensions/positron-supervisor/src/KallichoreSession.ts index 3b1fd2053bf..f925bfe1f8a 100644 --- a/extensions/positron-supervisor/src/KallichoreSession.ts +++ b/extensions/positron-supervisor/src/KallichoreSession.ts @@ -571,8 +571,8 @@ export class KallichoreSession implements JupyterLanguageRuntimeSession { type === positron.RuntimeClientType.IPyWidgetControl) { const msg: JupyterCommOpen = { - target_name: type, // eslint-disable-line - comm_id: id, // eslint-disable-line + target_name: type, + comm_id: id, data: params }; const commOpen = new CommOpenCommand(msg, metadata); @@ -1044,6 +1044,13 @@ export class KallichoreSession implements JupyterLanguageRuntimeSession { this._disposables = []; } + /** + * Disconnect the session + */ + public disconnect() { + this._socket?.ws.close(); + } + /** * Main entry point for handling messages delivered over the websocket from * the Kallichore server. @@ -1165,6 +1172,15 @@ export class KallichoreSession implements JupyterLanguageRuntimeSession { this.onExited(exitCode); } + /** + * Marks the kernel as offline. + * + * @param reason The reason for the kernel going offline + */ + markOffline(reason: string) { + this.onStateChange(positron.RuntimeState.Offline, reason); + } + private onExited(exitCode: number) { if (this._restarting) { // If we're restarting, wait for the kernel to start up again