Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve diagnostics for kernel supervisor startup failures #5705

Merged
merged 4 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions extensions/positron-supervisor/positron.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"binaries": [
{
"from": "resources/supervisor-wrapper.*",
"to": "resources"
}
]
}
44 changes: 44 additions & 0 deletions extensions/positron-supervisor/resources/supervisor-wrapper.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
@echo off

REM ---------------------------------------------------------------------------------------------
REM Copyright (C) 2024 Posit Software, PBC. All rights reserved.
REM ---------------------------------------------------------------------------------------------

REM This script is used to run a program and capture its output to a file. It is
REM used to capture the output of the supervisor process so that it can be displayed
REM in the UI in the case of a startup failure.

REM Check that the user provided at least two arguments; the first is the output
REM file and the second is the program to run and any arguments. If not, print a
REM usage message and exit with an error code.

if "%~2"=="" (
echo Usage: %0 ^<output-file^> ^<program^> [program-args...] >&2
exit /b 1
)

REM The first argument is the output file; consume it.
set output_file=%1
shift

REM `shift` doesn't affect `%*`, so we have to manually remove the first argument
set "args="
:parse
if "%~1" neq "" (
set args=%args% %1
shift
goto :parse
)
if defined args set args=%args:~1%

REM Print the command line to the log file
echo %args% >> "%output_file%"

REM Run the program with its arguments and capture the output
%args% >> "%output_file%"

REM Save the exit code of the program
set exit_code=%ERRORLEVEL%

REM Exit with the same code as the program so that the caller can correctly report errors
exit /b exit_code
34 changes: 34 additions & 0 deletions extensions/positron-supervisor/resources/supervisor-wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

# ---------------------------------------------------------------------------------------------
# Copyright (C) 2024 Posit Software, PBC. All rights reserved.
# Licensed under the Elastic License 2.0. See LICENSE.txt for license information.
# ---------------------------------------------------------------------------------------------

# This script is used to run a program and capture its output to a file. It is
# used to capture the output of the supervisor process so that it can be displayed
# in the UI in the case of a startup failure

# Check that the user provided at least two arguments; the first is the output
# file and the second is the program to run and any arguments. If not, print a
# usage message and exit with an error code.
if [ $# -lt 2 ]; then
echo "Usage: $0 <output-file> <program> [program-args...]" >&2
exit 1
fi

# The first argument is the output file; consume it.
output_file="$1"
shift

# Print the command line to the log file
echo "$@" >> "$output_file"

# Run the program with its arguments, redirecting stdout and stderr to the output file
"$@" >> "$output_file" 2>&1

# Save the exit code of the program
exit_code=$?

# Exit with the same code as the program so that the caller can correctly report errors
exit $exit_code
81 changes: 80 additions & 1 deletion extensions/positron-supervisor/src/KallichoreAdapterApi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ export class KCApi implements KallichoreAdapterApi {
*/
private _logStreamer: LogStreamer | undefined;

/**
* An array of disposables that need to be cleaned up when the API is disposed.
*/
private _disposables: vscode.Disposable[] = [];

/**
* Create a new Kallichore API object.
*
Expand Down Expand Up @@ -202,12 +207,21 @@ export class KCApi implements KallichoreAdapterApi {
// Create a temporary file with a random name to use for logs
const logFile = path.join(os.tmpdir(), `kallichore-${sessionId}.log`);

// Create a second file to capture the server's stdout and stderr
const outFile = path.join(os.tmpdir(), `kallichore-${sessionId}.out.log`);

// Determine the path to the wrapper script.
const wrapperName = os.platform() === 'win32' ? 'supervisor-wrapper.bat' : 'supervisor-wrapper.sh';
const wrapperPath = path.join(this._context.extensionPath, 'resources', wrapperName);

// Start the server in a new terminal
this._log.appendLine(`Starting Kallichore server ${shellPath} on port ${port}`);
const terminal = vscode.window.createTerminal(<vscode.TerminalOptions>{
name: 'Kallichore',
shellPath: shellPath,
shellPath: wrapperPath,
shellArgs: [
outFile,
shellPath,
'--port', port.toString(),
'--token', tokenPath,
'--log-level', logLevel,
Expand All @@ -219,6 +233,60 @@ export class KCApi implements KallichoreAdapterApi {
isTransient: false
});

// Flag to track if the terminal exited before the start barrier opened
let exited = false;

// Listen for the terminal to close. If it closes unexpectedly before
// the start barrier opens, provide some feedback.
const closeListener = vscode.window.onDidCloseTerminal(async (closedTerminal) => {
// Ignore closed terminals that aren't the one we started
if (closedTerminal !== terminal) {
return;
}

// Ignore if the start barrier is already open (that means the
// server started successfully), or if more than 5 minutes have elapsed
if (this._started.isOpen()) {
return;
}

// Ignore if it's been more than 5 minutes since the start time
if (Date.now() - startTime > 300000) {
return;
}

// Mark the terminal as exited
exited = true;

// Read the contents of the output file and log it
const contents = fs.readFileSync(outFile, 'utf8');
if (terminal.exitStatus && terminal.exitStatus.code) {
this._log.appendLine(`Supervisor terminal closed with exit code ${terminal.exitStatus.code}; output:\n${contents}`);
} else {
this._log.appendLine(`Supervisor terminal closed unexpectedly; output:\n${contents}`);
}

// Display a notification that directs users to open the log to get more information
const selection = await vscode.window.showInformationMessage(
vscode.l10n.t('There was an error starting the kernel supervisor. Check the log for more information.'), {
title: vscode.l10n.t('Open Log'),
execute: () => {
this._log.show(false);
}
});
if (selection) {
selection.execute();
}
});

// Ensure this listener is disposed when the API is disposed
this._disposables.push(closeListener);

// Ensure the output file is cleaned up when the API is disposed
this._disposables.push(new vscode.Disposable(() => {
fs.unlinkSync(outFile);
}));

// Wait for the terminal to start and get the PID
await terminal.processId;

Expand All @@ -237,6 +305,11 @@ export class KCApi implements KallichoreAdapterApi {
} catch (err) {
const elapsed = Date.now() - startTime;

// Has the terminal exited? if it has, there's no point in continuing to retry.
if (exited) {
throw new Error(`The supervisor process exited before the server was ready.`);
}

// ECONNREFUSED is a normal condition during startup; the server
// isn't ready yet. Keep trying until we hit the retry limit,
// about 2 seconds from the time we got a process ID
Expand Down Expand Up @@ -285,6 +358,9 @@ export class KCApi implements KallichoreAdapterApi {
this._log.appendLine(`Streaming Kallichore server logs from ${logFile} (log level: ${logLevel})`);
});

// Now that we're online, we can dispose of the close listener
closeListener.dispose();

// Open the started barrier and save the server state since we're online
this._started.open();
const state: KallichoreServerState = {
Expand Down Expand Up @@ -579,6 +655,9 @@ export class KCApi implements KallichoreAdapterApi {
this._logStreamer.dispose();
this._logStreamer = undefined;
}

// Dispose of any other disposables
this._disposables.forEach(disposable => disposable.dispose());
}

findAvailablePort(excluding: Array<number>, maxTries: number): Promise<number> {
Expand Down
Loading