Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get thread and task backtraces before terminating a worker on timeout #157

Closed
wants to merge 13 commits into from
34 changes: 24 additions & 10 deletions src/ReTestItems.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ will be run.
Zero means no profile will be taken. Can also be set using the `RETESTITEMS_TIMEOUT_PROFILE_WAIT`
environment variable. See the [Profile documentation](https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution)
for more information on triggered profiles. Note you can use `worker_init_expr` to tweak the profile settings on workers.
- `timeout_backtraces::Bool=false`: Linux only. If `true`, we use GDB to get thread and task
backtraces from a worker that times-out. Can also be set using the `RETESTITEMS_TIMEOUT_BACKTRACES`
environment variable. If a CPU profile was also requested, this runs after that is taken.
"""
function runtests end

Expand Down Expand Up @@ -243,6 +246,7 @@ function runtests(
test_end_expr::Expr=Expr(:block),
validate_paths::Bool=parse(Bool, get(ENV, "RETESTITEMS_VALIDATE_PATHS", "false")),
timeout_profile_wait::Real=parse(Int, get(ENV, "RETESTITEMS_TIMEOUT_PROFILE_WAIT", "0")),
timeout_backtraces::Bool=parse(Bool, get(ENV, "RETESTITEMS_TIMEOUT_BACKTRACES", "false")),
)
nworker_threads = _validated_nworker_threads(nworker_threads)
paths′ = _validated_paths(paths, validate_paths)
Expand All @@ -262,13 +266,14 @@ function runtests(
timeout = ceil(Int, testitem_timeout)
timeout_profile_wait = ceil(Int, timeout_profile_wait)
(timeout_profile_wait > 0 && Sys.iswindows()) && @warn "CPU profiles on timeout is not supported on Windows, ignoring `timeout_profile_wait`"
(timeout_backtraces && !Sys.islinux()) && @warn "Backtraces on timeout is only supported on Linux, ignoring `timeout_backtraces`"
debuglvl = Int(debug)
if debuglvl > 0
LoggingExtras.withlevel(LoggingExtras.Debug; verbosity=debuglvl) do
_runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait)
_runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait, timeout_backtraces)
end
else
return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait)
return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait, timeout_backtraces)
end
end

Expand All @@ -282,7 +287,7 @@ end
# By tracking and reusing test environments, we can avoid this issue.
const TEST_ENVS = Dict{String, String}()

function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int)
function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int, timeout_backtraces::Bool)
# Don't recursively call `runtests` e.g. if we `include` a file which calls it.
# So we ignore the `runtests(...)` call in `test/runtests.jl` when `runtests(...)`
# was called from the command line.
Expand All @@ -302,7 +307,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor
if is_running_test_runtests_jl(proj_file)
# Assume this is `Pkg.test`, so test env already active.
@debugv 2 "Running in current environment `$(Base.active_project())`"
return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait)
return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait, timeout_backtraces)
else
@debugv 1 "Activating test environment for `$proj_file`"
orig_proj = Base.active_project()
Expand All @@ -315,7 +320,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor
testenv = TestEnv.activate()
TEST_ENVS[proj_file] = testenv
end
_runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait)
_runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait, timeout_backtraces)
finally
Base.set_active_project(orig_proj)
end
Expand All @@ -327,6 +332,7 @@ function _runtests_in_current_env(
shouldrun, paths, projectfile::String, nworkers::Int, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr,
testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol,
timeout_profile_wait::Int,
timeout_backtraces::Bool,
)
start_time = time()
proj_name = something(Pkg.Types.read_project(projectfile).name, "")
Expand Down Expand Up @@ -391,7 +397,7 @@ function _runtests_in_current_env(
ti = starting[i]
@spawn begin
with_logger(original_logger) do
manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs, $timeout_profile_wait)
manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs, $timeout_profile_wait, $timeout_backtraces)
end
end
end
Expand Down Expand Up @@ -422,7 +428,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems;
const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems)
GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules()
nthreads_str = $nworker_threads
@info "Starting test worker$($i) on pid = $(Libc.getpid()), with $nthreads_str threads"
@info "Starting test worker$($i) on pid = $(Libc.getpid()), with $nthreads_str threads, nthreads(:default) = $(Base.Threads.nthreads(:default)), nthreads(:interactive) = $(Base.Threads.nthreads(:interactive))"
$(worker_init_expr.args...)
nothing
end)
Expand Down Expand Up @@ -502,7 +508,8 @@ end

function manage_worker(
worker::Worker, proj_name::AbstractString, testitems::TestItems, testitem::Union{TestItem,Nothing}, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr,
default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int
default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int,
timeout_backtraces::Bool,
)
ntestitems = length(testitems.testitems)
run_number = 1
Expand Down Expand Up @@ -563,11 +570,17 @@ function manage_worker(
@debugv 2 "Error" exception=e
# Handle the exception
if e isa TimeoutException
@warn "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds."
xtra_output = nothing
if timeout_profile_wait > 0
@warn "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \
A CPU profile will be triggered on the worker and then it will be terminated."
@info "Gathering a CPU profile from $worker."
trigger_profile(worker, timeout_profile_wait, :timeout)
end
if timeout_backtraces
@info "Gathering thread and task backtraces from $worker."
xtra_output = trigger_backtraces(worker, :timeout)
end
@info "Terminating $worker."
terminate!(worker, :timeout)
wait(worker)
# TODO: We print the captured logs after the worker is terminated,
Expand All @@ -576,6 +589,7 @@ function manage_worker(
# This is not an issue with eager logs, but when going through a file, this seems to help.
println(DEFAULT_STDOUT[])
_print_captured_logs(DEFAULT_STDOUT[], testitem, run_number)
isnothing(xtra_output) || println(DEFAULT_STDOUT[], xtra_output)
@error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \
Recording test error."
record_timeout!(testitem, run_number, timeout)
Expand Down
49 changes: 45 additions & 4 deletions src/workers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module Workers
using Sockets, Serialization

export Worker, remote_eval, remote_fetch, terminate!, WorkerTerminatedException
export trigger_profile
export trigger_profile, trigger_backtraces

function try_with_timeout(f, timeout)
cond = Threads.Condition()
Expand Down Expand Up @@ -108,8 +108,8 @@ end

# Send signal to the given `Worker` process to trigger a profile.
# Users can customise this profiling in the usual way, e.g. via
# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, but `Profile.set_peek_duration`, `Profile.peek_report[]`
# would have to be modified in the worker process.
# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, but `Profile.set_peek_duration`,
# `Profile.peek_report[]` would have to be modified in the worker process.
# See https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution
# Called when timeout_profile_wait is non-zero.
function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual)
Expand All @@ -120,7 +120,48 @@ function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual)
elseif Sys.isbsd()
kill(w.process, 29) # SIGINFO
end
sleep(timeout_profile_wait) # Leave time for it to print the profile.
sleep(timeout_profile_wait) # Leave time to output the profile.
end
return nothing
end

# Spawn GDB to request thread backtraces followed by invoking
# jl_print_task_backtraces to get task backtraces. We wait for GDB
# to finish so no sleep is needed. Thread backtraces are written to
# the GDB process' standard output and not the worker's standard
# output, so we just return those as a `String` from here for the
# caller to assemble with the worker's output.
function trigger_backtraces(w::Worker, from::Symbol=:manual)
if Sys.islinux()
kpamnany marked this conversation as resolved.
Show resolved Hide resolved
@debug "using GDB to get thread and task backtraces on worker $(w.pid) from $from"
#iob = IOBuffer()
gdb_cmd = Cmd([
"gdb",
# Run in batched mode on the worker process
"--batch", "-p", "$(w.pid)",
# Julia's runtime uses SIGSEGV for GC and SIGUSR2 to pause threads
# (for profiling or backtracing). Tell GDB to ignore these signals.
"-ex", "handle SIGSEGV noprint nostop pass",
"-ex", "handle SIGUSR2 noprint nostop pass",
# Get all thread backtraces
"-ex", "thread apply all bt",
])
#=
@static if VERSION >= v"1.9"
push!(gdb_cmd.exec,
# Ask Julia to dump all task backtraces
"-ex", "call jl_print_task_backtraces(1)",
)
end
=#
try
println("Running $(gdb_cmd)")
run(gdb_cmd)
#run(pipeline(gdb_cmd, stdout=iob, stderr=iob))
#return String(take!(iob))
catch e
@warn "Could not get thread/task backtraces via GDB." exception=e
end
end
return nothing
end
Expand Down
67 changes: 67 additions & 0 deletions test/integrationtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,73 @@ end
end
end

@testset "Backtraces timeout trigger" begin
function gdb_available()
try
run(`gdb -ex "exit"`)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a way to run CI on machines with gdb available? why don't the linux CI machines have it?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we need a step in the CI workflow that has:

      - name: Install GDB
        run: sudo apt-get install -y gdb

But CI.yml is written in a platform-independent way and I haven't figured out how to add that step for Ubuntu only. If you know, that would be great.

Copy link
Collaborator

@nickrobinson251 nickrobinson251 May 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, i think that just needs an if: line like

- name: Install GDB
  if: matrix.os == 'ubuntu-latest'
  run: sudo apt-get install -y gdb

not certain on the syntax, might need to be

- name: Install GDB
  if: ${{ matrix.os }} == 'ubuntu-latest'
  run: sudo apt-get install -y gdb

or

- name: Install GDB
  if: ${{ matrix.os == 'ubuntu-latest' }}
  run: sudo apt-get install -y gdb

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added it, but the CI machine does not allow gdb to attach to the process:

Could not attach to process.  If your uid matches the uid of the target
process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try
again as the root user.  For more details, see /etc/sysctl.d/10-ptrace.conf
ptrace: Inappropriate ioctl for device.

So I've backed it out. 🤷‍♂️

true
catch
false
end
end
function capture_timeout_backtraces(f, timeout_backtraces; kwargs...)
logs = mktemp() do path, io
redirect_stdio(stdout=io, stderr=io, stdin=devnull) do
encased_testset() do
if isnothing(timeout_backtraces)
runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, kwargs...)
else
runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, timeout_backtraces, kwargs...)
end
end
end
flush(io)
close(io)
read(path, String)
end
f(logs)
return logs
end

@testset "timeout_backtraces=false means no backtraces" begin
capture_timeout_backtraces(false) do logs
@test !occursin("Thread 1", logs)
end
end

@testset "timeout_backtraces=true means we gather backtraces" begin
capture_timeout_backtraces(true) do logs
if gdb_available()
if Sys.islinux()
@test occursin("in signal_listener", logs)
else
@test_skip "not on Linux"
end
@test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows)
if VERSION >= v"1.9" # jl_print_task_backtraces() output
@test occursin("==== Thread 1 created", logs)
@test occursin("==== End thread 1", logs)
end
else
@test_skip "gdb unavailable"
end
kpamnany marked this conversation as resolved.
Show resolved Hide resolved
end
end

# The RETESTITEMS_TIMEOUT_BACKTRACES environment variable can be used to set timeout_backtraces.
@testset "RETESTITEMS_TIMEOUT_BACKTRACES environment variable" begin
withenv("RETESTITEMS_TIMEOUT_BACKTRACES" => "true") do
capture_timeout_backtraces(nothing) do logs
if gdb_available()
@test occursin("Thread 1", logs)
else
@test_skip "gdb unavailable"
end
kpamnany marked this conversation as resolved.
Show resolved Hide resolved
end
end
end
end

@testset "worker always crashes immediately" begin
file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")

Expand Down
Loading