JuliaTesting · kpamnany · May 6, 2024 · May 6, 2024 · May 6, 2024 · May 6, 2024
diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl
@@ -197,6 +197,9 @@ will be run.
    Zero means no profile will be taken. Can also be set using the `RETESTITEMS_TIMEOUT_PROFILE_WAIT`
    environment variable. See the [Profile documentation](https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution)
    for more information on triggered profiles. Note you can use `worker_init_expr` to tweak the profile settings on workers.
+- `timeout_backtraces::Bool=false`: Linux only. If `true`, we use GDB to get thread and task
+  backtraces from a worker that times-out. Can also be set using the `RETESTITEMS_TIMEOUT_BACKTRACES`
+  environment variable. If a CPU profile was also requested, this runs after that is taken.
 """
 function runtests end
 
@@ -243,6 +246,7 @@ function runtests(
     test_end_expr::Expr=Expr(:block),
     validate_paths::Bool=parse(Bool, get(ENV, "RETESTITEMS_VALIDATE_PATHS", "false")),
     timeout_profile_wait::Real=parse(Int, get(ENV, "RETESTITEMS_TIMEOUT_PROFILE_WAIT", "0")),
+    timeout_backtraces::Bool=parse(Bool, get(ENV, "RETESTITEMS_TIMEOUT_BACKTRACES", "false")),
 )
     nworker_threads = _validated_nworker_threads(nworker_threads)
     paths′ = _validated_paths(paths, validate_paths)
@@ -262,13 +266,14 @@ function runtests(
     timeout = ceil(Int, testitem_timeout)
     timeout_profile_wait = ceil(Int, timeout_profile_wait)
     (timeout_profile_wait > 0 && Sys.iswindows()) && @warn "CPU profiles on timeout is not supported on Windows, ignoring `timeout_profile_wait`"
+    (timeout_backtraces && !Sys.islinux()) && @warn "Backtraces on timeout is only supported on Linux, ignoring `timeout_backtraces`"
     debuglvl = Int(debug)
     if debuglvl > 0
         LoggingExtras.withlevel(LoggingExtras.Debug; verbosity=debuglvl) do
-            _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait)
+            _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait, timeout_backtraces)
         end
     else
-        return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait)
+        return _runtests(shouldrun_combined, paths′, nworkers, nworker_threads, worker_init_expr, test_end_expr, timeout, retries, memory_threshold, verbose_results, debuglvl, report, logs, timeout_profile_wait, timeout_backtraces)
     end
 end
 
@@ -282,7 +287,7 @@ end
 # By tracking and reusing test environments, we can avoid this issue.
 const TEST_ENVS = Dict{String, String}()
 
-function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int)
+function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, worker_init_expr::Expr, test_end_expr::Expr, testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int, timeout_backtraces::Bool)
     # Don't recursively call `runtests` e.g. if we `include` a file which calls it.
     # So we ignore the `runtests(...)` call in `test/runtests.jl` when `runtests(...)`
     # was called from the command line.
@@ -302,7 +307,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor
         if is_running_test_runtests_jl(proj_file)
             # Assume this is `Pkg.test`, so test env already active.
             @debugv 2 "Running in current environment `$(Base.active_project())`"
-            return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait)
+            return _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait, timeout_backtraces)
         else
             @debugv 1 "Activating test environment for `$proj_file`"
             orig_proj = Base.active_project()
@@ -315,7 +320,7 @@ function _runtests(shouldrun, paths, nworkers::Int, nworker_threads::String, wor
                     testenv = TestEnv.activate()
                     TEST_ENVS[proj_file] = testenv
                 end
-                _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait)
+                _runtests_in_current_env(shouldrun, paths, proj_file, nworkers, nworker_threads, worker_init_expr, test_end_expr, testitem_timeout, retries, memory_threshold, verbose_results, debug, report, logs, timeout_profile_wait, timeout_backtraces)
             finally
                 Base.set_active_project(orig_proj)
             end
@@ -327,6 +332,7 @@ function _runtests_in_current_env(
     shouldrun, paths, projectfile::String, nworkers::Int, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr,
     testitem_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol,
     timeout_profile_wait::Int,
+    timeout_backtraces::Bool,
 )
     start_time = time()
     proj_name = something(Pkg.Types.read_project(projectfile).name, "")
@@ -391,7 +397,7 @@ function _runtests_in_current_env(
                 ti = starting[i]
                 @spawn begin
                     with_logger(original_logger) do
-                        manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs, $timeout_profile_wait)
+                        manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $test_end_expr, $testitem_timeout, $retries, $memory_threshold, $verbose_results, $debug, $report, $logs, $timeout_profile_wait, $timeout_backtraces)
                     end
                 end
             end
@@ -422,7 +428,7 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems;
         const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems)
         GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules()
         nthreads_str = $nworker_threads
-        @info "Starting test worker$($i) on pid = $(Libc.getpid()), with $nthreads_str threads"
+        @info "Starting test worker$($i) on pid = $(Libc.getpid()), with $nthreads_str threads, nthreads(:default) = $(Base.Threads.nthreads(:default)), nthreads(:interactive) = $(Base.Threads.nthreads(:interactive))"
         $(worker_init_expr.args...)
         nothing
     end)
@@ -502,7 +508,8 @@ end
 
 function manage_worker(
     worker::Worker, proj_name::AbstractString, testitems::TestItems, testitem::Union{TestItem,Nothing}, nworker_threads, worker_init_expr::Expr, test_end_expr::Expr,
-    default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int
+    default_timeout::Int, retries::Int, memory_threshold::Real, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol, timeout_profile_wait::Int,
+    timeout_backtraces::Bool,
 )
     ntestitems = length(testitems.testitems)
     run_number = 1
@@ -563,11 +570,17 @@ function manage_worker(
             @debugv 2 "Error" exception=e
             # Handle the exception
             if e isa TimeoutException
+                @warn "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds."
+                xtra_output = nothing
                 if timeout_profile_wait > 0
-                    @warn "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \
-                        A CPU profile will be triggered on the worker and then it will be terminated."
+                    @info "Gathering a CPU profile from $worker."
                     trigger_profile(worker, timeout_profile_wait, :timeout)
                 end
+                if timeout_backtraces
+                    @info "Gathering thread and task backtraces from $worker."
+                    xtra_output = trigger_backtraces(worker, :timeout)
+                end
+                @info "Terminating $worker."
                 terminate!(worker, :timeout)
                 wait(worker)
                 # TODO: We print the captured logs after the worker is terminated,
@@ -576,6 +589,7 @@ function manage_worker(
                 # This is not an issue with eager logs, but when going through a file, this seems to help.
                 println(DEFAULT_STDOUT[])
                 _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number)
+                isnothing(xtra_output) || println(DEFAULT_STDOUT[], xtra_output)
                 @error "$worker timed out running test item $(repr(testitem.name)) after $timeout seconds. \
                     Recording test error."
                 record_timeout!(testitem, run_number, timeout)

diff --git a/src/workers.jl b/src/workers.jl
@@ -3,7 +3,7 @@ module Workers
 using Sockets, Serialization
 
 export Worker, remote_eval, remote_fetch, terminate!, WorkerTerminatedException
-export trigger_profile
+export trigger_profile, trigger_backtraces
 
 function try_with_timeout(f, timeout)
     cond = Threads.Condition()
@@ -108,8 +108,8 @@ end
 
 # Send signal to the given `Worker` process to trigger a profile.
 # Users can customise this profiling in the usual way, e.g. via
-# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, but `Profile.set_peek_duration`, `Profile.peek_report[]`
-# would have to be modified in the worker process.
+# `JULIA_PROFILE_PEEK_HEAP_SNAPSHOT`, but `Profile.set_peek_duration`,
+# `Profile.peek_report[]` would have to be modified in the worker process.
 # See https://docs.julialang.org/en/v1/stdlib/Profile/#Triggered-During-Execution
 # Called when timeout_profile_wait is non-zero.
 function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual)
@@ -120,7 +120,48 @@ function trigger_profile(w::Worker, timeout_profile_wait, from::Symbol=:manual)
         elseif Sys.isbsd()
             kill(w.process, 29)  # SIGINFO
         end
-        sleep(timeout_profile_wait) # Leave time for it to print the profile.
+        sleep(timeout_profile_wait) # Leave time to output the profile.
+    end
+    return nothing
+end
+
+# Spawn GDB to request thread backtraces followed by invoking
+# jl_print_task_backtraces to get task backtraces. We wait for GDB
+# to finish so no sleep is needed. Thread backtraces are written to
+# the GDB process' standard output and not the worker's standard
+# output, so we just return those as a `String` from here for the
+# caller to assemble with the worker's output.
+function trigger_backtraces(w::Worker, from::Symbol=:manual)
+    if Sys.islinux()
+        @debug "using GDB to get thread and task backtraces on worker $(w.pid) from $from"
+        #iob = IOBuffer()
+        gdb_cmd = Cmd([
+            "gdb",
+            # Run in batched mode on the worker process
+            "--batch", "-p", "$(w.pid)",
+            # Julia's runtime uses SIGSEGV for GC and SIGUSR2 to pause threads
+            # (for profiling or backtracing). Tell GDB to ignore these signals.
+            "-ex", "handle SIGSEGV noprint nostop pass",
+            "-ex", "handle SIGUSR2 noprint nostop pass",
+            # Get all thread backtraces
+            "-ex", "thread apply all bt",
+        ])
+#=
+        @static if VERSION >= v"1.9"
+            push!(gdb_cmd.exec,
+                # Ask Julia to dump all task backtraces
+                "-ex", "call jl_print_task_backtraces(1)",
+            )
+        end
+=#
+        try
+            println("Running $(gdb_cmd)")
+            run(gdb_cmd)
+            #run(pipeline(gdb_cmd, stdout=iob, stderr=iob))
+            #return String(take!(iob))
+        catch e
+            @warn "Could not get thread/task backtraces via GDB." exception=e
+        end
     end
     return nothing
 end

diff --git a/test/integrationtests.jl b/test/integrationtests.jl
@@ -890,6 +890,73 @@ end
     end
 end
 
+@testset "Backtraces timeout trigger" begin
+    function gdb_available()
+        try
+            run(`gdb -ex "exit"`)
+            true
+        catch
+            false
+        end
+    end
+    function capture_timeout_backtraces(f, timeout_backtraces; kwargs...)
+        logs = mktemp() do path, io
+            redirect_stdio(stdout=io, stderr=io, stdin=devnull) do
+                encased_testset() do
+                    if isnothing(timeout_backtraces)
+                        runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, kwargs...)
+                    else
+                        runtests(joinpath(TEST_FILES_DIR, "_timeout_tests.jl"); nworkers=1, testitem_timeout=2, timeout_backtraces, kwargs...)
+                    end
+                end
+            end
+            flush(io)
+            close(io)
+            read(path, String)
+        end
+        f(logs)
+        return logs
+    end
+
+    @testset "timeout_backtraces=false means no backtraces" begin
+    capture_timeout_backtraces(false) do logs
+        @test !occursin("Thread 1", logs)
+        end
+    end
+
+    @testset "timeout_backtraces=true means we gather backtraces" begin
+    capture_timeout_backtraces(true) do logs
+        if gdb_available()
+            if Sys.islinux()
+                @test occursin("in signal_listener", logs)
+            else
+                @test_skip "not on Linux"
+            end
+            @test count(r"pthread_cond_wait|__psych_cvwait", logs) > 0 # the stacktrace was printed (will fail on Windows)
+            if VERSION >= v"1.9" # jl_print_task_backtraces() output
+                @test occursin("==== Thread 1 created", logs)
+                @test occursin("==== End thread 1", logs)
+            end
+        else
+            @test_skip "gdb unavailable"
+        end
+        end
+    end
+
+    # The RETESTITEMS_TIMEOUT_BACKTRACES environment variable can be used to set timeout_backtraces.
+    @testset "RETESTITEMS_TIMEOUT_BACKTRACES environment variable" begin
+    withenv("RETESTITEMS_TIMEOUT_BACKTRACES" => "true") do
+        capture_timeout_backtraces(nothing) do logs
+            if gdb_available()
+                @test occursin("Thread 1", logs)
+            else
+                @test_skip "gdb unavailable"
+            end
+            end
+        end
+    end
+end
+
 @testset "worker always crashes immediately" begin
     file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")