Skip to content

Commit 031905b

Browse files
fix(profiling): always report CPU time regardless of Thread running state (#16273)
## Description This PR updates the Python Profiler logic to always report CPU Time for sampled Threads, regardless of whether they were running when we computed their spent CPU time. ### Why? In [Incident 48491](https://app.datadoghq.com/incidents/48491), we investigated an issue where following a `ddtrace` upgrade, one of our Python services suddenly started reporting near zero CPU Time and no Frames at all/empty flame graphs. The investigation showed that this changed if we removed the check on `is_running` and always reported CPU Time for each sampled Thread. Originally, this check was introduced to avoid reporting CPU Time on Stacks that were in fact idle states; however it also means it the Thread runs between two Samples but then does not run when we "check whether it is running", then we are blind to what it did during that time. On top of that, there is no reliable way to check whether a Thread is running – checking this is by definition racy, and although there are potential ways around it, we could never guarantee that the running state we determined matched the one that the Thread was in when we captured its Stack, meaning we could have discrepancies anyway. Finally, several other Datadog Profilers have this bias and accept it, so we should probably match that behaviour. **Note** the performance difference will probably be negligible, but this change may actually slightly improve our CPU usage as we will now make one less system call.
1 parent 4ce99ef commit 031905b

File tree

3 files changed

+8
-28
lines changed

3 files changed

+8
-28
lines changed

ddtrace/internal/datadog/profiling/stack/echion/echion/threads.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,11 @@ class ThreadInfo
5454
mach_port_t mach_port;
5555
#endif
5656
microsecond_t cpu_time;
57-
bool running_ = false;
5857

5958
uintptr_t asyncio_loop = 0;
6059
uintptr_t tstate_addr = 0; // Remote address of PyThreadState for accessing asyncio_tasks_head
6160

6261
[[nodiscard]] Result<void> update_cpu_time();
63-
bool is_running();
6462

6563
[[nodiscard]] Result<void> sample(EchionSampler&, int64_t, PyThreadState*, microsecond_t);
6664
void unwind(EchionSampler&, PyThreadState*);

ddtrace/internal/datadog/profiling/stack/src/echion/threads.cc

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ ThreadInfo::sample(EchionSampler& echion, int64_t iid, PyThreadState* tstate, mi
584584
return ErrorKind::CpuTimeError;
585585
}
586586

587-
Renderer::get().render_cpu_time(is_running() ? cpu_time - previous_cpu_time : 0);
587+
Renderer::get().render_cpu_time(cpu_time - previous_cpu_time);
588588

589589
this->unwind(echion, tstate);
590590

@@ -640,29 +640,18 @@ Result<void>
640640
ThreadInfo::update_cpu_time()
641641
{
642642
#if defined PL_LINUX
643-
struct timespec ts1;
644-
if (clock_gettime(cpu_clock_id, &ts1)) {
643+
struct timespec ts;
644+
if (clock_gettime(cpu_clock_id, &ts)) {
645645
// If the clock is invalid, we skip updating the CPU time.
646646
// This can happen if we try to compute CPU time for a thread that has exited.
647647
if (errno == EINVAL) {
648-
this->running_ = false;
649648
return Result<void>::ok();
650649
}
651650

652651
return ErrorKind::CpuTimeError;
653652
}
654653

655-
this->cpu_time = TS_TO_MICROSECOND(ts1);
656-
657-
// Determine if running by checking if CPU time advances between two back-to-back
658-
// measurements. This is done here to avoid a separate is_running() call with
659-
// its own syscalls (reduces 3 syscalls per thread to 2).
660-
struct timespec ts2;
661-
if (clock_gettime(cpu_clock_id, &ts2) != 0) {
662-
this->running_ = false;
663-
} else {
664-
this->running_ = (ts1.tv_sec != ts2.tv_sec || ts1.tv_nsec != ts2.tv_nsec);
665-
}
654+
this->cpu_time = TS_TO_MICROSECOND(ts);
666655
#elif defined PL_DARWIN
667656
thread_basic_info_data_t info;
668657
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
@@ -673,33 +662,22 @@ ThreadInfo::update_cpu_time()
673662
// If the thread is invalid, we skip updating the CPU time.
674663
// This can happen if we try to compute CPU time for a thread that has exited.
675664
if (kr == KERN_INVALID_ARGUMENT) {
676-
this->running_ = false;
677665
return Result<void>::ok();
678666
}
679667

680668
return ErrorKind::CpuTimeError;
681669
}
682670

683671
if (info.flags & TH_FLAGS_IDLE) {
684-
this->running_ = false;
685672
return Result<void>::ok();
686673
}
687674

688675
this->cpu_time = TV_TO_MICROSECOND(info.user_time) + TV_TO_MICROSECOND(info.system_time);
689-
// On macOS, thread_info already gives us run_state, so no need to check if the clock is advancing
690-
this->running_ = (info.run_state == TH_STATE_RUNNING);
691676
#endif
692677

693678
return Result<void>::ok();
694679
}
695680

696-
bool
697-
ThreadInfo::is_running()
698-
{
699-
// Running state is computed in update_cpu_time by taking two back-to-back measurements of the CPU time.
700-
return this->running_;
701-
}
702-
703681
void
704682
for_each_thread(EchionSampler& echion, InterpreterInfo& interp, PyThreadStateCallback callback)
705683
{
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
fixes:
2+
- |
3+
profiling: The Profiler now always reports CPU time for threads, regardless of whether they are running when
4+
the sample is captured.

0 commit comments

Comments
 (0)