
Commit c4fdb76
Heap profiling for Ruby 4.x - Prototype
The idea is to delay the point at which we record object IDs: once we are outside of the allocation code path, we can safely request the object ID.
1 parent 682d223 commit c4fdb76
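For readers skimming the diff, the pattern being introduced can be sketched in isolation: capture the freshly allocated VALUE in the NEWOBJ hook, and only call rb_obj_id() later, from a postponed job that runs at a VM safe point. The snippet below is a minimal standalone illustration of that pattern, not the profiler's actual code; all names in it (buffered_obj, finalize_from_postponed_job, Init_deferral_sketch) are hypothetical, and the real implementation in heap_recorder.c below buffers up to 64 pending recordings rather than one.

#include <ruby.h>
#include <ruby/debug.h>

// One-slot buffer: the NEWOBJ hook only stores the VALUE; the object id is
// requested later, outside the allocation code path.
static VALUE buffered_obj = Qnil;
static rb_postponed_job_handle_t finalize_job;

// Runs at the next VM safe point, where calling rb_obj_id() is safe.
static void finalize_from_postponed_job(void *unused) {
  if (buffered_obj == Qnil) return;
  VALUE object_id = rb_obj_id(buffered_obj); // may mutate the object; safe here
  buffered_obj = Qnil;
  (void) object_id; // a real profiler would commit this id into its tracking structures
}

// NEWOBJ hook: calling rb_obj_id() here would mutate a half-initialized object.
static void on_newobj(VALUE tracepoint, void *unused) {
  buffered_obj = rb_tracearg_object(rb_tracearg_from_tracepoint(tracepoint));
  rb_postponed_job_trigger(finalize_job); // defer the unsafe part
}

void Init_deferral_sketch(void) {
  // Keep the buffered VALUE visible to the GC until the job consumes it
  // (the commit achieves the same with an explicit mark function).
  rb_gc_register_address(&buffered_obj);
  finalize_job = rb_postponed_job_preregister(0, finalize_from_postponed_job, NULL);
  rb_tracepoint_enable(rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj, NULL));
}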

12 files changed: +323 -11 lines changed

ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c

Lines changed: 40 additions & 0 deletions
@@ -16,6 +16,7 @@
 #include "private_vm_api_access.h"
 #include "setup_signal_handler.h"
 #include "time_helpers.h"
+#include "heap_recorder.h"

 // Used to trigger the execution of Collectors::ThreadContext, which implements all of the sampling logic
 // itself; this class only implements the "when to do it" part.
@@ -89,6 +90,9 @@ unsigned int MAX_ALLOC_WEIGHT = 10000;
 static rb_postponed_job_handle_t sample_from_postponed_job_handle;
 static rb_postponed_job_handle_t after_gc_from_postponed_job_handle;
 static rb_postponed_job_handle_t after_gvl_running_from_postponed_job_handle;
+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  static rb_postponed_job_handle_t finalize_heap_allocation_from_postponed_job_handle;
+#endif
 #endif

 // Contains state for a single CpuAndWallTimeWorker instance
@@ -237,6 +241,9 @@ static VALUE rescued_after_gvl_running_from_postponed_job(VALUE self_instance);
 static VALUE _native_gvl_profiling_hook_active(DDTRACE_UNUSED VALUE self, VALUE instance);
 static inline void during_sample_enter(cpu_and_wall_time_worker_state* state);
 static inline void during_sample_exit(cpu_and_wall_time_worker_state* state);
+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  static void finalize_heap_allocation_from_postponed_job(DDTRACE_UNUSED void *_unused);
+#endif

 // We're using `on_newobj_event` function with `rb_add_event_hook2`, which requires in its public signature a function
 // with signature `rb_event_hook_func_t` which doesn't match `on_newobj_event`.
@@ -283,11 +290,17 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
   sample_from_postponed_job_handle = rb_postponed_job_preregister(unused_flags, sample_from_postponed_job, NULL);
   after_gc_from_postponed_job_handle = rb_postponed_job_preregister(unused_flags, after_gc_from_postponed_job, NULL);
   after_gvl_running_from_postponed_job_handle = rb_postponed_job_preregister(unused_flags, after_gvl_running_from_postponed_job, NULL);
+  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+    finalize_heap_allocation_from_postponed_job_handle = rb_postponed_job_preregister(unused_flags, finalize_heap_allocation_from_postponed_job, NULL);
+  #endif

   if (
     sample_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID ||
     after_gc_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID ||
     after_gvl_running_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID
+    #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+      || finalize_heap_allocation_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID
+    #endif
   ) {
     rb_raise(rb_eRuntimeError, "Failed to register profiler postponed jobs (got POSTPONED_JOB_HANDLE_INVALID)");
   }
@@ -1222,6 +1235,12 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *u
   state->stats.allocation_sampled++;

   during_sample_exit(state);
+
+  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+    // On Ruby 4+, we need to trigger a postponed job to finalize the heap allocation recording.
+    // During on_newobj_event, we can't safely call rb_obj_id(), so we defer it until the event completes.
+    rb_postponed_job_trigger(finalize_heap_allocation_from_postponed_job_handle);
+  #endif
 }

 static void disable_tracepoints(cpu_and_wall_time_worker_state *state) {
@@ -1404,6 +1423,27 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
 }
 #endif

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  // This postponed job callback is used to finalize heap allocation recordings on Ruby 4+.
+  // During on_newobj_event, calling rb_obj_id() is unsafe because it mutates the object.
+  // So we defer getting the object_id until after the event completes.
+  static void finalize_heap_allocation_from_postponed_job(DDTRACE_UNUSED void *_unused) {
+    cpu_and_wall_time_worker_state *state = active_sampler_instance_state;
+
+    if (state == NULL) return;
+
+    if (!ddtrace_rb_ractor_main_p()) {
+      return;
+    }
+
+    // Get the heap_recorder from the thread_context_collector
+    heap_recorder *recorder = thread_context_collector_get_heap_recorder(state->thread_context_collector_instance);
+    if (recorder == NULL) return;
+
+    heap_recorder_finalize_pending_recordings(recorder);
+  }
+#endif
+
 static inline void during_sample_enter(cpu_and_wall_time_worker_state* state) {
   // Tell the compiler it's not allowed to reorder the `during_sample` flag with anything that happens after.
   //
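Worth noting about the trigger in on_newobj_event above: preregistered postponed jobs coalesce, i.e. triggering the same handle several times before the VM runs it is documented to result in at most one execution of the callback. That appears to be why heap_recorder.c (below) buffers up to MAX_PENDING_RECORDINGS entries and the finalize callback drains all of them in one pass. A tiny standalone illustration of that contract (hypothetical names, not profiler code):

#include <ruby.h>
#include <ruby/debug.h>

static int runs = 0;
static rb_postponed_job_handle_t demo_job;

// Runs at most once per batch of triggers, so it must drain *all* pending work.
static void demo_job_fn(void *unused) {
  runs++;
}

static void trigger_repeatedly(void) {
  for (int i = 0; i < 5; i++) {
    rb_postponed_job_trigger(demo_job); // repeated triggers before execution coalesce
  }
  // After the VM reaches a safe point, expect runs == 1 rather than 5.
}

void Init_coalescing_demo(void) {
  demo_job = rb_postponed_job_preregister(0, demo_job_fn, NULL);
}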

ext/datadog_profiling_native_extension/collectors_thread_context.c

Lines changed: 8 additions & 0 deletions
@@ -1338,6 +1338,14 @@ VALUE enforce_thread_context_collector_instance(VALUE object) {
   return object;
 }

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  heap_recorder* thread_context_collector_get_heap_recorder(VALUE self_instance) {
+    thread_context_collector_state *state;
+    TypedData_Get_Struct(self_instance, thread_context_collector_state, &thread_context_collector_typed_data, state);
+    return get_heap_recorder_from_stack_recorder(state->recorder_instance);
+  }
+#endif
+
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_stats(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) {

ext/datadog_profiling_native_extension/collectors_thread_context.h

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,7 @@
 #include <stdbool.h>

 #include "gvl_profiling_helper.h"
+#include "heap_recorder.h"

 void thread_context_collector_sample(
   VALUE self_instance,
@@ -18,6 +19,12 @@ void thread_context_collector_on_gc_start(VALUE self_instance);
 __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(VALUE self_instance);
 VALUE enforce_thread_context_collector_instance(VALUE object);

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  // Get the heap_recorder from the thread_context_collector instance.
+  // Used to access pending recordings for deferred finalization.
+  heap_recorder* thread_context_collector_get_heap_recorder(VALUE self_instance);
+#endif
+
 #ifndef NO_GVL_INSTRUMENTATION
 typedef enum {
   ON_GVL_RUNNING_UNKNOWN, // Thread is not known, it may not even be from the current Ractor

ext/datadog_profiling_native_extension/extconf.rb

Lines changed: 5 additions & 0 deletions
@@ -144,6 +144,11 @@ def skip_building_extension!(reason)
 # On Ruby 4, we can't ask the object_id from IMEMOs (https://github.com/ruby/ruby/pull/13347)
 $defs << "-DNO_IMEMO_OBJECT_ID" unless RUBY_VERSION < "4"

+# On Ruby 4, we need to defer calling rb_obj_id during heap allocation recording
+# because it's not safe to mutate objects during the newobj tracepoint
+# (see https://bugs.ruby-lang.org/issues/21710)
+$defs << "-DDEFERRED_HEAP_ALLOCATION_RECORDING" unless RUBY_VERSION < "4"
+
 # This symbol is exclusively visible on certain Ruby versions: 2.6 to 3.2, as well as 3.4 (but not 4.0+)
 # It's only used to get extra information about an object when a failure happens, so it's a "very nice to have" but not
 # actually required for correct behavior of the profiler.

ext/datadog_profiling_native_extension/heap_recorder.c

Lines changed: 146 additions & 0 deletions
@@ -79,6 +79,18 @@ static void object_record_free(heap_recorder*, object_record*);
 static VALUE object_record_inspect(heap_recorder*, object_record*);
 static object_record SKIPPED_RECORD = {0};

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  // A pending recording is used to defer the object_id call on Ruby 4+
+  // where calling rb_obj_id during on_newobj_event is unsafe.
+  typedef struct {
+    VALUE object_ref;
+    heap_record *heap_record;
+    live_object_data object_data;
+  } pending_recording;
+
+  #define MAX_PENDING_RECORDINGS 64
+#endif
+
 struct heap_recorder {
   // Config
   // Whether the recorder should try to determine approximate sizes for tracked objects.
@@ -130,6 +142,17 @@ struct heap_recorder {
   // Data for a heap recording that was started but not yet ended
   object_record *active_recording;

+  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+    // Pending recordings that need to be finalized after on_newobj_event completes.
+    // On Ruby 4+, we can't call rb_obj_id during the newobj event, so we store the
+    // VALUE reference here and finalize it via a postponed job.
+    pending_recording pending_recordings[MAX_PENDING_RECORDINGS];
+    uint pending_recordings_count;
+    // Temporary storage for the recording in progress, used between start and end
+    VALUE active_deferred_object;
+    live_object_data active_deferred_object_data;
+  #endif
+
   // Reusable arrays, implementing a flyweight pattern for things like iteration
   #define REUSABLE_LOCATIONS_SIZE MAX_FRAMES_LIMIT
   ddog_prof_Location *reusable_locations;
@@ -294,6 +317,12 @@ void heap_recorder_after_fork(heap_recorder *heap_recorder) {
   heap_recorder->stats_lifetime = (struct stats_lifetime) {0};
 }

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  // Sentinel marker for active_recording indicating we're using deferred recording.
+  // Only used for pointer comparison, no data is read from or written to this.
+  static object_record DEFERRED_RECORD = {.obj_id = -1};
+#endif
+
 void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj, unsigned int weight, ddog_CharSlice alloc_class) {
   if (heap_recorder == NULL) {
     return;
@@ -314,13 +343,29 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj
     // directly OR because the GVL got released in the middle of an update), let's skip this sample as well.
     // See notes on `heap_recorder_update` for details.
     || heap_recorder->updating
+    #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+      // Skip if we've hit the pending recordings limit or if there's already a deferred object being recorded
+      || heap_recorder->pending_recordings_count >= MAX_PENDING_RECORDINGS
+      || heap_recorder->active_deferred_object != Qnil
+    #endif
   ) {
     heap_recorder->active_recording = &SKIPPED_RECORD;
     return;
   }

   heap_recorder->num_recordings_skipped = 0;

+  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+    // On Ruby 4+, we can't call rb_obj_id during on_newobj_event as it mutates the object.
+    // Instead, we store the VALUE reference and will get the object_id later via a postponed job.
+    heap_recorder->active_deferred_object = new_obj;
+    heap_recorder->active_deferred_object_data = (live_object_data) {
+      .weight = weight * heap_recorder->sample_rate,
+      .class = intern_or_raise(heap_recorder->string_storage, alloc_class),
+      .alloc_gen = rb_gc_count(),
+    };
+    heap_recorder->active_recording = &DEFERRED_RECORD;
+  #else
   VALUE ruby_obj_id = rb_obj_id(new_obj);
   if (!FIXNUM_P(ruby_obj_id)) {
     rb_raise(rb_eRuntimeError, "Detected a bignum object id. These are not supported by heap profiling.");
@@ -335,6 +380,7 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj
       .alloc_gen = rb_gc_count(),
     }
   );
+  #endif
 }

 // end_heap_allocation_recording_with_rb_protect gets called while the stack_recorder is holding one of the profile
@@ -383,6 +429,23 @@ static VALUE end_heap_allocation_recording(VALUE protect_args) {
     return Qnil;
   }

+  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+    if (active_recording == &DEFERRED_RECORD) {
+      // On Ruby 4+, we defer the object_id call. Store the recording in the pending list.
+      if (heap_recorder->pending_recordings_count < MAX_PENDING_RECORDINGS) {
+        heap_record *heap_record = get_or_create_heap_record(heap_recorder, locations);
+        heap_record->num_tracked_objects++; // Pre-increment since we're going to commit this later
+
+        pending_recording *pending = &heap_recorder->pending_recordings[heap_recorder->pending_recordings_count++];
+        pending->object_ref = heap_recorder->active_deferred_object;
+        pending->heap_record = heap_record;
+        pending->object_data = heap_recorder->active_deferred_object_data;
+      }
+      heap_recorder->active_deferred_object = Qnil;
+      return Qnil;
+    }
+  #endif
+
   heap_record *heap_record = get_or_create_heap_record(heap_recorder, locations);

   // And then commit the new allocation.
@@ -399,6 +462,89 @@ void heap_recorder_update_young_objects(heap_recorder *heap_recorder) {
   heap_recorder_update(heap_recorder, /* full_update: */ false);
 }

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  bool heap_recorder_has_pending_recordings(heap_recorder *heap_recorder) {
+    if (heap_recorder == NULL) {
+      return false;
+    }
+    return heap_recorder->pending_recordings_count > 0;
+  }
+
+  bool heap_recorder_finalize_pending_recordings(heap_recorder *heap_recorder) {
+    if (heap_recorder == NULL) {
+      return false;
+    }
+
+    uint count = heap_recorder->pending_recordings_count;
+    if (count == 0) {
+      return false;
+    }
+
+    for (uint i = 0; i < count; i++) {
+      pending_recording *pending = &heap_recorder->pending_recordings[i];
+
+      VALUE obj = pending->object_ref;
+
+      // Check if the object is still valid (it might have been GC'd between
+      // the allocation and this finalization)
+      if (obj == Qnil || !RTEST(obj)) {
+        // Object is gone, decrement the pre-incremented count and cleanup
+        pending->heap_record->num_tracked_objects--;
+        cleanup_heap_record_if_unused(heap_recorder, pending->heap_record);
+        unintern_or_raise(heap_recorder, pending->object_data.class);
+        continue;
+      }
+
+      // Now it's safe to get the object_id
+      VALUE ruby_obj_id = rb_obj_id(obj);
+      if (!FIXNUM_P(ruby_obj_id)) {
+        // Bignum object id - not supported, skip this recording
+        pending->heap_record->num_tracked_objects--;
+        cleanup_heap_record_if_unused(heap_recorder, pending->heap_record);
+        unintern_or_raise(heap_recorder, pending->object_data.class);
+        continue;
+      }
+
+      long obj_id = FIX2LONG(ruby_obj_id);
+
+      // Create the object record now that we have the object_id
+      object_record *record = object_record_new(obj_id, pending->heap_record, pending->object_data);
+
+      // Commit the recording
+      int existing_error = st_update(heap_recorder->object_records, record->obj_id, update_object_record_entry, (st_data_t) record);
+      if (existing_error) {
+        // Duplicate object_id - this shouldn't happen normally, but handle it gracefully
+        pending->heap_record->num_tracked_objects--;
+        cleanup_heap_record_if_unused(heap_recorder, pending->heap_record);
+        object_record_free(heap_recorder, record);
+      }
+    }
+
+    heap_recorder->pending_recordings_count = 0;
+    return true;
+  }
+
+  // Mark pending recordings to prevent GC from collecting the objects
+  // while they're waiting to be finalized
+  void heap_recorder_mark_pending_recordings(heap_recorder *heap_recorder) {
+    if (heap_recorder == NULL) {
+      return;
+    }
+
+    for (uint i = 0; i < heap_recorder->pending_recordings_count; i++) {
+      VALUE obj = heap_recorder->pending_recordings[i].object_ref;
+      if (obj != Qnil) {
+        rb_gc_mark(obj);
+      }
+    }
+
+    // Also mark the active deferred object if it's set
+    if (heap_recorder->active_deferred_object != Qnil) {
+      rb_gc_mark(heap_recorder->active_deferred_object);
+    }
+  }
+#endif
+
 // NOTE: This function needs and assumes it gets called with the GVL being held.
 // But importantly **some of the operations inside `st_object_record_update` may cause a thread switch**,
 // so we can't assume a single update happens in a single "atomic" step -- other threads may get some running time
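None of the hunks in this excerpt show a caller for heap_recorder_mark_pending_recordings, but one must exist: the buffered VALUEs are invisible to the GC, so they have to be marked somewhere between allocation and finalization, presumably from the TypedData mark callback of whichever object owns the heap_recorder (in one of the changed files not shown here). A hedged sketch of that wiring, with a hypothetical recorder_state standing in for the real owner struct:

#include <ruby.h>
#include "heap_recorder.h"

// Hypothetical owner; the real struct lives outside this excerpt.
typedef struct {
  heap_recorder *heap_recorder;
} recorder_state;

static void recorder_typed_data_mark(void *state_ptr) {
  recorder_state *state = (recorder_state *) state_ptr;
  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
    // Without this, a GC run between on_newobj_event and the postponed job could
    // recycle a buffered object, leaving a dangling VALUE in pending_recordings.
    heap_recorder_mark_pending_recordings(state->heap_recorder);
  #endif
}

static const rb_data_type_t recorder_typed_data = {
  .wrap_struct_name = "recorder_state_sketch",
  .function = { .dmark = recorder_typed_data_mark },
};

Note that with marking in place the pending objects can't actually be collected while buffered, so the Qnil/RTEST liveness check in heap_recorder_finalize_pending_recordings reads as defensive rather than load-bearing.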

ext/datadog_profiling_native_extension/heap_recorder.h

Lines changed: 14 additions & 0 deletions
@@ -123,6 +123,20 @@ int end_heap_allocation_recording_with_rb_protect(heap_recorder *heap_recorder,
 // these objects quicker) and hopefully reduces tail latency (because there's less objects at serialization time to check).
 void heap_recorder_update_young_objects(heap_recorder *heap_recorder);

+#ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
+  // Finalize any pending heap allocation recordings by getting their object IDs.
+  // This should be called via a postponed job, after the on_newobj_event has completed.
+  // Returns true if there were pending recordings that were finalized.
+  bool heap_recorder_finalize_pending_recordings(heap_recorder *heap_recorder);
+
+  // Check if there are any pending recordings waiting to be finalized.
+  bool heap_recorder_has_pending_recordings(heap_recorder *heap_recorder);
+
+  // Mark pending recordings to prevent GC from collecting the objects
+  // while they're waiting to be finalized.
+  void heap_recorder_mark_pending_recordings(heap_recorder *heap_recorder);
+#endif
+
 // Update the heap recorder to reflect the latest state of the VM and prepare internal structures
 // for efficient iteration.
 //
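Of the three functions declared above, only heap_recorder_finalize_pending_recordings and heap_recorder_mark_pending_recordings appear in the hunks of this excerpt; heap_recorder_has_pending_recordings has no visible caller. A plausible (but unconfirmed) use is as a cheap guard at a point where eager finalization is allowed, e.g.:

#include <ruby.h>
#include "collectors_thread_context.h"
#include "heap_recorder.h"

// Hypothetical call site: drain pending recordings eagerly at a known-safe point
// instead of waiting for the postponed job.
static void maybe_finalize_pending(VALUE thread_context_collector_instance) {
  #ifdef DEFERRED_HEAP_ALLOCATION_RECORDING
    heap_recorder *recorder = thread_context_collector_get_heap_recorder(thread_context_collector_instance);
    if (recorder != NULL && heap_recorder_has_pending_recordings(recorder)) {
      heap_recorder_finalize_pending_recordings(recorder);
    }
  #endif
}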
