refactor(profiling): add debug check that the GIL is held to memalloc (#11860)

nsrip-dd · web-flow · commit 5f1aefcf19a5 · 2025-01-07T09:26:51.000-08:00
The allocation profiler functions should (in theory) only be called when
allocating Python objects, which should (in theory) only be done with
the GIL held. We have reason to believe that this code is sometimes
reached without the GIL held, since some crashes in the code seem to go
away with our own locking. Add a debug flag,
`_DD_PROFILING_MEMALLOC_CRASH_ON_NO_GIL`, that when set will make the
profiler crash if it detects the GIL is not held in places where we
think it ought to be.
diff --git a/ddtrace/profiling/collector/_memalloc.c b/ddtrace/profiling/collector/_memalloc.c
@@ -5,6 +5,7 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
+#include "_memalloc_debug.h"
 #include "_memalloc_heap.h"
 #include "_memalloc_reentrant.h"
 #include "_memalloc_tb.h"
@@ -48,7 +49,13 @@ static PyObject* object_string = NULL;
 // We add an option here to _add_ a crash, in order to observe this condition in a future diagnostic iteration.
 // **This option is _intended_ to crash the Python process** do not use without a good reason!
 static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS";
-static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+// The allocation profiler functions should (in theory) only be called when allocating Python
+// objects, which should (in theory) only be done with the GIL held. We have reason to believe
+// that this code is sometimes reached without the GIL held, since some crashes in the code
+// seem to go away with our own locking. This debug flag will make the profiler crash if
+// it detects the GIL is not held in places where we think it ought to be.
+static char g_crash_on_no_gil_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_NO_GIL";
+static bool g_crash_on_no_gil = false;
 static memlock_t g_memalloc_lock;
 
 static alloc_tracker_t* global_alloc_tracker;
@@ -92,25 +99,24 @@ static void
 memalloc_init()
 {
     // Check if we should crash the process on mutex pass
-    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
-    bool crash_on_mutex_pass = false;
-    if (crash_on_mutex_pass_str) {
-        for (int i = 0; g_truthy_values[i]; i++) {
-            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
-                crash_on_mutex_pass = true;
-                break;
-            }
-        }
-    }
+    bool crash_on_mutex_pass = memalloc_get_bool_env(g_crash_on_mutex_pass_str);
     memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
 #ifndef _WIN32
     pthread_atfork(memalloc_prefork, memalloc_postfork_parent, memalloc_postfork_child);
 #endif
+
+    g_crash_on_no_gil = memalloc_get_bool_env(g_crash_on_no_gil_str);
 }
 
 static void
 memalloc_add_event(memalloc_context_t* ctx, void* ptr, size_t size)
 {
+    if (g_crash_on_no_gil && !PyGILState_Check()) {
+        int* p = NULL;
+        *p = 0;
+        abort(); // should never reach here
+    }
+
     uint64_t alloc_count = atomic_add_clamped(&global_alloc_tracker->alloc_count, 1, ALLOC_TRACKER_MAX_COUNT);
 
     /* Return if we've reached the maximum number of allocations */
diff --git a/ddtrace/profiling/collector/_memalloc_debug.h b/ddtrace/profiling/collector/_memalloc_debug.h
@@ -0,0 +1,25 @@
+#ifndef _DDTRACE_MEMALLOC_DEBUG_H
+#define _DDTRACE_MEMALLOC_DEBUG_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+
+static bool // true only when the environment variable `key` is set to one of g_truthy_values
+memalloc_get_bool_env(char* key) // key: name of the environment variable to look up via getenv
+{
+    char* val = getenv(key);
+    if (!val) { // variable is not set: report false
+        return false;
+    }
+    for (int i = 0; g_truthy_values[i]; i++) { // walk the table until its NULL sentinel
+        if (strcmp(val, g_truthy_values[i]) == 0) { // exact, case-sensitive comparison
+            return true;
+        }
+    }
+    return false; // set, but not to any spelling listed in g_truthy_values
+}
+
+#endif
diff --git a/ddtrace/profiling/collector/_memalloc_heap.c b/ddtrace/profiling/collector/_memalloc_heap.c
@@ -2,6 +2,7 @@
 #include <stdlib.h>
 
 #define PY_SSIZE_T_CLEAN
+#include "_memalloc_debug.h"
 #include "_memalloc_heap.h"
 #include "_memalloc_reentrant.h"
 #include "_memalloc_tb.h"
@@ -27,7 +28,6 @@ typedef struct
 } heap_tracker_t;
 
 static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMHEAP_CRASH_ON_MUTEX_PASS";
-static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
 static memlock_t g_memheap_lock;
 
 static heap_tracker_t global_heap_tracker;
@@ -68,16 +68,7 @@ static void
 memheap_init()
 {
     // Check if we should crash the process on mutex pass
-    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
-    bool crash_on_mutex_pass = false;
-    if (crash_on_mutex_pass_str) {
-        for (int i = 0; g_truthy_values[i]; i++) {
-            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
-                crash_on_mutex_pass = true;
-                break;
-            }
-        }
-    }
+    bool crash_on_mutex_pass = memalloc_get_bool_env(g_crash_on_mutex_pass_str);
     memlock_init(&g_memheap_lock, crash_on_mutex_pass);
 #ifndef _WIN32
     pthread_atfork(memheap_prefork, memheap_postfork_parent, memheap_postfork_child);