From d4b9c6f28f55b4141dbbc94ebbe5c87c18365b43 Mon Sep 17 00:00:00 2001 From: sternj Date: Mon, 2 Dec 2024 12:57:34 -0500 Subject: [PATCH] Added some more coments --- src/include/sampleheap.hpp | 41 +++++++++++++++++++++++++++++++++++++- src/source/pywhere.cpp | 27 ++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/include/sampleheap.hpp b/src/include/sampleheap.hpp index aa29be7c7..495f4a0d1 100644 --- a/src/include/sampleheap.hpp +++ b/src/include/sampleheap.hpp @@ -85,10 +85,26 @@ class SampleHeap : public SuperHeap { return nullptr; } if (pythonDetected() && !g.wasInMalloc()) { + // All standard Python allocations pass through + // MakeLocalAllocator in libscalene.cpp . + // We don't want to double-count these allocations-- + // if we're in here and g.wasInMalloc is true, then + // we passed through MakeLocalAllocator and + // this allocation has already been counted + // + // We only want to count allocations here + // if `malloc` itself was called by client code. auto realSize = SuperHeap::getSize(ptr); if (realSize > 0) { if (sz == NEWLINE + sizeof(ScaleneHeader)) { - // Don't count these allocations + // FIXME + // + // If we ourselves are allocating a NEWLINE, we're doing + // it through the Python allocator, so if it's an actual newline + // I don't think we should ever get here. I think our original intention + // is that we shouldn't count NEWLINE records that we already counted, + // but I think if we get to this line here, we didn't actually create a NEWLINE + // and should count it. return ptr; } register_malloc(realSize, ptr, false); // false -> invoked from C/C++ @@ -150,6 +166,29 @@ class SampleHeap : public SuperHeap { // If this is the special NEWLINE value, trigger an update. if (unlikely(realSize == NEWLINE)) { std::string filename; + // Originally, we had the following check around this line: + // + // ``` + // if (where != nullptr && where(filename, lineno, bytei)) + // ``` + // + // This was to prevent a NEWLINE record from being accidentally triggered by + // non-Scalene code. + // + // However, by definition, we trigger a NEWLINE _after_ the line has + // been executed, specifically on a `PyTrace_Line` event. + // + // If the absolute last line of a program makes an allocation, + // the next PyTrace_Line will occur inside `scalene_profiler.py` and not any client + // code, since the line after the last line of the program is when Scalene starts its + // teardown. + // + // In this case. the `whereInPython` function will return 0, since whereInPython checks + // if the current frame is in client code and the Scalene profiler teardown code is by definition + // not. + // + // This risks letting allocations of length NEWLINE_TRIGGER_LENGTH that are not true NEWLINEs + // create a NEWLINE record, but we view this as incredibly improbable. writeCount(MallocSignal, realSize, ptr, filename, -1, -1); mallocTriggered()++; return; diff --git a/src/source/pywhere.cpp b/src/source/pywhere.cpp index c894f1f57..02f54e7fa 100644 --- a/src/source/pywhere.cpp +++ b/src/source/pywhere.cpp @@ -269,11 +269,36 @@ static void allocate_newline() { static int trace_func(PyObject* obj, PyFrameObject* frame, int what, PyObject* arg) { if (what == PyTrace_CALL || what == PyTrace_C_CALL) { - PyThreadState* tstate = PyThreadState_Get(); + // Prior to this check, trace_func was called + // in every child frame. When we figured out the frame + // was a child of the current line, only then did we disable tracing in that frame. + // This was causing a major slowdown when importing pytorch-- from what we can tell, + // the import itself called many functions and the call overhead of the entire tracing harness + // was incurred for each call at least once. + // + // + // What we're trying to do here, though, is see if we have moved on to another line of the client program. + // Therefore, we can disable tracing for the moment, since one of three things has happened: + // + // 1. We have called a library function. We therefore know that there will be absolutely no important events coming from this + // frame, since the program can't progress to the next line before until the call has ended + // + // 2. We have called a client function. We know that the line we were on hasn't ended yet, since we would get a PyTrace_Line + // event if that did happen. This leaves us with one of two cases: + // + // 2.1: The function makes no allocations. Therefore, not tracing Line events in its frame is valid and the next Line + // we get is in the parent frame, the one that we care about + // 2.2: the function does make an allocation. In that case, we separately enable settrace at that allocation, + // so we still track it + // + // + // FIXME: if, in a single line, we see a pattern in a single line like allocation -> client call w/ allocation, we won't actually increment + // the n_mallocs counter for the line we started with frame->f_trace_lines = 0; frame->f_trace = NULL; #if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030c0000 // This pre-3.12 optimization only exists post 3.9 + PyThreadState* tstate = PyThreadState_Get(); tstate->cframe->use_tracing = 0; #endif