From d4b9c6f28f55b4141dbbc94ebbe5c87c18365b43 Mon Sep 17 00:00:00 2001
From: sternj <sternj@brandeis.edu>
Date: Mon, 2 Dec 2024 12:57:34 -0500
Subject: [PATCH] Added some more coments

---
 src/include/sampleheap.hpp | 41 +++++++++++++++++++++++++++++++++++++-
 src/source/pywhere.cpp     | 27 ++++++++++++++++++++++++-
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/include/sampleheap.hpp b/src/include/sampleheap.hpp
index aa29be7c7..495f4a0d1 100644
--- a/src/include/sampleheap.hpp
+++ b/src/include/sampleheap.hpp
@@ -85,10 +85,26 @@ class SampleHeap : public SuperHeap {
       return nullptr;
     }
     if (pythonDetected() && !g.wasInMalloc()) {
+      // All standard Python allocations pass through
+      // MakeLocalAllocator in libscalene.cpp . 
+      // We don't want to double-count these allocations--
+      // if we're in here and g.wasInMalloc is true, then 
+      // we passed through MakeLocalAllocator and
+      // this allocation has already been counted
+      //
+      // We only want to count allocations here
+      // if `malloc` itself was called by client code.
       auto realSize = SuperHeap::getSize(ptr);
       if (realSize > 0) {
         if (sz == NEWLINE + sizeof(ScaleneHeader)) {
-          // Don't count these allocations
+          // FIXME
+          //
+          // If we ourselves are allocating a NEWLINE, we're doing
+          // it through the Python allocator, so if it's an actual newline
+          // I don't think we should ever get here. I think our original intention
+          // is that we shouldn't count NEWLINE records that we already counted,
+          // but I think if we get to this line here, we didn't actually create a NEWLINE
+          // and should count it. 
           return ptr;
         }
         register_malloc(realSize, ptr, false);  // false -> invoked from C/C++
@@ -150,6 +166,29 @@ class SampleHeap : public SuperHeap {
     // If this is the special NEWLINE value, trigger an update.
     if (unlikely(realSize == NEWLINE)) {
       std::string filename;
+      // Originally, we had the following check around this line:
+      //
+      // ```
+      // if (where != nullptr && where(filename, lineno, bytei))
+      // ```
+      // 
+      // This was to prevent a NEWLINE record from being accidentally triggered by
+      // non-Scalene code.
+      //
+      // However, by definition, we trigger a NEWLINE _after_ the line has
+      // been executed, specifically on a `PyTrace_Line` event.
+      //
+      // If the absolute last line of a program makes an allocation, 
+      // the next PyTrace_Line will occur inside `scalene_profiler.py` and not any client
+      // code, since the line after the last line of the program is when Scalene starts its
+      // teardown. 
+      // 
+      // In this case.  the `whereInPython` function will return 0, since whereInPython checks
+      // if the current frame is in client code and the Scalene profiler teardown code is by definition 
+      // not. 
+      //
+      // This risks letting allocations of length NEWLINE_TRIGGER_LENGTH that are not true NEWLINEs
+      // create a NEWLINE record, but we view this as incredibly improbable. 
       writeCount(MallocSignal, realSize, ptr, filename, -1, -1);
       mallocTriggered()++;
       return;
diff --git a/src/source/pywhere.cpp b/src/source/pywhere.cpp
index c894f1f57..02f54e7fa 100644
--- a/src/source/pywhere.cpp
+++ b/src/source/pywhere.cpp
@@ -269,11 +269,36 @@ static void allocate_newline() {
 static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
                       PyObject* arg) {
   if (what == PyTrace_CALL || what == PyTrace_C_CALL) {
-    PyThreadState* tstate = PyThreadState_Get();
+    // Prior to this check, trace_func was called
+    // in every child frame. When we figured out the frame
+    // was a child of the current line, only then did we disable tracing in that frame. 
+    // This was causing a major slowdown when importing pytorch-- from what we can tell,
+    // the import itself called many functions and the call overhead of the entire tracing harness
+    // was incurred for each call at least once.
+    // 
+    //
+    // What we're trying to do here, though, is see if we have moved on to another line of the client program. 
+    // Therefore, we can disable tracing for the moment, since one of three things has happened:
+    //
+    // 1. We have called a library function. We therefore know that there will be absolutely no important events coming from this
+    //    frame, since the program can't progress to the next line before until the call has ended
+    //
+    // 2. We have called a client function. We know that the line we were on hasn't ended yet, since we would get a PyTrace_Line
+    //    event if that did happen. This leaves us with one of two cases:
+    //    
+    //    2.1: The function makes no allocations. Therefore, not tracing Line events in its frame is valid and the next Line
+    //         we get is in the parent frame, the one that we care about
+    //    2.2: the function does make an allocation. In that case, we separately enable settrace at that allocation,
+    //         so we still track it
+    //
+    //
+    // FIXME: if, in a single line, we see a pattern in a single line like allocation -> client call w/ allocation, we won't actually increment
+    //        the n_mallocs counter for the line we started with
     frame->f_trace_lines = 0;
     frame->f_trace = NULL;
     #if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030c0000 
     // This pre-3.12 optimization only exists post 3.9
+    PyThreadState* tstate = PyThreadState_Get();
     tstate->cframe->use_tracing = 0;
     #endif