diff --git a/.github/workflows/test-smoketests.yml b/.github/workflows/test-smoketests.yml
index 3c4813042..8088ce0a1 100644
--- a/.github/workflows/test-smoketests.yml
+++ b/.github/workflows/test-smoketests.yml
@@ -51,6 +51,11 @@ jobs:
       - name: decorator smoke test
         run: python test/smoketest_profile_decorator.py
 
+      # FIXME: these tests are broken under the current GitHub runner
+      #
+      # - name: line invalidation test
+      #   run: python test/smoketest_line_invalidation.py
+
       # Note: This test doesn't need to read an output,
       # it is meant to determine if there is an ImportError
       # or anything related if relative imports are used.
diff --git a/scalene/scalene_profiler.py b/scalene/scalene_profiler.py
index 3e56c1929..29b6586e8 100644
--- a/scalene/scalene_profiler.py
+++ b/scalene/scalene_profiler.py
@@ -546,7 +546,7 @@ def malloc_signal_handler(
             ByteCodeIndex(f.f_lasti),
         ]
         Scalene.__alloc_sigq.put([0])
-        pywhere.enable_settrace()
+        pywhere.enable_settrace(this_frame)
         del this_frame
 
     @staticmethod
@@ -1289,6 +1289,9 @@ def alloc_sigqueue_processor(x: Optional[List[int]]) -> None:
                 bytei,
             ) = item
             is_malloc = action == Scalene.MALLOC_ACTION
+            if count == scalene.scalene_config.NEWLINE_TRIGGER_LENGTH + 1:
+                continue  # In previous implementations, we were adding NEWLINE to the footprint.
+                # We should not account for this in the user-facing profile.
             count /= Scalene.BYTES_PER_MB
             if is_malloc:
                 stats.current_footprint += count
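The guard added to `alloc_sigqueue_processor` is easiest to read as a filter on allocation records before they are converted to MB. Below is a minimal sketch of that accounting logic, assuming a placeholder `NEWLINE_TRIGGER_LENGTH` (the real constant lives in `scalene/scalene_config.py`); it is an illustration, not Scalene's actual code.

```python
# Minimal sketch (not Scalene's real code) of the footprint-accounting guard.
NEWLINE_TRIGGER_LENGTH = 98_765   # placeholder, not the real sentinel size
BYTES_PER_MB = 1024 * 1024

def accumulate_footprint(records):
    """records: iterable of (is_malloc, count_in_bytes) pairs."""
    footprint = 0.0
    for is_malloc, count in records:
        if count == NEWLINE_TRIGGER_LENGTH + 1:
            # Sentinel records only mark "a new line executed"; they must
            # not show up in the user-facing footprint.
            continue
        mb = count / BYTES_PER_MB
        footprint += mb if is_malloc else -mb
    return footprint

# e.g. one real 2 MB malloc plus one sentinel record:
assert accumulate_footprint(
    [(True, 2 * 1024 * 1024), (True, NEWLINE_TRIGGER_LENGTH + 1)]
) == 2.0
```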
diff --git a/src/include/sampleheap.hpp b/src/include/sampleheap.hpp
index 662526c26..495f4a0d1 100644
--- a/src/include/sampleheap.hpp
+++ b/src/include/sampleheap.hpp
@@ -85,10 +85,26 @@ class SampleHeap : public SuperHeap {
       return nullptr;
     }
     if (pythonDetected() && !g.wasInMalloc()) {
+      // All standard Python allocations pass through
+      // MakeLocalAllocator in libscalene.cpp.
+      // We don't want to double-count those allocations:
+      // if g.wasInMalloc() is true, then we passed through
+      // MakeLocalAllocator and the allocation has
+      // already been counted.
+      //
+      // We only want to count allocations here
+      // if `malloc` itself was called by client code.
       auto realSize = SuperHeap::getSize(ptr);
       if (realSize > 0) {
         if (sz == NEWLINE + sizeof(ScaleneHeader)) {
-          // Don't count these allocations
+          // FIXME
+          //
+          // If we ourselves are allocating a NEWLINE, we do so
+          // through the Python allocator, so an actual NEWLINE
+          // should never reach this point. The original intention
+          // was to avoid counting NEWLINE records that had already been counted,
+          // but if we do get here, this allocation is most likely not a real
+          // NEWLINE and should probably be counted.
          return ptr;
        }
        register_malloc(realSize, ptr, false);  // false -> invoked from C/C++
@@ -150,12 +166,30 @@ class SampleHeap : public SuperHeap {
     // If this is the special NEWLINE value, trigger an update.
     if (unlikely(realSize == NEWLINE)) {
       std::string filename;
-      int lineno;
-      int bytei;
-      decltype(whereInPython)* where = p_whereInPython;
-      if (where != nullptr && where(filename, lineno, bytei)) {
-        writeCount(MallocSignal, realSize, ptr, filename, lineno, bytei);
-      }
+      // Originally, we had the following check around this line:
+      //
+      // ```
+      // if (where != nullptr && where(filename, lineno, bytei))
+      // ```
+      //
+      // This was to prevent a NEWLINE record from being accidentally triggered by
+      // non-Scalene code.
+      //
+      // However, by definition, we trigger a NEWLINE _after_ the line has
+      // been executed, specifically on a `PyTrace_Line` event.
+      //
+      // If the absolute last line of a program makes an allocation,
+      // the next PyTrace_Line will occur inside `scalene_profiler.py` and not in any client
+      // code, since the line after the last line of the program is when Scalene starts its
+      // teardown.
+      //
+      // In this case, the `whereInPython` function will return 0, since whereInPython checks
+      // whether the current frame is in client code, and the Scalene profiler teardown code
+      // is by definition not.
+      //
+      // This risks letting allocations of length NEWLINE_TRIGGER_LENGTH that are not true NEWLINEs
+      // create a NEWLINE record, but we view this as incredibly improbable.
+      writeCount(MallocSignal, realSize, ptr, filename, -1, -1);
       mallocTriggered()++;
       return;
     }
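For readers unfamiliar with the mechanism these comments refer to: Scalene signals "a source line finished executing" by making an allocation of a distinctive size, which the sampling heap recognizes and turns into a NEWLINE record. The toy model below, with made-up names and sizes, sketches why removing the `whereInPython` guard lets the final line of a program still produce its NEWLINE record.

```python
# Toy model (assumed names and values, not Scalene's real API) of the NEWLINE
# sentinel: an allocation whose size equals the trigger length is treated as
# a "line finished" signal rather than as ordinary client data.
NEWLINE = 123_457                 # stand-in for the real trigger length
records = []

def on_malloc(size, filename, lineno, in_client_code):
    if size == NEWLINE:
        # With this change the record is written unconditionally, with
        # lineno/bytei set to -1, even if the trigger fires during profiler
        # teardown when we are no longer in client code.
        records.append(("<newline>", -1))
    elif in_client_code:
        records.append((filename, lineno))

# The last line of a program allocates; the next trace event happens inside
# the profiler itself, yet the NEWLINE record is still emitted:
on_malloc(NEWLINE, "scalene_profiler.py", 999, in_client_code=False)
assert records == [("<newline>", -1)]
```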
diff --git a/src/source/pywhere.cpp b/src/source/pywhere.cpp
index d501d6ae8..02f54e7fa 100644
--- a/src/source/pywhere.cpp
+++ b/src/source/pywhere.cpp
@@ -268,6 +268,41 @@ static void allocate_newline() {
 
 static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
                       PyObject* arg) {
+  if (what == PyTrace_CALL || what == PyTrace_C_CALL) {
+    // Prior to this check, trace_func was called
+    // in every child frame. When we figured out the frame
+    // was a child of the current line, only then did we disable tracing in that frame.
+    // This was causing a major slowdown when importing pytorch: from what we can tell,
+    // the import itself called many functions and the call overhead of the entire tracing harness
+    // was incurred for each call at least once.
+    //
+    //
+    // What we're trying to do here, though, is see if we have moved on to another line of the client program.
+    // Therefore, we can disable tracing for the moment, since one of three things has happened:
+    //
+    // 1. We have called a library function. We therefore know that there will be absolutely no important events coming from this
+    //    frame, since the program can't progress to the next line until the call has ended.
+    //
+    // 2. We have called a client function. We know that the line we were on hasn't ended yet, since we would get a PyTrace_Line
+    //    event if that did happen. This leaves us with one of two cases:
+    //
+    //    2.1: The function makes no allocations. Therefore, not tracing Line events in its frame is valid, and the next Line
+    //         we get is in the parent frame, the one that we care about.
+    //    2.2: The function does make an allocation. In that case, we separately enable settrace at that allocation,
+    //         so we still track it.
+    //
+    //
+    // FIXME: if, on a single line, we see a pattern like allocation -> client call w/ allocation, we won't actually increment
+    //        the n_mallocs counter for the line we started with.
+    frame->f_trace_lines = 0;
+    frame->f_trace = NULL;
+#if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030c0000
+    // This optimization (cframe->use_tracing) only exists in Python 3.10 and 3.11.
+    PyThreadState* tstate = PyThreadState_Get();
+    tstate->cframe->use_tracing = 0;
+#endif
+
+  }
   if (what != PyTrace_LINE) {
     return 0;
   }
@@ -297,7 +332,6 @@ static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
   // Needed because decref will be called in on_stack
   Py_INCREF(frame);
   if (on_stack(last_fname_s, lineno_l, static_cast<PyFrameObject*>(frame))) {
-    frame->f_trace_lines = 0;
     return 0;
   }
 
@@ -370,7 +404,13 @@ static PyObject* depopulate_struct(PyObject* self, PyObject* args) {
 }
 
 static PyObject* enable_settrace(PyObject* self, PyObject* args) {
+  PyObject* frame;
+  if (!PyArg_ParseTuple(args, "O", &frame)) {
+    return NULL;
+  }
+  PyFrameObject* frame_obj = (PyFrameObject*)frame;
   PyEval_SetTrace(trace_func, NULL);
+  frame_obj->f_trace_lines = 1;
   Py_RETURN_NONE;
 }
 
@@ -389,7 +429,7 @@ static PyMethodDef EmbMethods[] = {
    {"print_files_to_profile", print_files_to_profile, METH_NOARGS,
     "printing for debug"},
    //    {"return_buffer", return_buffer, METH_NOARGS, ""},
-    {"enable_settrace", enable_settrace, METH_NOARGS, ""},
+    {"enable_settrace", enable_settrace, METH_VARARGS, ""},
    {"disable_settrace", disable_settrace, METH_NOARGS, ""},
    {"populate_struct", populate_struct, METH_NOARGS, ""},
    {"depopulate_struct", depopulate_struct, METH_NOARGS, ""},
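The tracing changes above have a rough pure-Python analogue using `sys.settrace`, which may make the intent easier to follow. This is a sketch of the strategy only; the real implementation uses `PyEval_SetTrace` and C-level frame fields, and the function names here merely mirror the C API.

```python
import sys

def _trace(frame, event, arg):
    if event == "call":
        # Analogue of the PyTrace_CALL branch: stop tracing lines in child
        # frames; a sampled allocation re-enables tracing separately.
        frame.f_trace_lines = False
        return None
    if event == "line":
        print(f"line {frame.f_lineno} in {frame.f_code.co_filename}")
    return _trace

def enable_settrace(frame):
    # Analogue of the revised pywhere.enable_settrace(frame): install the
    # tracer and explicitly turn line events back on for the frame that
    # just made the sampled allocation.
    sys.settrace(_trace)
    frame.f_trace = _trace
    frame.f_trace_lines = True

def disable_settrace():
    sys.settrace(None)
```

An allocation handler would then call something like `enable_settrace(sys._getframe(depth))` for the frame that allocated, mirroring `pywhere.enable_settrace(this_frame)` in the profiler change above.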
diff --git a/test/line_attribution_tests/line_after_final_alloc.py b/test/line_attribution_tests/line_after_final_alloc.py
new file mode 100644
index 000000000..3e8b09c90
--- /dev/null
+++ b/test/line_attribution_tests/line_after_final_alloc.py
@@ -0,0 +1,13 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(10485767 * 2)
+
+
+    asdf = bytes(2 * 10485767)
+    some_dead_line = None
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/test/line_attribution_tests/loop_below_threshold.py b/test/line_attribution_tests/loop_below_threshold.py
new file mode 100644
index 000000000..0b0195f63
--- /dev/null
+++ b/test/line_attribution_tests/loop_below_threshold.py
@@ -0,0 +1,12 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(10485767 // 4)  # far below the allocation sampling window
+
+
+    asdf = bytes(2 * 10485767)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/test/line_attribution_tests/loop_with_multiple_lines.py b/test/line_attribution_tests/loop_with_multiple_lines.py
new file mode 100644
index 000000000..ea8da175b
--- /dev/null
+++ b/test/line_attribution_tests/loop_with_multiple_lines.py
@@ -0,0 +1,12 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767)  # 2x the allocation sampling window
+        bogus = None
+
+    asdf = bytes(2 * 10485767)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/test/line_attribution_tests/loop_with_one_alloc.py b/test/line_attribution_tests/loop_with_one_alloc.py
new file mode 100644
index 000000000..2d004ac42
--- /dev/null
+++ b/test/line_attribution_tests/loop_with_one_alloc.py
@@ -0,0 +1,12 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767)  # 2x the allocation sampling window
+
+
+    asdf = bytes(2 * 10485767)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/test/line_attribution_tests/loop_with_two_allocs.py b/test/line_attribution_tests/loop_with_two_allocs.py
new file mode 100644
index 000000000..1f8f3f39d
--- /dev/null
+++ b/test/line_attribution_tests/loop_with_two_allocs.py
@@ -0,0 +1,11 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767) + bytes(2 * 10485767)  # 2x the allocation sampling window
+
+
+    asdf = bytes(2 * 10485767)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
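These five programs are pinned by md5 in the smoketest that follows, so any edit to them (including whitespace) must be accompanied by regenerated hashes. A small helper for doing that, assuming it is run from the repository root:

```python
# Regenerate the expected_md5_sums entries used by
# test/smoketest_line_invalidation.py after editing the test programs above.
from hashlib import md5
from pathlib import Path

for p in sorted(Path("test/line_attribution_tests").glob("*.py")):
    print(f'    "{p.as_posix()}": "{md5(p.read_bytes()).hexdigest()}",')
```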
diff --git a/test/smoketest_line_invalidation.py b/test/smoketest_line_invalidation.py
new file mode 100644
index 000000000..80a3688ae
--- /dev/null
+++ b/test/smoketest_line_invalidation.py
@@ -0,0 +1,124 @@
+"""
+This is bound very closely to the current implementation of
+the tests in `test/line_attribution_tests`.
+
+The things that matter are the number of loops, the size
+of the allocations, and the exact line numbers.
+"""
+
+expected_md5_sums = {
+    "test/line_attribution_tests/line_after_final_alloc.py": "2d457078fae8c2a7b498cf7dd87324bc",
+    "test/line_attribution_tests/loop_below_threshold.py": "96f9e1c086ecdd522a6c565e0e751df6",
+    "test/line_attribution_tests/loop_with_multiple_lines.py": "ed6d9802f31ef9d7a8636e6d0d728013",
+    "test/line_attribution_tests/loop_with_one_alloc.py": "fe952422358a8070c8049fdab1c512af",
+    "test/line_attribution_tests/loop_with_two_allocs.py": "9ca21ee9f9a768c4d05f3c4ba23444e9",
+}
+
+import subprocess
+import tempfile
+import sys
+from pathlib import Path
+from hashlib import md5
+from scalene.scalene_json import ScaleneJSONSchema
+
+N_LOOPS = 31
+LOOP_ALLOC_LINENO = 5  # the in-loop allocation line in each test program
+OUT_OF_LOOP_ALLOC_LINENO = 8
+
+
+def check_for_changes():
+    errors = []
+    for fname, expected_sum in expected_md5_sums.items():
+        with open(fname, "rb") as f:
+            digest = md5(f.read()).hexdigest()
+        if digest != expected_sum:
+            errors.append(fname)
+    assert len(errors) == 0, f'Detected change in file(s) {",".join(errors)}'
+
+
+def get_line(scalene_profile: ScaleneJSONSchema, lineno: int):
+    files = list(scalene_profile.files.keys())
+    assert len(files) == 1
+    filename = files[0]
+    return scalene_profile.files[filename].lines[lineno - 1]
+
+
+def get_profile(test_stem, outdir_p, test_dir):
+    proc = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "scalene",
+            "--cli",
+            "--json",
+            "--outfile",
+            outdir_p / f"{test_stem}.json",
+            test_dir / f"{test_stem}.py",
+        ],
+        # capture_output=True,
+        check=True,
+    )
+    with open(outdir_p / f"{test_stem}.json", "r") as f:
+        profile = ScaleneJSONSchema.model_validate_json(f.read())
+    return (test_stem, profile)
+
+
+def main():
+    check_for_changes()
+    test_dir = Path(__file__).parent / "line_attribution_tests"
+    with tempfile.TemporaryDirectory() as outdir:
+        outdir_p = Path(outdir)
+        one_alloc = get_profile("loop_with_one_alloc", outdir_p, test_dir)
+        two_on_one_line = get_profile("loop_with_two_allocs", outdir_p, test_dir)
+        below_threshold = get_profile("loop_below_threshold", outdir_p, test_dir)
+        line_after_final_alloc = get_profile(
+            "line_after_final_alloc", outdir_p, test_dir
+        )
+        errors = []
+        for stem, profile in [one_alloc, two_on_one_line, line_after_final_alloc]:
+            line = get_line(profile, LOOP_ALLOC_LINENO)
+            if not line.n_mallocs == N_LOOPS:
+                errors.append(
+                    f"Expected {N_LOOPS} mallocs on line {LOOP_ALLOC_LINENO} of {stem}, got {line.n_mallocs}"
+                )
+
+        bt_stem, bt_prof = below_threshold
+        bt_line = get_line(bt_prof, LOOP_ALLOC_LINENO)
+        if not bt_line.n_mallocs < N_LOOPS:
+            errors.append(
+                f"{bt_stem} makes smaller allocations than the allocation sampling window, so fewer than {N_LOOPS} allocations on line {LOOP_ALLOC_LINENO} should be reported. Got {bt_line.n_mallocs} mallocs"
+            )
+
+        for stem, profile in [
+            one_alloc,
+            two_on_one_line,
+            below_threshold,
+            line_after_final_alloc,
+        ]:
+            line = get_line(profile, OUT_OF_LOOP_ALLOC_LINENO)
+            if not line.n_mallocs == 1:
+                errors.append(
+                    f"Line {OUT_OF_LOOP_ALLOC_LINENO} in {stem} makes a large allocation, so it should be reported."
+                )
+
+        if len(errors) > 0:
+            for error in errors:
+                print(f"ERROR: {error}")
+            for profile in [
+                one_alloc,
+                two_on_one_line,
+                below_threshold,
+                line_after_final_alloc,
+            ]:
+                print("\n\n\n\n")
+                print(profile[1].model_dump_json(indent=4))
+            exit(1)
+        else:
+            print("PASS")
+            exit(0)
+
+
+if __name__ == "__main__":
+    main()
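For reference, a single test program can be profiled and inspected by hand the same way the smoketest does it. The snippet below is illustrative, and assumes it is run from the repository root with Scalene installed.

```python
# Produce one profile and read back the malloc count for the in-loop line,
# mirroring what smoketest_line_invalidation.py does for each test program.
import subprocess, sys, tempfile
from pathlib import Path
from scalene.scalene_json import ScaleneJSONSchema

with tempfile.TemporaryDirectory() as outdir:
    out = Path(outdir) / "loop_with_one_alloc.json"
    subprocess.run(
        [sys.executable, "-m", "scalene", "--cli", "--json",
         "--outfile", str(out),
         "test/line_attribution_tests/loop_with_one_alloc.py"],
        check=True,
    )
    profile = ScaleneJSONSchema.model_validate_json(out.read_text())
    (filename,) = profile.files.keys()
    # Line 5 is the allocation inside the loop; expect one malloc per iteration.
    print(profile.files[filename].lines[5 - 1].n_mallocs)
```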