Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix line invalidation speed #891

Merged
merged 21 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/test-smoketests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ jobs:
- name: decorator smoke test
run: python test/smoketest_profile_decorator.py

- name: line invalidation test
run: python test/smoketest_line_invalidation.py

# Note: This test doesn't need to read an output,
# it is meant to determine if there is an ImportError
# or anything related if relative imports are used.
Expand Down
5 changes: 4 additions & 1 deletion scalene/scalene_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ def malloc_signal_handler(
ByteCodeIndex(f.f_lasti),
]
Scalene.__alloc_sigq.put([0])
pywhere.enable_settrace()
pywhere.enable_settrace(this_frame)
del this_frame

@staticmethod
Expand Down Expand Up @@ -1289,6 +1289,9 @@ def alloc_sigqueue_processor(x: Optional[List[int]]) -> None:
bytei,
) = item
is_malloc = action == Scalene.MALLOC_ACTION
if count == scalene.scalene_config.NEWLINE_TRIGGER_LENGTH + 1:
continue # in previous implementations, we were adding NEWLINE to the footprint.
# We should not account for this in the user-facing profile.
count /= Scalene.BYTES_PER_MB
if is_malloc:
stats.current_footprint += count
Expand Down
48 changes: 41 additions & 7 deletions src/include/sampleheap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,26 @@
return nullptr;
}
if (pythonDetected() && !g.wasInMalloc()) {
// All standard Python allocations pass through
// MakeLocalAllocator in libscalene.cpp .
// We don't want to double-count these allocations--
// if we're in here and g.wasInMalloc is true, then
// we passed through MakeLocalAllocator and
// this allocation has already been counted
//
// We only want to count allocations here
// if `malloc` itself was called by client code.
auto realSize = SuperHeap::getSize(ptr);
if (realSize > 0) {
if (sz == NEWLINE + sizeof(ScaleneHeader)) {
// Don't count these allocations
// FIXME
Dismissed Show dismissed Hide dismissed
//
// If we ourselves are allocating a NEWLINE, we're doing
// it through the Python allocator, so if it's an actual newline
// I don't think we should ever get here. I think our original intention
// is that we shouldn't count NEWLINE records that we already counted,
// but I think if we get to this line here, we didn't actually create a NEWLINE
// and should count it.
return ptr;
}
register_malloc(realSize, ptr, false); // false -> invoked from C/C++
Expand Down Expand Up @@ -150,12 +166,30 @@
// If this is the special NEWLINE value, trigger an update.
if (unlikely(realSize == NEWLINE)) {
std::string filename;
int lineno;
int bytei;
decltype(whereInPython)* where = p_whereInPython;
if (where != nullptr && where(filename, lineno, bytei)) {
writeCount(MallocSignal, realSize, ptr, filename, lineno, bytei);
}
// Originally, we had the following check around this line:
//
// ```
// if (where != nullptr && where(filename, lineno, bytei))
// ```
//
// This was to prevent a NEWLINE record from being accidentally triggered by
// non-Scalene code.
//
// However, by definition, we trigger a NEWLINE _after_ the line has
// been executed, specifically on a `PyTrace_Line` event.
//
// If the absolute last line of a program makes an allocation,
// the next PyTrace_Line will occur inside `scalene_profiler.py` and not any client
// code, since the line after the last line of the program is when Scalene starts its
// teardown.
//
// In this case. the `whereInPython` function will return 0, since whereInPython checks
// if the current frame is in client code and the Scalene profiler teardown code is by definition
// not.
//
// This risks letting allocations of length NEWLINE_TRIGGER_LENGTH that are not true NEWLINEs
// create a NEWLINE record, but we view this as incredibly improbable.
writeCount(MallocSignal, realSize, ptr, filename, -1, -1);
mallocTriggered()++;
return;
}
Expand Down
44 changes: 42 additions & 2 deletions src/source/pywhere.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,41 @@ static void allocate_newline() {

static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
PyObject* arg) {
if (what == PyTrace_CALL || what == PyTrace_C_CALL) {
// Prior to this check, trace_func was called
// in every child frame. When we figured out the frame
// was a child of the current line, only then did we disable tracing in that frame.
// This was causing a major slowdown when importing pytorch-- from what we can tell,
// the import itself called many functions and the call overhead of the entire tracing harness
// was incurred for each call at least once.
//
//
// What we're trying to do here, though, is see if we have moved on to another line of the client program.
// Therefore, we can disable tracing for the moment, since one of three things has happened:
//
// 1. We have called a library function. We therefore know that there will be absolutely no important events coming from this
// frame, since the program can't progress to the next line before until the call has ended
//
// 2. We have called a client function. We know that the line we were on hasn't ended yet, since we would get a PyTrace_Line
// event if that did happen. This leaves us with one of two cases:
//
// 2.1: The function makes no allocations. Therefore, not tracing Line events in its frame is valid and the next Line
// we get is in the parent frame, the one that we care about
// 2.2: the function does make an allocation. In that case, we separately enable settrace at that allocation,
// so we still track it
//
//
// FIXME: if, in a single line, we see a pattern in a single line like allocation -> client call w/ allocation, we won't actually increment
// the n_mallocs counter for the line we started with
frame->f_trace_lines = 0;
frame->f_trace = NULL;
#if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030c0000
// This pre-3.12 optimization only exists post 3.9
PyThreadState* tstate = PyThreadState_Get();
tstate->cframe->use_tracing = 0;
#endif

}
if (what != PyTrace_LINE) {
return 0;
}
Expand Down Expand Up @@ -297,7 +332,6 @@ static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
// Needed because decref will be called in on_stack
Py_INCREF(frame);
if (on_stack(last_fname_s, lineno_l, static_cast<PyFrameObject*>(frame))) {
frame->f_trace_lines = 0;
return 0;
}

Expand Down Expand Up @@ -370,7 +404,13 @@ static PyObject* depopulate_struct(PyObject* self, PyObject* args) {
}

static PyObject* enable_settrace(PyObject* self, PyObject* args) {
PyObject* frame;
if (!PyArg_ParseTuple(args, "O", &frame)) {
return NULL;
}
PyFrameObject* frame_obj = (PyFrameObject*) frame;
PyEval_SetTrace(trace_func, NULL);
frame_obj->f_trace_lines = 1;
Py_RETURN_NONE;
}

Expand All @@ -389,7 +429,7 @@ static PyMethodDef EmbMethods[] = {
{"print_files_to_profile", print_files_to_profile, METH_NOARGS,
"printing for debug"},
// {"return_buffer", return_buffer, METH_NOARGS, ""},
{"enable_settrace", enable_settrace, METH_NOARGS, ""},
{"enable_settrace", enable_settrace, METH_VARARGS, ""},
{"disable_settrace", disable_settrace, METH_NOARGS, ""},
{"populate_struct", populate_struct, METH_NOARGS, ""},
{"depopulate_struct", depopulate_struct, METH_NOARGS, ""},
Expand Down
13 changes: 13 additions & 0 deletions test/line_attribution_tests/line_after_final_alloc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

def main():
accum = bytes()
for i in range(31):
accum += bytes(10485767 * 2)


asdf = bytes(2 * 10485767)
Dismissed Show dismissed Hide dismissed
some_dead_line = None
Dismissed Show dismissed Hide dismissed


if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions test/line_attribution_tests/loop_below_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

def main():
accum = bytes()
for i in range(31):
accum += bytes(10485767 // 4) # far below the allocation sampling window


asdf = bytes(2 * 10485767)
Dismissed Show dismissed Hide dismissed


if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions test/line_attribution_tests/loop_with_multiple_lines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

def main():
accum = bytes()
for i in range(31):
accum += bytes(2 * 10485767) # 2x the allocation sampling window
bogus = None
Fixed Show fixed Hide fixed
Dismissed Show dismissed Hide dismissed

asdf = bytes(2 * 10485767)
Dismissed Show dismissed Hide dismissed


if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions test/line_attribution_tests/loop_with_one_alloc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

def main():
accum = bytes()
for i in range(31):
accum += bytes(2 * 10485767) # 2x the allocation sampling window


asdf = bytes(2 * 10485767)
Dismissed Show dismissed Hide dismissed


if __name__ == '__main__':
main()
11 changes: 11 additions & 0 deletions test/line_attribution_tests/loop_with_two_allocs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

def main():
accum = bytes()
for i in range(31):
accum += bytes(2 * 10485767) + bytes(2 * 10485767) # 2x the allocation sampling window


asdf = bytes(2 * 10485767)
Dismissed Show dismissed Hide dismissed

if __name__ == '__main__':
main()
124 changes: 124 additions & 0 deletions test/smoketest_line_invalidation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
This is bound very closely to the current implementation of
the tests in `test/line_attribution_tests.

The two things that matter are the number of loops, the size
of the allocations, and the exact line numbers.


"""

expected_md5_sums = {
"test/line_attribution_tests/line_after_final_alloc.py": "2d457078fae8c2a7b498cf7dd87324bc",
"test/line_attribution_tests/loop_below_threshold.py": "96f9e1c086ecdd522a6c565e0e751df6",
"test/line_attribution_tests/loop_with_multiple_lines.py": "ed6d9802f31ef9d7a8636e6d0d728013",
"test/line_attribution_tests/loop_with_one_alloc.py": "fe952422358a8070c8049fdab1c512af",
"test/line_attribution_tests/loop_with_two_allocs.py": "9ca21ee9f9a768c4d05f3c4ba23444e9",
}

import subprocess
import tempfile
import sys
from pathlib import Path
from hashlib import md5
from scalene.scalene_json import ScaleneJSONSchema

N_LOOPS = 31
LOOP_ALLOC_LINENO = 5 #
OUT_OF_LOOP_ALLOC_LINENO = 8


def check_for_changes():
errors = []
for fname, expected_sum in expected_md5_sums.items():
with open( fname, "rb") as f:
digest = md5(f.read()).hexdigest()
if digest != expected_sum:
errors.append(fname)
assert len(errors) == 0, f'Detected change in file(s) {",".join(errors)}'


def get_line(scalene_profile: ScaleneJSONSchema, lineno: int):
files = list(scalene_profile.files.keys())
assert len(files) == 1
filename = files[0]
return scalene_profile.files[filename].lines[lineno - 1]


def get_profile(test_stem, outdir_p, test_dir):
proc = subprocess.run(
Dismissed Show dismissed Hide dismissed
[
sys.executable,
"-m",
"scalene",
"--cli",
"--json",
"--outfile",
outdir_p / f"{test_stem}.json",
test_dir / f"{test_stem}.py",
],
# capture_output=True,
check=True,
)
with open(outdir_p / f"{test_stem}.json", "r") as f:
profile = ScaleneJSONSchema.model_validate_json(f.read())
return (test_stem, profile)


def main():
check_for_changes()
test_dir = Path(__file__).parent / "line_attribution_tests"
with tempfile.TemporaryDirectory() as outdir:
outdir_p = Path(outdir)
one_alloc = get_profile("loop_with_one_alloc", outdir_p, test_dir)
two_on_one_line = get_profile("loop_with_two_allocs", outdir_p, test_dir)
below_threshold = get_profile("loop_below_threshold", outdir_p, test_dir)
line_after_final_alloc = get_profile(
"line_after_final_alloc", outdir_p, test_dir
)
errors = []
for stem, profile in [one_alloc, two_on_one_line, line_after_final_alloc]:
line = get_line(profile, LOOP_ALLOC_LINENO)
if not line.n_mallocs == N_LOOPS:
errors.append(
f"Expected {N_LOOPS} distinct lines on {stem}, got {line.n_mallocs} on line {LOOP_ALLOC_LINENO}"
)

bt_stem, bt_prof = below_threshold
bt_line = get_line(bt_prof, LOOP_ALLOC_LINENO)
if not bt_line.n_mallocs < N_LOOPS:
errors.append(
f"{bt_stem} makes smaller allocations than the allocation sampling window, so fewer than {N_LOOPS} allocations on {LOOP_ALLOC_LINENO} should be reported. Got {bt_line.n_mallocs} mallocs"
)

for stem, profile in [
one_alloc,
two_on_one_line,
below_threshold,
line_after_final_alloc,
]:
line = get_line(profile, OUT_OF_LOOP_ALLOC_LINENO)
if not line.n_mallocs == 1:
errors.append(
f"Line {OUT_OF_LOOP_ALLOC_LINENO} in {stem} makes a large allocation, so it should be reported."
)

if len(errors) > 0:
for error in errors:
print(f"ERROR: {error}")
for profile in [
one_alloc,
two_on_one_line,
below_threshold,
line_after_final_alloc,
]:
print("\n\n\n\n")
print(profile[1].model_dump_json(indent=4))
exit(1)
else:
print("PASS")
exit(0)


if __name__ == "__main__":
main()
Loading