forked from MathieuTurcotte/msparser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmsparser.py
278 lines (225 loc) · 8.48 KB
/
msparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# Copyright (c) 2011 Mathieu Turcotte
# Licensed under the MIT license.
"""
The msparser module offers a simple interface to parse the Valgrind massif.out
file format, i.e. data files produced by the Valgrind heap profiler.
"""
from __future__ import with_statement # Enable with statement in Python 2.5.
import os.path
import re
__all__ = ["parse", "parse_file", "ParseError"]
# All patterns below are raw strings: sequences such as "\s" and "\d" are not
# valid escapes in ordinary string literals and trigger warnings on recent
# Python versions. The pattern text itself is unchanged.

# Precompiled regex used to parse comments. It also matches blank lines, i.e.
# lines made only of whitespace.
_COMMENT_RE = re.compile(r"\s*(#|$)")

# Precompiled regexes used to parse header fields.
_FIELD_DESC_RE = re.compile(r"desc:\s(?P<data>.*)$")
_FIELD_CMD_RE = re.compile(r"cmd:\s(?P<data>.*)$")
_FIELD_TIME_UNIT_RE = re.compile(r"time_unit:\s(?P<data>ms|B|i)$")

# Precompiled regexes used to parse snapshot fields.
_FIELD_SNAPSHOT_RE = re.compile(r"snapshot=(?P<data>\d+)")
_FIELD_TIME_RE = re.compile(r"time=(?P<data>\d+)")
_FIELD_MEM_HEAP_RE = re.compile(r"mem_heap_B=(?P<data>\d+)")
_FIELD_MEM_EXTRA_RE = re.compile(r"mem_heap_extra_B=(?P<data>\d+)")
_FIELD_MEM_STACK_RE = re.compile(r"mem_stacks_B=(?P<data>\d+)")
_FIELD_HEAP_TREE_RE = re.compile(r"heap_tree=(?P<data>\w+)")

# Precompiled regex to parse heap entries. Matches three things:
# - the number of children,
# - the number of bytes,
# - and the details section.
_HEAP_ENTRY_RE = re.compile(r"""
\s*n # skip zero or more spaces, then 'n'
(?P<num_children>\d+) # match number of children, 1 or more digits
:\s # skip ':' and one space
(?P<num_bytes>\d+) # match the number of bytes, 1 or more digits
\s # skip one space
(?P<details>.*) # match the details
""", re.VERBOSE)

# Precompiled regex to check if the details section is below threshold.
_HEAP_BELOW_THRESHOLD_RE = re.compile(r"""in.*places?.*""")

# Precompiled regex to parse the details section of entries above threshold.
# This should match four things:
# - the hexadecimal address,
# - the function name,
# - the file name or binary path, i.e. file.cpp or usr/local/bin/foo.so,
# - and a line number if present.
# Last two parts are optional to handle entries without a file name or binary
# path.
_HEAP_DETAILS_RE = re.compile(r"""
(?P<address>[a-fA-F0-9x]+) # match the hexadecimal address
:\s # skip ': '
(?P<function>.+?) # match the function's name, non-greedy
(?: # don't capture fname/line group
\s
\(
(?:in\s)? # skip 'in ' if present
(?P<fname>[^:]+) # match the file name
:? # skip ':', if present
(?P<line>\d+)? # match the line number, if present
\)
)? # fname/line group is optional
$ # should have reached the EOL
""", re.VERBOSE)
class ParseContext:
    """
    A simple context for parsing. Dumbed down version of fileinput.

    Wraps a file-like object and counts the readline() calls made through it,
    so that errors can report a line number and a file name.
    """

    def __init__(self, fd):
        # fd must provide readline(); a 'name' attribute is optional.
        self._fd = fd
        self._line = 0

    def line(self):
        """
        Return the number of readline() calls made so far, i.e. the 1-based
        number of the most recently read line (0 before the first read).
        """
        return self._line

    def readline(self):
        """
        Advance the line counter and return the next line of the wrapped
        file object, exactly as its own readline() returns it.
        """
        self._line += 1
        return self._fd.readline()

    def filename(self):
        """
        Return the absolute path of the wrapped file object.

        File-like objects without a usable 'name' (e.g. io.StringIO) yield
        the placeholder "<unknown>" instead of raising, so that building a
        ParseError never hides the actual parsing problem.
        """
        name = getattr(self._fd, "name", None)
        if name is None:
            return "<unknown>"
        try:
            return os.path.abspath(name)
        except TypeError:
            # Some file objects expose a non-path name, such as the integer
            # descriptor of a file created with os.fdopen().
            return str(name)
class ParseError(Exception):
    """
    Error raised when a parsing error is encountered.

    The instance records the message in 'msg' and captures, at construction
    time, the current line number and file name reported by the parsing
    context in 'line' and 'filename'.
    """

    def __init__(self, msg, ctx):
        # Snapshot the position now: ctx keeps advancing as parsing goes on.
        self.filename = ctx.filename()
        self.line = ctx.line()
        self.msg = msg

    def __str__(self):
        # Rendered as "<msg> at line <line> in <filename>".
        location = "at line " + str(self.line) + " in " + str(self.filename)
        return str(self.msg) + " " + location
def parse_file(filepath):
    """
    Open the massif output file located at filepath, parse it with parse()
    and return the resulting data. The file is closed before returning.
    """
    with open(filepath) as massif_file:
        parsed = parse(massif_file)
    return parsed
def parse(fd):
    """
    Parse an already opened massif output file and return its content as a
    dictionary.

    The header fields are read first, then every snapshot up to EOF; both
    steps store their results in the returned dictionary. A ParseError is
    raised by the helpers when the input cannot be parsed.
    """
    context = ParseContext(fd)
    result = {}
    # The header must be consumed before the snapshots: both read from the
    # same context, in file order.
    _parse_header(context, result)
    _parse_snapshots(context, result)
    return result
def _match_unconditional(ctx, regex, string):
    """
    Match a regular expression against the start of a string and return the
    match object. A failed match is not tolerated: a ParseError carrying the
    string, the pattern and the position given by ctx is raised instead.
    """
    found = regex.match(string)
    if found is not None:
        return found
    message = "can't match '" + string + "' against '" + regex.pattern + "'"
    raise ParseError(message, ctx)
def _get_next_line(ctx, may_reach_eof=False):
    """
    Read another line from ctx and return it without its line terminator.

    On EOF, None is returned if may_reach_eof is true; otherwise reaching EOF
    is considered an error and a ParseError is raised.
    """
    line = ctx.readline()  # Returns an empty string on EOF.
    if line:
        # Only the end of the line is trimmed, and "\r\n" is handled as well:
        # a leftover "\r" would break the field regexes anchored with '$'
        # when the file object does not translate newlines.
        return line.rstrip("\r\n")
    if may_reach_eof:
        return None
    raise ParseError("unexpected EOF", ctx)
def _get_next_field(ctx, field_regex, may_reach_eof=False):
    """
    Read the next data field, skipping comment and blank lines.

    The first line that is not matched by the comment regex must match
    field_regex, otherwise a ParseError is raised; the value of its 'data'
    group is returned. If may_reach_eof is true, None is returned when EOF is
    reached before a field is found; otherwise reaching EOF is an error.
    """
    current = _get_next_line(ctx, may_reach_eof)
    # Skip everything the comment regex accepts (comments and blank lines).
    while current is not None and _COMMENT_RE.match(current):
        current = _get_next_line(ctx, may_reach_eof)
    if current is None:
        return None
    return _match_unconditional(ctx, field_regex, current).group("data")
def _parse_header(ctx, mdata):
    """
    Read the three header fields, in file order, and store their values in
    mdata under the keys "desc", "cmd" and "time_unit".
    """
    header_fields = (
        ("desc", _FIELD_DESC_RE),
        ("cmd", _FIELD_CMD_RE),
        ("time_unit", _FIELD_TIME_UNIT_RE),
    )
    for key, field_regex in header_fields:
        mdata[key] = _get_next_field(ctx, field_regex)
def _parse_snapshots(ctx, mdata):
    """
    Parse every remaining snapshot and store the results in mdata.

    Sets mdata["snapshots"] to the list of snapshot data dictionaries and
    mdata["detailed_snapshot_indices"] to the positions, in that list, of the
    detailed snapshots. mdata["peak_snapshot_index"] is only set when a peak
    snapshot was found; if there are several, the last one wins.
    """
    parsed = []
    detailed_positions = []
    peak_position = None

    while True:
        snapshot = _parse_snapshot(ctx)
        if snapshot is None:
            break
        # The position of a snapshot is the list length before appending it.
        position = len(parsed)
        if snapshot["is_detailed"]:
            detailed_positions.append(position)
        if snapshot["is_peak"]:
            peak_position = position
        parsed.append(snapshot["data"])

    mdata["snapshots"] = parsed
    mdata["detailed_snapshot_indices"] = detailed_positions
    if peak_position is not None:
        mdata["peak_snapshot_index"] = peak_position
def _parse_snapshot(ctx):
    """
    Parse another snapshot.

    Returns None if EOF is reached before a new snapshot starts. Otherwise
    returns a dictionary with three keys: "is_detailed" (the heap tree field
    is not "empty"), "is_peak" (the heap tree field is "peak") and "data",
    the snapshot itself with the keys "id", "time", "mem_heap",
    "mem_heap_extra", "mem_stack" and "heap_tree" (None unless detailed).
    """
    raw_id = _get_next_field(ctx, _FIELD_SNAPSHOT_RE, may_reach_eof=True)
    if raw_id is None:
        return None

    data = {"id": int(raw_id)}

    # Once a snapshot has started, the remaining fields are mandatory and
    # appear in this order.
    numeric_fields = (
        ("time", _FIELD_TIME_RE),
        ("mem_heap", _FIELD_MEM_HEAP_RE),
        ("mem_heap_extra", _FIELD_MEM_EXTRA_RE),
        ("mem_stack", _FIELD_MEM_STACK_RE),
    )
    for key, field_regex in numeric_fields:
        data[key] = int(_get_next_field(ctx, field_regex))

    tree_kind = _get_next_field(ctx, _FIELD_HEAP_TREE_RE)
    is_detailed = tree_kind != "empty"
    is_peak = tree_kind == "peak"
    # A heap tree follows the field for every value other than "empty".
    data["heap_tree"] = _parse_heap_tree(ctx) if is_detailed else None

    return {"is_detailed": is_detailed, "is_peak": is_peak, "data": data}
def _parse_heap_tree(ctx):
    """
    Parse a heap tree: one heap entry line followed, recursively, by the
    subtrees of as many children as the entry announces.

    Returns a dictionary with the keys "nbytes", "children" (list of child
    nodes) and "details". The latter is None when the details section of the
    entry doesn't match the details regex; otherwise it is a dictionary with
    the keys "address", "function", "file" and "line".
    """
    entry = _match_unconditional(ctx, _HEAP_ENTRY_RE, _get_next_line(ctx))

    details = None
    parsed = _HEAP_DETAILS_RE.match(entry.group("details"))
    if parsed:
        # The 'line' group is None when no line number is present, e.g. if
        # the binary/library wasn't compiled with debug info, so it is only
        # converted to an integer when available.
        line_number = parsed.group("line")
        details = {
            "address": parsed.group("address"),
            "function": parsed.group("function"),
            "file": parsed.group("fname"),
            "line": None if line_number is None else int(line_number),
        }

    # Children follow their parent in the file, one subtree after another.
    child_count = int(entry.group("num_children"))
    children = [_parse_heap_tree(ctx) for _ in range(child_count)]

    return {
        "nbytes": int(entry.group("num_bytes")),
        "children": children,
        "details": details,
    }