Skip to content

Commit 7df2031

Browse files
committed
implement graph based line clustering
1 parent 2d45741 commit 7df2031

File tree

1 file changed

+158
-35
lines changed

1 file changed

+158
-35
lines changed

trish.py

Lines changed: 158 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,28 @@ def pairs(l):
189189
for j in range(i+1, len(l)):
190190
yield (l[i], l[j])
191191

192+
class Match():
    '''
    A pair of matched items, addressable by side or by index.

    `left` / `right` are aliases for index 0 / 1. Since `__getitem__`
    delegates to the underlying tuple, a Match can also be unpacked
    (`left, right = match`).
    '''
    def __init__(self, *items):
        # a match always links exactly two items
        assert len(items) == 2
        self.items = tuple(items)

    @property
    def left(self):
        '''The first item of the pair.'''
        return self.items[0]

    @property
    def right(self):
        '''The second item of the pair.'''
        return self.items[1]

    def __getitem__(self, i):
        return self.items[i]

    def __str__(self):
        # both sides must be interpolated; the right one used to be
        # emitted as the literal text "self.right"
        return f'{self.left} / {self.right}'

    def __repr__(self):
        return f'<Match {self.left} {self.right}>'
213+
192214

193215
def group_codebase_files(matches):
194216
# a map from a pair of codebases to a map of pair of files
@@ -202,53 +224,130 @@ def group_codebase_files(matches):
202224

203225
pairmaker = base_a.pairmaker(base_b)
204226
codebase_pair = pairmaker(base_a, base_b)
205-
line_pair = pairmaker(line_a.line_number, line_b.line_number)
227+
line_pair = pairmaker(line_a, line_b)
206228
file_pair = pairmaker(line_a.source_file, line_b.source_file)
207-
codebases_map[codebase_pair][file_pair].append(line_pair)
229+
codebases_map[codebase_pair][file_pair].append(Match(*line_pair))
208230

209231
return {k: dict(v) for k, v in codebases_map.items()}
210232

233+
class LineRun():
    '''
    A group of lines acting as one node of a graph.

    lines:     the lines collected into this run
    neighbors: other runs this run is linked to
    visited:   traversal marker, initially False
    '''
    def __init__(self):
        # two distinct, empty lists
        self.lines, self.neighbors = [], []
        self.visited = False
238+
239+
class LineCluster():
    '''
    A collection of lines that tracks its lowest and highest line.

    lines:    every line added through `update`, in insertion order
    min_line: the line with the smallest `line_number`, None while empty
    max_line: the line with the largest `line_number`, None while empty
    '''
    def __init__(self):
        self.lines = []
        self.min_line = None
        self.max_line = None

    def __len__(self):
        return len(self.lines)

    def update(self, line):
        '''
        Add `line` to the cluster and refresh the min / max bounds.

        `line` must expose a `line_number` attribute. On ties the line
        that was added first is kept as the bound.
        '''
        self.lines.append(line)
        line_no = line.line_number
        if self.min_line is None or line_no < self.min_line.line_number:
            self.min_line = line
        if self.max_line is None or line_no > self.max_line.line_number:
            self.max_line = line

    def __repr__(self):
        # an empty cluster has no bounds yet; report them as None instead
        # of failing on `None.line_number`
        if self.min_line is None or self.max_line is None:
            return f'<LineCluster {len(self)} None:None>'

        return (f'<LineCluster {len(self)}'
                f' {self.min_line.line_number}'
                f':{self.max_line.line_number}>')
260+
211261

212262
def group_lines(options, codebases_map):
    '''
    Cluster the matched lines of every file pair.

    Input:
    {
        (base_a, base_b): {
            (file_a, file_b): [
                (line_a, line_b)
            ]
        }
    }

    Output: same nesting, but each file pair maps to a list of
    (left_cluster, right_cluster) tuples of LineCluster.

    For each file pair, the matched lines of each side are grouped into
    runs of consecutive line numbers. Runs are the nodes of a graph, with
    an edge between two runs whenever a match links a line of one to a
    line of the other. Every connected component becomes one pair of
    clusters: the lines belonging to the left file and the remaining ones.

    `options` is not read by this function at the moment.
    '''
    res = {}
    for codebase_pair, file_map in codebases_map.items():
        file_res = {}
        for file_pair, matches in file_map.items():
            left_file, _ = file_pair
            # maps each matched line (of either side) to the run holding it
            run_map = {}

            def compute_runs(pair_i):
                # the same item can be involved in more than a single pair
                # the set can be avoided by changing algorithms instead
                match_list = list({match[pair_i] for match in matches})
                # this sort may cost O(file_size), which may not be a good
                # idea given that most of the time
                # len(matches) <<< file_size. we could store a list of file
                # lines somewhere, and test if the line has matched using a
                # hash set.
                match_list.sort(key=lambda m: m.line_number)

                runs = []
                run = LineRun()
                expected_i = None

                for line in match_list:
                    # a gap in the line numbers closes the current run
                    if run.lines and line.line_number != expected_i:
                        runs.append(run)
                        run = LineRun()

                    run_map[line] = run
                    run.lines.append(line)
                    # without advancing this, no two lines would ever be
                    # considered consecutive and every run would hold a
                    # single line
                    expected_i = line.line_number + 1

                if run.lines:
                    runs.append(run)
                return runs

            left_runs = compute_runs(0)
            # only needed for its side effect of filling run_map
            compute_runs(1)

            # link the runs of both sides through the matches
            for left_line, right_line in matches:
                left_run = run_map[left_line]
                right_run = run_map[right_line]
                left_run.neighbors.append(right_run)
                right_run.neighbors.append(left_run)

            # yields the lines of every run connected to `run`, marking
            # these runs as visited. uses an explicit stack rather than
            # recursion so that long chains of runs cannot hit the
            # recursion limit; neighbors are pushed reversed to keep the
            # depth-first, first-neighbor-first order.
            def get_cluster(run):
                stack = [run]
                while stack:
                    current = stack.pop()
                    if current.visited:
                        continue

                    current.visited = True
                    yield from current.lines
                    stack.extend(reversed(current.neighbors))

            clusters = []
            # every run comes from a match, so it has a neighbor on the
            # other side: iterating over one side reaches all the runs
            for run in left_runs:
                new_cluster = list(get_cluster(run))
                if not new_cluster:
                    # run already absorbed by a previous cluster
                    continue

                left_cluster = LineCluster()
                right_cluster = LineCluster()
                for line in new_cluster:
                    if line.source_file is left_file:
                        left_cluster.update(line)
                    else:
                        right_cluster.update(line)

                assert len(left_cluster) and len(right_cluster)
                clusters.append((left_cluster, right_cluster))

            file_res[file_pair] = clusters
        res[codebase_pair] = file_res
    return res
252351

253352

254353
def rate_grouped_lines(codebases_map):
@@ -267,10 +366,34 @@ def store(f, *args, **kwargs):
267366
res[f.__name__] = f_res
268367
return f_res
269368

369+
'''
370+
{
371+
Canon: [Lines]
372+
}
373+
'''
270374
res['matches'] = matches
375+
'''
376+
{
377+
(base_a, base_b): {
378+
(file_a, file_b): [
379+
(line_a, line_b)
380+
]
381+
}
382+
}
383+
'''
271384
codebase_file_groups = store(group_codebase_files, matches)
385+
'''
386+
{
387+
(base_a, base_b): {
388+
(file_a, file_b): [
389+
<LineCluster size begin:end>
390+
]
391+
}
392+
}
393+
'''
272394
lengthful_matches = store(group_lines, options, codebase_file_groups)
273-
return store(rate_grouped_lines, lengthful_matches), res
395+
return lengthful_matches, res
396+
# return store(rate_grouped_lines, lengthful_matches), res
274397

275398

276399
def main(args=sys.argv[1:]):

0 commit comments

Comments
 (0)