@@ -189,6 +189,28 @@ def pairs(l):
189189 for j in range (i + 1 , len (l )):
190190 yield (l [i ], l [j ])
191191
class Match:
    """A pair of matched items, exposed as ``left`` and ``right``.

    The pair is built from exactly two positional arguments and stored as
    a tuple in ``items``.  Indexing is forwarded to that tuple, so
    ``match[0]`` / ``match[1]`` work and a match can be unpacked with
    ``left, right = match``.
    """

    def __init__(self, *items):
        # a match always relates exactly two items
        assert len(items) == 2
        self.items = tuple(items)

    @property
    def left(self):
        """First item of the pair."""
        return self.items[0]

    @property
    def right(self):
        """Second item of the pair."""
        return self.items[1]

    def __getitem__(self, i):
        # Delegating to the tuple means an out-of-range index raises
        # IndexError, which is what terminates tuple-unpacking.
        return self.items[i]

    def __str__(self):
        # Both sides are interpolated; the right-hand placeholder used to
        # lack its braces and printed the literal text "self.right".
        return f'{self.left} / {self.right}'

    def __repr__(self):
        return f'<Match {self.left} {self.right}>'
213+
192214
193215def group_codebase_files (matches ):
194216 # a map from a pair of codebases to a map of pair of files
@@ -202,53 +224,130 @@ def group_codebase_files(matches):
202224
203225 pairmaker = base_a .pairmaker (base_b )
204226 codebase_pair = pairmaker (base_a , base_b )
205- line_pair = pairmaker (line_a . line_number , line_b . line_number )
227+ line_pair = pairmaker (line_a , line_b )
206228 file_pair = pairmaker (line_a .source_file , line_b .source_file )
207- codebases_map [codebase_pair ][file_pair ].append (line_pair )
229+ codebases_map [codebase_pair ][file_pair ].append (Match ( * line_pair ) )
208230
209231 return {k : dict (v ) for k , v in codebases_map .items ()}
210232
class LineRun:
    """Mutable record for one run of lines.

    A fresh run holds no lines, has no neighbouring runs and is not yet
    marked as visited; callers fill these attributes in directly.
    """

    def __init__(self):
        # traversal flag, cleared on creation
        self.visited = False
        # other LineRun objects this run is linked to
        self.neighbors = list()
        # lines that belong to this run
        self.lines = list()
238+
class LineCluster:
    """Collects lines and remembers the lowest- and highest-numbered one.

    Lines are any objects exposing a ``line_number`` attribute.  ``len()``
    of a cluster is the number of lines added so far.
    """

    def __init__(self):
        self.lines = []
        # line objects (not numbers) holding the current extremes;
        # both stay None until the first update()
        self.min_line = None
        self.max_line = None

    def __len__(self):
        return len(self.lines)

    def update(self, line):
        """Add ``line`` and refresh the min/max bookkeeping.

        Comparisons are strict, so on equal line numbers the line that
        was added first keeps its place as ``min_line`` / ``max_line``.
        """
        self.lines.append(line)
        line_no = line.line_number
        if self.min_line is None or line_no < self.min_line.line_number:
            self.min_line = line
        if self.max_line is None or line_no > self.max_line.line_number:
            self.max_line = line

    def __repr__(self):
        # A cluster that has not been updated yet has no bounds to show;
        # without this guard repr() raised AttributeError on None.
        if self.min_line is None or self.max_line is None:
            return f'<LineCluster {len(self)}>'
        return (f'<LineCluster {len(self)}'
                f' {self.min_line.line_number}'
                f':{self.max_line.line_number}>')
260+
211261
def _compute_line_runs(matches, side, run_map):
    '''
    Split the lines matched on one side into runs of consecutive lines.

    `side` is 0 for the left item of every match and 1 for the right one.
    Each line is also registered in `run_map` (line -> the run holding it).
    Returns the runs ordered by line number.
    '''
    # the same line can be involved in more than a single match, so the
    # lines are deduplicated before being ordered by line number
    unique_lines = sorted({match[side] for match in matches},
                          key=lambda line: line.line_number)

    runs = []
    run = LineRun()
    expected_number = None
    for line in unique_lines:
        # a gap in the numbering closes the current run (if it has lines)
        if line.line_number != expected_number and run.lines:
            runs.append(run)
            run = LineRun()

        run_map[line] = run
        run.lines.append(line)
        # the run only continues if the next line directly follows this one
        expected_number = line.line_number + 1

    if run.lines:
        runs.append(run)
    return runs


def _collect_component(start):
    '''
    Return the lines of every not-yet-visited run reachable from `start`.

    Runs are flagged as visited while being collected, so each run is
    returned at most once; an already visited `start` gives an empty list.
    '''
    lines = []
    # explicit stack instead of recursion, so long chains of neighbouring
    # runs cannot exhaust the interpreter's recursion limit
    pending = [start]
    while pending:
        run = pending.pop()
        if run.visited:
            continue

        run.visited = True
        lines.extend(run.lines)
        # pushed in reverse so neighbours are expanded in stored order,
        # which keeps the depth-first order of a recursive walk
        pending.extend(reversed(run.neighbors))
    return lines


def group_lines(options, codebases_map):
    '''
    Cluster the matched lines of every pair of files.

    Input:
    {
        (base_a, base_b): {
            (file_a, file_b): [
                (line_a, line_b)
            ]
        }
    }

    Output: same nesting, with every list of matches replaced by a list of
    (left_cluster, right_cluster) tuples of LineCluster objects.

    On each side, matched lines with consecutive line numbers form a run;
    runs connected through at least one match end up in the same cluster.

    `options` is not read by this function.
    '''
    res = {}
    for codebase_pair, file_map in codebases_map.items():
        file_res = {}
        for file_pair, matches in file_map.items():
            left_file, _ = file_pair

            # line -> run, shared by both sides of the file pair
            run_map = {}
            left_runs = _compute_line_runs(matches, 0, run_map)
            # right-hand runs are only reached through run_map / neighbors
            _compute_line_runs(matches, 1, run_map)

            # link the runs of both sides through every match
            for left_line, right_line in matches:
                left_run = run_map[left_line]
                right_run = run_map[right_line]
                left_run.neighbors.append(right_run)
                right_run.neighbors.append(left_run)

            clusters = []
            # every run has a counterpart on the other side by now, so
            # walking the left runs reaches all of them
            for run in left_runs:
                component = _collect_component(run)
                if not component:
                    # already absorbed into an earlier cluster
                    continue

                left_cluster = LineCluster()
                right_cluster = LineCluster()
                for line in component:
                    if line.source_file is left_file:
                        left_cluster.update(line)
                    else:
                        right_cluster.update(line)

                assert len(left_cluster) and len(right_cluster)
                clusters.append((left_cluster, right_cluster))

            file_res[file_pair] = clusters
        res[codebase_pair] = file_res
    return res
252351
253352
254353def rate_grouped_lines (codebases_map ):
@@ -267,10 +366,34 @@ def store(f, *args, **kwargs):
267366 res [f .__name__ ] = f_res
268367 return f_res
269368
369+ '''
370+ {
371+ Canon: [Lines]
372+ }
373+ '''
270374 res ['matches' ] = matches
375+ '''
376+ {
377+ (base_a, base_b): {
378+ (file_a, file_b): [
379+ (line_a, line_b)
380+ ]
381+ }
382+ }
383+ '''
271384 codebase_file_groups = store (group_codebase_files , matches )
385+ '''
386+ {
387+ (base_a, base_b): {
388+ (file_a, file_b): [
389+ <LineCluster size begin:end>
390+ ]
391+ }
392+ }
393+ '''
272394 lengthful_matches = store (group_lines , options , codebase_file_groups )
273- return store (rate_grouped_lines , lengthful_matches ), res
395+ return lengthful_matches , res
396+ # return store(rate_grouped_lines, lengthful_matches), res
274397
275398
276399def main (args = sys .argv [1 :]):
0 commit comments