77from pathlib import PosixPath
88from pprint import pprint
99
# tqdm is an optional dependency: use it to show progress when it is
# installed, otherwise fall back to a transparent pass-through.
try:
    from tqdm import tqdm
    progressbar = tqdm
except ImportError:
    # Only a missing tqdm should trigger the fallback; a bare ``except``
    # would also hide KeyboardInterrupt/SystemExit and unrelated errors.
    def progressbar(iterable, *_args, **_kwargs):
        """Return *iterable* unchanged (stand-in used when tqdm is missing).

        Extra positional/keyword arguments are accepted and ignored so that
        call sites may pass tqdm options without breaking the fallback.
        """
        return iterable
15+
# Splits a line on single spaces while capturing runs of word characters;
# the capture group makes re.split() return the word tokens as well.
# Raw string: '\w' in a plain literal is an invalid string escape.
TOKEN_SPLITTER = re.compile(r' |(\w+)')

# Window size used when the caller does not supply one.
DEFAULT_WINDOW_SIZE = 3
1319
import logging
import sys

# Send all log records (DEBUG and up) to stderr so they do not mix with
# whatever the program writes to stdout.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,
    format='%(asctime)s:%(levelname)s - %(message)s',
)
# Module-scoped logger; its records propagate to the root logger
# configured above, so the emitted output is unchanged.
logger = logging.getLogger(__name__)
24+
1425def tokenize (line ):
1526 is_group = False
1627 for token in re .split (TOKEN_SPLITTER , line ):
@@ -134,7 +145,7 @@ def preprocess_source(source_file, window_size):
134145
def preprocess_sources(source_files, window_size):
    '''Create line groups for every source file.

    Lazily yields, file by file and in order, everything that
    ``preprocess_source(source_file, window_size)`` yields. The iteration
    over ``source_files`` is wrapped in ``progressbar`` so progress can be
    reported per file.
    '''
    for source_file in progressbar(source_files):
        yield from preprocess_source(source_file, window_size)
139150
140151
@@ -216,7 +227,7 @@ def group_codebase_files(matches):
216227 # a map from a pair of codebases to a map of pair of files
217228 # to list of pairs of matches
218229 codebases_map = defaultdict (lambda : defaultdict (list ))
219- for keygroup , occurences in matches :
230+ for keygroup , occurences in progressbar ( matches ) :
220231 for line_a , line_b in pairs (occurences ):
221232 base_a , base_b = line_a .codebase , line_b .codebase
222233 if base_a is base_b :
@@ -270,7 +281,7 @@ def group_lines(options, codebases_map):
270281 }
271282 '''
272283 res = {}
273- for codebase_pair , file_map in codebases_map .items ():
284+ for codebase_pair , file_map in progressbar ( codebases_map .items () ):
274285 file_res = {}
275286 for file_pair , matches in file_map .items ():
276287 left_file , right_file = file_pair
@@ -347,7 +358,7 @@ def get_cluster(run):
347358
348359 file_res [file_pair ] = clusters
349360 res [codebase_pair ] = file_res
350- return res
361+ return res
351362
352363
353364def rate_grouped_lines (codebases_map ):
@@ -381,6 +392,7 @@ def store(f, *args, **kwargs):
381392 }
382393 }
383394 '''
395+ logger .info ('grouping codebases / files' )
384396 codebase_file_groups = store (group_codebase_files , matches )
385397 '''
386398 {
@@ -391,6 +403,7 @@ def store(f, *args, **kwargs):
391403 }
392404 }
393405 '''
406+ logger .info ('graph madness' )
394407 lengthful_matches = store (group_lines , options , codebase_file_groups )
395408 return lengthful_matches , res
396409 # return store(rate_grouped_lines, lengthful_matches), res
@@ -400,9 +413,10 @@ def main(args=sys.argv[1:]):
400413 options = _trish_parser ().parse_args (args = args )
401414 source_files = find_sources (options .targets , options .pattern )
402415 window_size = options .window_size
416+ logger .info ('correlating sources' )
403417 matches = correlate_sources (source_files , window_size )
404418 scores , metadata = process_matches (options , matches )
405- pprint (metadata )
419+ # pprint(metadata)
406420 # for codebase_pair, score in scores.items():
407421 # codebase_a, codebase_b = codebase_pair
408422 # print(f'{score}\t{codebase_a.name}\t{codebase_b.name}')
0 commit comments