-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathidentity-canonicalizer
executable file
·530 lines (472 loc) · 18.8 KB
/
identity-canonicalizer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
#!/usr/bin/env python3
import os, sys, re
from collections import Counter
# Inspired by:
# git log --since=2020-01-01 | \
# > grep -E '^    [^ ]*(Signed-off|Tested|Reported|Reviewed|Acked)[^ ]*-by:' | \
# > cut -d: -f2- |
# > sed -e 's/^ //g; s/^.*-by: *//g; s/</ </g; s/  / /g; s/>.*/>/g;' | \
# > sed -e 's/^\([^<]*\)<.*>$/\1/g; s/"\(.*\)"/\1/g' | \
# > grep -v @syzkaller | \
# > sort -u
# Find people participating in 50+ commits since 2020-01-01:
# git log --since=2020-01-01 | ~/bin/identity-canonicalizer | sort -g > contributors.txt
# cat contributors.txt | awk '{if ($1 > 2) {print $0}}' >eligible.txt
# cat eligible.txt | awk '{if ($1 > 49) {print $0}}' >ballots.txt
# Find trailer lines in `git log` output and keep everything after "-by:".
# Commit message bodies are indented (four spaces by default), so accept any
# run of leading spaces instead of exactly one; a single leading space still
# matches exactly as before.
by = re.compile(r'^ +[^ ]*(Signed-off|Tested|Reported|Reviewed|Acked)[^ ]*-by:(.*)')
# Split name from email, and remove trailing stuff after email.
# group(1) is the (unstripped) name, group(2) the text between the last
# "<" ... ">" pair.
splitter = re.compile(r'^(.*)<(.*)>[^>]*$')
# HTML mailto: keep "Name <addr" and drop a pasted "<mailto:...>" tail.
mailto = re.compile(r'^(.*<[^ ]*) ?<mailto:.*$')
# Bad email characters; group(1) is the first offending character found.
bad_email = re.compile(r'([ <>])')
# Unwrap quoted names: the whole name must be wrapped in double quotes.
quoted = re.compile(r'^"(.*)"$')
# Ignore affiliations: drop a trailing "(...)" group from a name.
affiliated = re.compile(r'^(.*)\(.*\)$')
# Ignore these identities, which are bots, collectives, twitter handles, or intentionally no email.
# The pattern is anchored with ^...$, so the whole candidate "email" text has
# to match one alternative; it is tested before email_typos is consulted.
# NOTE(review): as written, the bracketed `[email ...]` alternatives are regex
# character classes (they match a single character), which looks like
# redacted addresses rather than intent -- confirm against the upstream copy.
email_ignore = re.compile(r'^(.*@syzkaller.*.appspot.*|syzkaller@googlegroups\.com|[email protected]|(lpk|lkp)@intel.com|[email protected]|[email protected]|patchwork-bot\+.*@kernel.org|keescook\[email protected]|coverity scan|smatch|should come before them, without any blank lines. As the|A cast of thousands|KASAN|Ryota Shiga(?: \(Flatt Security\))?|Thomas Sattler|many different companies|Coverity Static Analyzer.*|Marco Scardovi|https?://.*|Marian Rainer-Harbach|Max VA|.* working with ZDI|Android Treehugger Robot|tag, or it goes automatically\?|is not accompanied by a link to the report\. Those links are|sanity checks in grsecurity|PaX.s SIZE_OVERFLOW plugin running on grsecurity.s syzkaller|Meysam Firouzi|Amirmohammad Eftekhar|Maarten van der Schrieck|.* \(@[^@]*\))$')
# Ignore these names, which may use a "real" email address, but are considered bots.
# A match only discards the name; the address itself is still recorded.
name_ignore = re.compile(r'^(kernel test robot|coverity-bot)$')
# Ignore one-off typos.
# Tested against the combined "Name <email>" text; the current pattern only
# matches an empty string, so it acts as a placeholder that never fires.
typo_ignore = re.compile(r'^$')
# Best guess adjustment of names for changes, latinization, abbreviation, and typos.
# match: replacement
# Looked up by guess_name() with an exact, case-sensitive match, after it has
# rewritten the first ", " in the name to a single space -- so a key that
# still contains ", " is never matched.
# A replacement of None makes guess_name() return None (see the "" entry).
spelling = {
"": None,
"Ville Syrjala": "Ville Syrjälä",
"Alexey Budankov": "Alexei Budankov",
"Yuval Basson": "Yuval Bason",
"Ricardo Ribalda": "Ricardo Ribalda Delgado",
"Sean Wang": "Soul Huang",
"Jernej Skrabec": "Jernej Škrabec",
"Alexandru M Stan": "Alexandru Stan",
"Bindu R": "Bindu Ramamurthy",
"Daniel Almeida": "Daniel W. S. Almeida",
"Wong, Vee Khee": "Wong Vee Khee",
"Gabriel C": "Gabriel Craciunescu",
"Enric Balletbo Serra": "Enric Balletbo i Serra",
"Yao Lihua": "Lihua Yao",
"Klaus Jensen": "Klaus Birkelund Jensen",
"Alex Gagniuc": "Alexandru Gagniuc",
"Michael Bringmann": "Michael W. Bringmann",
"Wang Hui": "Hui Wang",
"Rafael Tinoco": "Rafael David Tinoco",
"Yi Chen": "Chen Yi",
"Lee, Shawn C": "Lee Shawn C",
"Eugeniy Paltsev": "Evgeniy Paltsev",
"Manish": "Manish Chopra",
": Phillip Lougher": "Phillip Lougher",
"Aneesh Kumar": "Aneesh Kumar K.V.",
"Aneesh Kumar K.V": "Aneesh Kumar K.V.",
"zhangyi": "Zhang Yi Z",
"Brad Warrum": "Bradley Warrum",
"Wang Jian": "Jian Wang",
"Jose Bollo": "José Bollo",
"Yu Chen": "Chen Yu",
"jeffrey.lin": "Jeffrey Lin",
"Kevin Wangtao": "Tao Wang",
"Vitaly Mayatskih": "Vitaly Mayatskikh",
"Oza Oza": "Oza Pawandeep",
"Sunil Kovvuri Goutham": "Sunil Goutham",
"Yannick Fertre": "Yannick Fertré",
"Wilson Chris P": "Chris Wilson",
"Thomas Hellstrm": "Thomas Hellström",
"Thomas Hellstrom": "Thomas Hellström",
"Howard Chung": "Yun-Hao Chung",
"Pandruvada Srinivas": "Srinivas Pandruvada",
"Alexey Min": "Alexey Minnekhanov",
"Ondrej Jirman": "Ondřej Jirman",
"Javier González": "Javier González",
"David Airlie": "Dave Airlie",
"Zhongjun Tan": "Tan Zhongjun",
"Zqiang": "Zhang Qiang",
"Qiang.zhang": "Zhang Qiang",
"Justin M. Forbes": "Justin Forbes",
# Maybe ignore names shorter than 3 chars?
"sh": "Hui Su",
"Raphael Gallais-Pou": "Raphaël Gallais-Pou",
"XU pengfei": "Pengfei Xu",
"Soenke Huster": "Sönke Huster",
"Ondrej Mosnacek": "Ondrej Mosnáček",
# https://lore.kernel.org/all/[email protected]/
"SJLIN0": "Wallace Lin",
}
# Corrections for malformed addresses.  The key is the exact address text
# left over after the name/email split (exact, case-sensitive dict lookup);
# the value is the address to use instead.  The lookup runs after the
# email_ignore check.  The hex strings in the comments below are presumably
# the commits where each typo was seen -- TODO confirm.
email_typos = {
'mpatocka redhat com': '[email protected]',
'eraniangoogle.com': '[email protected]',
'mgross.linux.intel.com': '[email protected]',
# cb774bd35318c1b4cb61f6f2caac85537d07fbde
'aspriel.gmail.com': '[email protected]',
# 8a1ec3f3275479292613273a7be2ac87f2a7f6e6
'imre.deak.intel.com': '[email protected]',
# 8b097881b54cbc23dd78262ed88c9924d00ea457
'Vineet Gupta [arc]': '[email protected]',
# 0cd9d33ace336bc424fc30944aa3defd6786e4fe
'https://github.com/joanbm': '[email protected]',
# bcd97734318d1d87bb237dbc0a60c81237b0ac50
# ee50e67ba0e17b1a1a8d76691d02eadf9e0f392c
'mathew.j.martineau>@linux.intel.com': '[email protected]',
# d1f1cecc92ae0dba44eac3ce10baf4edb4553e41
'irogers>@google.com': '[email protected]',
}
# Whole-trailer rewrites.  The key is the full text that follows "-by:" (once
# the comment, mailto and trailing-bracket fixups have been applied); the
# value is the "Name <email>" text that is parsed in its place.  Matching is
# an exact dict lookup.
# NOTE(review): several keys below are textually identical (for example the
# two 'Martin <...>' entries); in a dict literal the later entry silently
# replaces the earlier one -- confirm the addresses against the upstream copy.
full_replace = {
# b88aef36b87c9787a4db724923ec4f57dfd513f3 typo
'Zdenek Kabelac <[email protected]>': 'Zdenek Kabelac <[email protected]>',
# 77422a8f6f61be1ef64978e9a94f40fed0d1634e typo
'Roi Dayan <[email protected]>': 'Roi Dayan <[email protected]>',
# b04d910af330b55e1d5d6eb9ecd53a375a9cf81c typo
'Longpeng <<a href="mailto:[email protected]" target="_blank">[email protected]</a>><br>': 'Longpeng(Mike) <[email protected]>',
# 81d46d693173a5c86a9b0c648eca1817ad5c0ae5 typo
'Longpeng <<a href="mailto:[email protected]" target="_blank">[email protected]</a>><br></blockquote><div><br></div><div>Acked-by: Jason Wang <<a href="mailto:[email protected]">[email protected]</a>></div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">': 'Longpeng(Mike) <[email protected]>',
# db405774f6a80c5607dcf43ec810f078bb5c660d typo
'Greg Kroah-Hartman <[email protected]>Cc: Dave Airlie <[email protected]>': 'Greg Kroah-Hartman <[email protected]>',
# 00bfe94e388fe12bfd0d4f6361b1b1343374ff5b typo
'Subbaraya Sundeep <[email protected]>': 'Subbaraya Sundeep <[email protected]>',
# Name order and uncolliding
'Zhang Rui <[email protected]>': 'Rui Zhang <[email protected]>',
# bda24462578ca2b0538d9257509070708ce41acc typo?
'Ariel Bernstein <[email protected]>': 'Ariel Bernstein <[email protected]>',
# Uncolliding with 'Rong Chen <[email protected]>'
'Rong Chen <[email protected]>': 'Rong A Chen <[email protected]>',
# Uncolliding with '[email protected]'
'Aaron Ma <[email protected]>': 'Pengyu Ma <[email protected]>',
# 2b9f28d5e8efad34f472542315911c5ee9a65b6c
# 9f4ce5d72b8e7a1f750598407c99f9e39dfb12fc
'Jason Wang <a class="moz-txt-link-rfc2396E" href="mailto:[email protected]"><[email protected]> </a>': 'Jason Wang <[email protected]>',
# fe80536acf8397827be77f9b8ada384b90e790d0
'Martin <[email protected]>': 'Martin Varghese <[email protected]>',
# Various
'Taketo Kabe': 'Taketo Kabe <[email protected]>',
# Various
'AngeloGioacchino Del Regno': 'AngeloGioacchino Del Regno <[email protected]>',
# Various
'Andrey Grodzovsky [email protected]': 'Andrey Grodzovsky <[email protected]>',
# 5ddf994fa22f78ae3742d72520a8c3e8521d96cd
'kernel test robot [email protected]': 'kernel test robot <[email protected]>',
# 21f965221e7c42609521342403e8fb91b8b3e76e
'Sherlock Holo [email protected]': 'Sherlock Holo <[email protected]>',
# Various
'Srinivas Pandruvada <srinivas.pI [email protected]>': 'Srinivas Pandruvada <[email protected]>',
# d6020f4b26179481c7cb13aa94d7abcdfd8a4326
'Hans Verkuil <hverkuil-cisco at xs4all.nl>': 'Hans Verkuil <[email protected]>',
# a4d1846512e12f9e84be63b3f2f2165e4d0d2d1e
'Uwe Kleine-König <(address hidden)>': 'Uwe Kleine-König <[email protected]>',
# b2cfec52feb3bb737c4b65018ef4bfe9789e4be8
'Inki Dae <[email protected]>': 'Inki Dae <[email protected]>',
# a820190204aef0739aa3a067d00273d117f9367c
'Martin <[email protected]>': 'Martin Leung <[email protected]>',
# 10a9accd4842e6098cc27d79d43d7542254003bc
'Ilya <[email protected]>': 'Ilya Bakoulin <[email protected]>',
# e3290f883127159e3aa7957f30bd4266602d403e
'Suraj Kandpal': 'Suraj Kandpal <[email protected]>',
# aa8a950a5d6b2094830aff834198777371ff91ff
'AceLan Kao <[email protected]>': 'AceLan Kao <[email protected]>',
# e5011447376e1b050847ccb2ef7933176ce4de41
'Doug Anderson <[email protected]>': 'Douglas Anderson <[email protected]>',
# b49f700668fff7565b945dce823def79bff59bb0
'Prabhakar <[email protected]>': 'Lad Prabhakar <[email protected]>',
# 37f5b858a66543b2b67c0288280af623985abc29
'Daniel Kaehn <[email protected]>': 'Danny Kaehn <[email protected]>',
# ef43c30858754d99373a63dff33280a9969b49bc
'Tan Shaopeng <[email protected]>': 'Shaopeng Tan <[email protected]>',
# d6a76c0a5a75b519ce81cd472077f9e76db5d6c3
'Shaoyun.liu <[email protected]>': 'shaoyunl <[email protected]>',
# c1839501fe3e67d98635f159dba8b170d08f6521
'edmund.raile <[email protected]>': 'Edmund Raile <[email protected]>',
# 68a24aba7c593eafa8fd00f2f76407b9b32b47a9
'Yang, Chenyuan <[email protected]>': 'Chenyuan Yang <[email protected]>',
# 8e4ff684762b6503db45e8906e258faee080c336
'"Harris, James R" <[email protected]>': 'Jim Harris <[email protected]>',
# b646ce9ce99f74d3dee8fd56303b9255d3c278ec
'Matt Roper <matthew.d.roper at intel.com>': 'Matt Roper <[email protected]>',
# 84d2db91f14a32dc856a5972e3f0907089093c7a
'Bongsu Jeon': 'Bongsu Jeon <[email protected]>',
}
def guess_name(name):
    """Return the canonical spelling of *name*, or None when there is none.

    The first ", " in the name is rewritten to a single space, and the
    result is then looked up in the ``spelling`` table; names without an
    entry are returned as rewritten.  ``None`` is passed straight through.
    """
    if name is None:
        return None
    # Equivalent to splitting once on ", " and re-joining with a space.
    flattened = name.replace(", ", " ", 1)
    return spelling.get(flattened, flattened)
def flatten(name):
    """Return *name* lower-cased, for use as a case-insensitive lookup key."""
    lowered = name.lower()
    return lowered
class Person:
    """A single contributor, aggregated over every name/email spelling seen.

    Attributes:
        emails: every address recorded for this person; the first entry is
            the address the object was created with.
        names: every non-empty name recorded through add_name().
        commits: maps a commit sha to the number of times this person was
            recorded on that commit.
        fullnames: maps "Name <email>" (or a bare email when no name was
            given) to the number of times that exact spelling was recorded.
        best_name: the most frequently recorded key of ``fullnames``.
        best_count: the count that belongs to ``best_name``.
    """

    def __init__(self, email, name=None):
        self.emails = [email]
        self.names = []
        self.commits = {}
        self.best_count = 0
        self.fullnames = {}
        self.best_name = None
        self.add_name(name)

    def has_name(self):
        """Return True when at least one name has been recorded."""
        return bool(self.names)

    def has_email(self, email):
        """Return True when *email* (exact match) is already recorded."""
        return email in self.emails

    # Only for "absorb"
    def get_email(self):
        """Return the only recorded address.

        Raises:
            ValueError: when more than one address is recorded, since there
                is then no single address to return.
        """
        if len(self.emails) == 1:
            return self.emails[0]
        # Interpolate the addresses into the message; passing them as a
        # second exception argument would leave the "%s" unformatted.
        raise ValueError("Whoops, trying to get email when more than 1 exist: [%s]"
                         % "] [".join(self.emails))

    def add_fullname(self, email, name=None):
        """Count one more sighting of this email/name spelling.

        The spelling is keyed as "name <email>", or as the bare email when
        *name* is empty.  ``best_name``/``best_count`` are updated when this
        spelling becomes strictly more frequent than the current best.
        """
        if name:
            fullname = "%s <%s>" % (name, email)
        else:
            fullname = '%s' % (email)
        seen = self.fullnames.get(fullname, 0) + 1
        self.fullnames[fullname] = seen
        if seen > self.best_count:
            self.best_name = fullname
            self.best_count = seen

    def add_email(self, email):
        """Record *email* unless it is already known."""
        if email not in self.emails:
            self.emails.append(email)

    def add_name(self, name):
        """Record *name*; empty or None names are ignored."""
        if name:
            self.names.append(name)

    def add_commit(self, sha):
        """Count one more sighting of this person on commit *sha*."""
        self.commits[sha] = self.commits.get(sha, 0) + 1

    # For combining nameless Person
    def add_all_commits(self, other):
        """Add every per-commit count of *other* to this person's counts."""
        for sha, seen in other.commits.items():
            self.commits[sha] = self.commits.get(sha, 0) + seen

    # To find best email address.
    def collapse_aliases(self):
        """Fold "user+tag@domain" spellings into "user@domain".

        A spelling is only folded when the plain "user@domain" address is
        already recorded for this person.  Folded spellings keep their key
        in ``fullnames`` with a count of zero, their former count is added
        to the plain spelling, and ``best_name``/``best_count`` are then
        recomputed from scratch.

        Raises:
            ValueError: when a recorded spelling contains no "@".
        """
        drop = []
        add = {}
        for fullname, seen in self.fullnames.items():
            name = None
            email = fullname
            if '<' in email:
                name, email = fullname.split(' <', 1)
                name = name.strip()
            if email.endswith('>'):
                email = email.split('>')[0]
            if '@' not in email:
                print(self.dump(True), file=sys.stderr)
                raise ValueError("missing @ in [%s]: %s" % (email, fullname))
            user, domain = email.split('@', 1)
            # No +, move on.
            if '+' not in user:
                continue
            # Reconstruct without +... suffix
            email = "%s@%s" % (user.split('+', 1)[0], domain)
            # If this is a novel email, move on.
            if not self.has_email(email):
                continue
            if name:
                collapsed = "%s <%s>" % (name, email)
            else:
                collapsed = '%s' % (email)
            add[collapsed] = add.get(collapsed, 0) + seen
            drop.append(fullname)

        # Zero out any collapsed entries
        for fullname in drop:
            self.fullnames[fullname] = 0
        # Insert any new counts
        for fullname, extra in add.items():
            self.fullnames[fullname] = self.fullnames.get(fullname, 0) + extra

        # Recount most used full name (first one wins on a tie).
        self.best_name = None
        self.best_count = 0
        for fullname, seen in self.fullnames.items():
            if seen > self.best_count:
                self.best_name = fullname
                self.best_count = seen

    def dump(self, show_all_emails=False):
        """Return "<commit count>\\t<best name>" for this person.

        With *show_all_emails* set, every other recorded spelling follows on
        its own tab-indented line.
        """
        out = "%d\t%s" % (len(self.commits), self.best_name)
        if show_all_emails:
            for fullname in self.fullnames:
                if fullname != self.best_name:
                    out += "\n\t%s" % (fullname)
        return out
class Pool:
    """Registry that resolves every email address and name to one Person.

    Attributes:
        email_to_person: maps a lower-cased email address to its Person.
        name_to_person: maps a lower-cased name to its Person.
        people: every Person created, in creation order.
        date: the most recent date string passed to set_date(); only used
            when reporting collisions.
    """

    def __init__(self):
        self.email_to_person = {}
        self.name_to_person = {}
        self.people = []
        # For debugging collisions.
        self.date = None

    def saw_email(self, person, email):
        """Record *email* on *person* and index it; return *person*.

        An address that is already indexed keeps its existing owner.
        """
        person.add_email(email)
        self.email_to_person.setdefault(flatten(email), person)
        return person

    def saw_name(self, person, name):
        """Record *name* on *person* and index it; return *person*.

        Empty names are ignored.  A name that is already indexed keeps its
        existing owner.
        """
        if name:
            # Names belong in person.names (not in the email list), so that
            # has_name() reflects them and get_email() still sees a single
            # address.  Skip names the Person already carries.
            if name not in person.names:
                person.add_name(name)
            self.name_to_person.setdefault(flatten(name), person)
        return person

    def collapse_aliases(self):
        """Run Person.collapse_aliases() on everyone in the pool."""
        for person in self.people:
            person.collapse_aliases()

    def absorb(self, complete, part):
        """Merge the nameless Person *part* into *complete*; return *complete*.

        The commits of *part* are added to *complete* and the address of
        *part* is re-pointed at *complete*.

        Raises:
            ValueError: when *part* carries more than one address.
        """
        # NOTE(review): *part* stays in self.people and keeps its own
        # fullnames, so it is still listed by dump() -- confirm intended.
        # Take all the commits
        complete.add_all_commits(part)
        # Remove old email entry
        email = flatten(part.get_email())
        del self.email_to_person[email]
        # Record email on the complete Person
        self.saw_email(complete, email)
        return complete

    def found(self, sha, email, name=None):
        """Record that *email* (optionally with *name*) appeared on *sha*.

        Returns the Person the sighting was recorded on.  Returns None,
        after a message on stderr, when *email* has no "@" or contains a
        space, "<" or ">".  When the email and the name resolve to two
        different, unmergeable people, the collision is reported on stderr
        and the Person found by email is returned without recording
        anything.
        """
        report = "%s <%s>" % (name, email) if name else email
        if '@' not in email:
            print("%s: ignoring email without @: %s" % (sha, report), file=sys.stderr)
            return
        hit = bad_email.search(email)
        if hit:
            print("%s: ignoring email with '%s': %s" % (sha, hit.group(1), report), file=sys.stderr)
            return

        person = None
        person_by_email = self.email_to_person.get(flatten(email), None)
        if name:
            person_by_name = self.name_to_person.get(flatten(name), None)
        else:
            person_by_name = None

        if person_by_name and person_by_email:
            # If we find a Person entry with a name, and it has an
            # email conflict with a Person without a name, merge
            # the nameless Person with the named Person.
            if person_by_email != person_by_name and \
               not person_by_email.has_name() and \
               not person_by_name.has_email(email):
                self.absorb(person_by_name, person_by_email)
                person_by_email = None
            else:
                person = person_by_email
                # If there are existing people with conflicting details, fail.
                if person_by_email != person_by_name:
                    print("%s\nCollided %s (name:[%s] email:%s) on two people:\n%s\n\n%s" % (self.date, sha, name, email, person_by_email.dump(show_all_emails=True), person_by_name.dump(show_all_emails=True)), file=sys.stderr)
                    return person

        # If we found a person by email only, add their name.
        if person_by_email and not person_by_name:
            person = self.saw_name(person_by_email, name)
        # If we found a person by name only, add their email.
        if person_by_name and not person_by_email:
            person = self.saw_email(person_by_name, email)
        # If we found a completely new person, create their entry.
        if not person:
            person = Person(email, name)
            self.people.append(person)
            self.saw_email(person, email)
            self.saw_name(person, name)

        # Record this email/name combo.
        person.add_fullname(email, name)
        # Record their activity on this sha.
        person.add_commit(sha)
        return person

    def set_date(self, date):
        """Remember *date* (whitespace-stripped) for collision reports."""
        self.date = date.strip()

    def dump(self, show_all_emails=False):
        """Return one Person.dump() string per person, in creation order."""
        return [person.dump(show_all_emails) for person in self.people]
# Registry of everyone seen, and the commit whose trailers are being read.
pool = Pool()
sha = None

for line in sys.stdin:
    # Header lines only update the current commit / date context.
    if line.startswith('commit '):
        sha = line.split(' ')[1].strip()
        continue
    if line.startswith(('Date: ', 'AuthorDate: ')):
        pool.set_date(" ".join(line.split(' ')[1:]))
        continue

    # Everything else is only interesting when it is a "...-by:" trailer.
    trailer = by.search(line)
    if trailer is None:
        continue
    line = trailer.group(2).strip()

    # Drop comment trailers
    line = line.partition(' #')[0].strip()

    # Fix pasted "mailto" tags
    pasted = mailto.search(line)
    if pasted is not None:
        line = pasted.group(1).strip()

    # Try to fix common trailing typos
    if '<' in line and '>' not in line:
        if line.endswith(('.', ')')):
            line = line[:-1]
        line += '>'
    if line.endswith('>>'):
        line = line[:-1]

    # Perform full-line replacements.
    line = full_replace.get(line, line)

    # Try to split name from email
    pieces = splitter.search(line)
    if pieces is None:
        name = None
        email = line
    else:
        name = pieces.group(1).strip()
        email = pieces.group(2).strip()
        # Unwrap a quoted name first, then drop a trailing affiliation.
        for wrapper in (quoted, affiliated):
            inner = wrapper.search(name)
            if inner is not None:
                name = inner.group(1).strip()
        name = guess_name(name)

    # Special case: is this a malformed email lacking <>s that we can easily handle?
    # e.g. "Some Name user@example.org"
    if ' ' in email:
        words = email.split(' ')
        last_word = words.pop()
        if '@' in last_word:
            email = last_word
            if email.startswith('<'):
                email = email[1:]
            if email.endswith('>'):
                email = email[:-1]
            if words:
                name = " ".join(words)

    # Ignore various emails.
    if email_ignore.search(email):
        continue

    # Replace email typos.
    email = email_typos.get(email, email)

    # Ignore various names.
    if name and name_ignore.search(name):
        name = None
    # Skip specific name+email typos.
    if name and typo_ignore.search('%s <%s>' % (name, email)):
        continue

    person = pool.found(sha, email, name)

# Post-process to collapse "+"s in email aliases
pool.collapse_aliases()

# "--full" as the first argument also lists every alternate spelling.
full = len(sys.argv) > 1 and sys.argv[1] == '--full'
print("\n".join(pool.dump(full)))