-
Notifications
You must be signed in to change notification settings - Fork 29
/
mindmatch_cluster.py
100 lines (86 loc) · 4 KB
/
mindmatch_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
"""Mind match with pre-clustering: a script for matching people to people in the conference
Here, we add a trick to make the problem smaller by applying spectral clustering and
apply mind-matching to each cluster.
Usage:
mindmatch_cluster.py PATH [--n_match=<n_match>] [--n_trim=<n_trim>] [--output=<output>] [--n_clusters=<n_clusters>]
mindmatch_cluster.py [-h | --help]
mindmatch_cluster.py [-v | --version]
Arguments:
PATH Path to a CSV file,
a file need to have ('user_id', 'fullname', 'abstracts', 'conflicts') in the header
Options:
-h --help Show documentation helps
--version Show version
--n_match=<n_match> Number of match per user
--n_clusters=<n_clusters> Number of cluster before performing mindmatch
--n_trim=<n_trim> Trimming parameter for distance matrix, increase to reduce problem size
--output=<output> Output CSV file contains 'user_id' and 'match_ids' which has match ids with ; separated
"""
import numpy as np
import pandas as pd
from docopt import docopt
from paper_reviewer_matcher import (
preprocess,
compute_topics,
perform_mindmatch,
compute_conflicts,
calculate_affinity_distance
)
from sklearn.cluster import SpectralClustering
if __name__ == "__main__":
    # Parse CLI arguments from the module docstring (docopt convention).
    arguments = docopt(__doc__, version='MindMatch 0.1.dev')
    file_name = arguments['PATH']
    df = pd.read_csv(file_name).fillna('')
    # The matcher requires these four columns; fail fast with a clear message.
    for col in ('user_id', 'fullname', 'abstracts', 'conflicts'):
        assert col in df.columns, "CSV file must have ``{}`` in the columns".format(col)
    print("Number of people in the file = {}".format(len(df)))

    # --n_match: number of matches produced per participant (default 6).
    n_match = arguments.get('--n_match')
    if n_match is None:
        n_match = 6
        print('<n_match> is set to default for 6 match per user')
    else:
        n_match = int(n_match)
        print('Number of match is set to {}'.format(n_match))
    assert n_match >= 2, "You should set <n_match> to be more than 2"

    # --n_trim: distance-matrix trimming; 0 means no trimming (slow for large inputs).
    n_trim = arguments.get('--n_trim')
    if n_trim is None:
        n_trim = 0
        print('<n_trim> is set to default, this will take very long to converge for a large problem')
    else:
        n_trim = int(n_trim)
        print('Trimming parameter is set to {}'.format(n_trim))

    # --n_clusters: number of spectral clusters (default 4).
    n_clusters = arguments.get('--n_clusters')
    if n_clusters is None:
        # BUG FIX: original assigned the default to a misspelled ``n_cluters``,
        # so ``n_clusters`` stayed None and SpectralClustering would crash.
        n_clusters = 4
        print('Setting number of clusters <n_clusters> to 4')
    else:
        n_clusters = int(n_clusters)
        # BUG FIX: message previously hard-coded "4" regardless of the input.
        print('Setting number of clusters to {}'.format(n_clusters))

    # BUG FIX: docopt stores long options with their dashes; 'output' was
    # always None, so a user-supplied --output path was silently ignored.
    output_filename = arguments.get('--output')
    if output_filename is None:
        output_filename = 'output_match.csv'

    # Topic vectors from the preprocessed abstracts.
    X_topic = compute_topics(list(map(preprocess, list(df['abstracts']))))
    spectral_clustering = SpectralClustering(n_clusters=n_clusters, random_state=42)
    labels = spectral_clustering.fit_predict(X_topic)
    # HACK: force even group sizes — specific to this example dataset.
    labels[0] = 3
    df["group"] = labels
    df["topics"] = [x for x in X_topic]

    # Run mind-matching independently inside each cluster and collect matches.
    output = []
    for _, df_group in df.groupby("group"):
        X = np.vstack(df_group.topics.values)  # topic vectors for this cluster
        A = calculate_affinity_distance(X, X)  # affinity matrix
        cois = compute_conflicts(df_group.reset_index(drop=True))  # COIs from names
        # BUG FIX: pass the user-supplied n_trim / n_match instead of the
        # hard-coded 10 / 6, which made the CLI options no-ops.
        b = perform_mindmatch(A, n_trim=n_trim, n_match=n_match, cois=cois)
        # Map positional index within the cluster back to the global user_id.
        user_ids_map = {ri: r['user_id'] for ri, r in df_group.reset_index(drop=True).iterrows()}
        for i in range(len(b)):
            match_ids = [str(user_ids_map[b_]) for b_ in np.nonzero(b[i])[0]]
            output.append({
                'user_id': user_ids_map[i],
                'match_ids': ';'.join(match_ids)
            })
    output_df = pd.DataFrame(output)
    output_df.to_csv(output_filename, index=False)