# clustering_metrics.py

import itertools

import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score

# Note: the `cluster2words_freq_dist` arguments below are expected to hold
# nltk.FreqDist objects (one frequency distribution of keywords per cluster).


# Get number of clusters
def get_num_clusters(cluster2indices):
    return len(cluster2indices)


# Get min size of clusters
def get_min_cluster_size(cluster2indices):
    return min(len(indices) for indices in cluster2indices.values())


# Get max size of clusters
def get_max_cluster_size(cluster2indices):
    return max(len(indices) for indices in cluster2indices.values())


# Get mean size of clusters
def get_mean_cluster_size(cluster2indices):
    return np.mean([len(indices) for indices in cluster2indices.values()])


# Get median size of clusters
def get_median_cluster_size(cluster2indices):
    return np.median([len(indices) for indices in cluster2indices.values()])
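
# Example (hypothetical data): `cluster2indices` maps a cluster label to the list
# of dataframe row positions belonging to that cluster, e.g.
#     cluster2indices = {"0": [0, 3, 7], "1": [1, 2]}
#     get_num_clusters(cluster2indices)       # -> 2
#     get_min_cluster_size(cluster2indices)   # -> 2
#     get_mean_cluster_size(cluster2indices)  # -> 2.5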


# Intra-cluster similarity - semantic scholar keywords or CSO topics.
# For each cluster, calculate the percentage of papers having at least one keyword
# among the top/most common n_ keywords of the cluster, then average the percentages.
# source: 'cso' or 'sem_scholar'
def intra_cluster_similarity(cluster2indices, cluster2words_freq_dist, source, df_clustered, n=[10]):
    # for each cluster, count how many papers have at least one keyword among the top n keywords
n2cluster2represented_papers = dict()
# for all n_ in n consider top n_ keywords
for n_ in n:
n2cluster2represented_papers[n_] = dict()
# for all clusters
for cluster_index in cluster2indices:
# get cluster = list of paper indices in dataframe
cluster = cluster2indices[cluster_index]
            # initialise the current number of papers to 0
n2cluster2represented_papers[n_][cluster_index] = 0
# get top, most common n_ keywords for current cluster
most_common = [x for (x, _) in cluster2words_freq_dist[cluster_index].most_common(n_)]
# check for all papers if they have at least one keyword in most_common list
for paper_index in cluster:
                if source == "sem_scholar":
                    # get semantic scholar keywords of current paper
                    words = df_clustered.iloc[paper_index]["semantic_scholar_keywords"]
                elif source == "cso":
                    # get cso topics of current paper
                    words = df_clustered.iloc[paper_index]["cso_enhanced"]
                else:
                    raise ValueError("Unknown source: {}".format(source))
if not set(words).isdisjoint(set(most_common)):
                    # increase the number of papers in the dictionary by 1
n2cluster2represented_papers[n_][cluster_index] += 1
    # for each n_ in n, calculate the mean of the percentages of papers with at least
    # one keyword in the top n_ of each cluster,
    # e.g. cluster 0: 5/10 have keywords in top n_ => 50% = 0.5
    #      cluster 1: 2/10 have keywords in top n_ => 20% = 0.2
    #      => average for n_: (0.5 + 0.2) / 2 = 0.35
    # for all n_ in n, store the mean percentage over clusters
n2mean_percentage = dict()
for n_ in n:
percentages = []
for cluster_index in n2cluster2represented_papers[n_]:
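            # skip the noise/outlier cluster labelled -1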
if int(cluster_index) != -1:
repr_papers = n2cluster2represented_papers[n_][cluster_index]
total_papers = len(cluster2indices[cluster_index])
percentages.append(repr_papers/total_papers)
n2mean_percentage[n_] = np.mean(percentages)
return n2mean_percentage
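
# Example (hypothetical data): `cluster2words_freq_dist` maps each cluster label to
# an nltk.FreqDist over the keywords of that cluster's papers, e.g.
#     from nltk import FreqDist
#     c2fd = {"0": FreqDist(["nlp", "nlp", "parsing"]), "1": FreqDist(["vision"])}
#     intra_cluster_similarity(cluster2indices, c2fd, "sem_scholar", df_clustered, n=[5])
#     # -> {5: mean fraction of papers per cluster with a keyword in the cluster's top 5}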


# Inter-cluster similarity - semantic scholar keywords or CSO topics.
# For all cluster pairs (c1, c2), calculate the Spearman correlation of their lists
# of top n_ keywords, then average these similarity scores over the number of pairs.
# source: 'cso' or 'sem_scholar'
def inter_cluster_similarity(cluster2indices, cluster2words_freq_dist, source, n=[10]):
cluster_indices = list(cluster2indices.keys())
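    # drop the noise/outlier cluster (label -1, which may be stored as str or int)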
if "-1" in cluster_indices:
cluster_indices.remove("-1")
if -1 in cluster_indices:
cluster_indices.remove(-1)
# create all pairs of clusters
cluster_pairs = [(c1, c2) for c1, c2 in itertools.combinations(cluster_indices, 2)]
    # for each top/most common n_ keywords, store the similarity value
    # (Spearman correlation of the top n_ keyword lists) of each cluster pair
n2pair_similarities = dict()
for n_ in n:
n2pair_similarities[n_] = []
for c1, c2 in cluster_pairs:
words_freq_dist_c1 = cluster2words_freq_dist[c1]
words_freq_dist_c2 = cluster2words_freq_dist[c2]
c1_n_most_common = [x for x, _ in words_freq_dist_c1.most_common(n_)]
c2_n_most_common = [x for x, _ in words_freq_dist_c2.most_common(n_)]
            # Spearman correlation requires two lists of equal length
min_length = min(len(c1_n_most_common), len(c2_n_most_common))
correlation, _ = spearmanr(c1_n_most_common[:min_length], c2_n_most_common[:min_length])
n2pair_similarities[n_].append(correlation)
    # for each n_, calculate the mean of the correlations over all cluster pairs
n2mean_correlation = dict()
for n_ in n:
correlations = n2pair_similarities[n_]
n2mean_correlation[n_] = np.mean(correlations)
return n2mean_correlation
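
# Example (hypothetical): identically ordered top-n keyword lists yield a Spearman
# correlation of 1.0, so lower mean values indicate more distinct clusters:
#     inter_cluster_similarity(cluster2indices, cluster2words_freq_dist,
#                              "sem_scholar", n=[10])
#     # -> {10: mean Spearman correlation over all cluster pairs}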


# Calculate a similarity score of a word list compared to a word frequency
# distribution, normalized by cluster size
def wordlist_freq_dist_similarity(words, word_freq_dist, cluster_size):
    score = 0
    # for each keyword, add its relative frequency within the cluster;
    # an nltk FreqDist returns 0 for unseen words, so missing keywords add nothing
    for word in words:
        score += word_freq_dist[word] / cluster_size
    return score
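
# Example (hypothetical): with word_freq_dist = FreqDist({"nlp": 2, "parsing": 1})
# and cluster_size = 4, the word list ["nlp", "cnn"] scores 2/4 + 0 = 0.5.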


# Calculate 1. the accuracy, 2. a mean ranking score of new unclustered papers, and
# 3. an absolute score based on the % of cluster members sharing a paper's words.
# parameter gold: a list of clusters, one cluster per classified paper, aligned with
#                 the (assumed 0-based) index of df_not_clustered
# source: 'cso' or 'sem_scholar'
def classification_acc_ranking(gold, cluster2words_freq_dist, source, cluster2indices, df_not_clustered):
    # for each newly classified paper (identified by its index in df_not_clustered), get its keywords
index2words = dict()
for i, row in df_not_clustered.iterrows():
if source == "sem_scholar":
index2words[i] = row["semantic_scholar_keywords"]
elif source == "cso":
index2words[i] = row["cso_enhanced"]
        else:
            raise ValueError("Unknown source: {}".format(source))
# for each paper index store mapping from cluster to score
index2cluster2score = dict()
for index in index2words:
index2cluster2score[index] = dict()
words = index2words[index]
# for each cluster
for cluster in cluster2words_freq_dist:
# get similarity score of paper's words and the word frequency distribution of the current cluster
score = wordlist_freq_dist_similarity(words, cluster2words_freq_dist[cluster], len(cluster2indices[cluster]))
# save score in dictionary
index2cluster2score[index][cluster] = score
    # 1. first metric - accuracy of the predicted clusters with regard to gold (labels from the classifier)
# for each paper index store cluster with highest score
index2best_cluster = dict()
for index in index2words:
max_score = -1
best_cluster = 0
for cluster in index2cluster2score[index]:
if int(cluster) != -1:
score = index2cluster2score[index][cluster]
if score > max_score:
max_score = score
best_cluster = cluster
index2best_cluster[index] = best_cluster
# create list of cluster predictions based on keywords
predicted = [index2best_cluster[index] for index in index2best_cluster]
# calculate accuracy
acc = accuracy_score(gold, predicted)
    # 2. second metric - at which position the correct cluster appears in the list of
    # clusters sorted by score: each paper contributes 1/p, where p is the position
    # of the gold cluster in the paper's sorted score list
ranking_scores = []
for index in index2words:
cluster2score = index2cluster2score[index]
        for i, (cluster, _) in enumerate(sorted(cluster2score.items(), key=lambda x: x[1], reverse=True)):
            if gold[index] == cluster:
                ranking_scores.append(1 / (i + 1))
                break
    # calculate the mean ranking score (i.e. the mean reciprocal rank)
mean_ranking_score = np.mean(ranking_scores)
    # 3. third metric - score each paper by the relative frequency of its words in
    #    its gold cluster (frequency / cluster size), then average over papers
scores = []
for index in index2words:
words = index2words[index]
score = 0
for word in words:
score += cluster2words_freq_dist[gold[index]][word] / len(cluster2indices[gold[index]])
scores.append(score)
mean_absolute_score = np.mean(scores)
return acc, mean_ranking_score, mean_absolute_score
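

# Minimal smoke test on hypothetical toy data (a sketch, not part of the metrics):
# builds the expected input structures and exercises the size statistics, the
# intra-/inter-cluster similarity metrics, and the classification metrics.
if __name__ == "__main__":
    import pandas as pd
    from nltk import FreqDist

    # two clusters, identified by positional (iloc) indices into df_clustered
    cluster2indices = {"0": [0, 1], "1": [2, 3]}
    df_clustered = pd.DataFrame({
        "semantic_scholar_keywords": [
            ["nlp", "parsing"], ["nlp", "tagging"],
            ["vision", "cnn"], ["vision", "detection"],
        ],
        "cso_enhanced": [
            ["natural language processing"], ["natural language processing"],
            ["computer vision"], ["computer vision"],
        ],
    })
    # per-cluster keyword frequency distributions
    cluster2words_freq_dist = {
        c: FreqDist(w for i in idx
                    for w in df_clustered.iloc[i]["semantic_scholar_keywords"])
        for c, idx in cluster2indices.items()
    }

    print("clusters:", get_num_clusters(cluster2indices))
    print("min/max size:", get_min_cluster_size(cluster2indices),
          get_max_cluster_size(cluster2indices))
    print("intra:", intra_cluster_similarity(cluster2indices, cluster2words_freq_dist,
                                             "sem_scholar", df_clustered, n=[2]))
    print("inter:", inter_cluster_similarity(cluster2indices, cluster2words_freq_dist,
                                             "sem_scholar", n=[2]))

    # two new, unclustered papers with hand-assigned gold clusters
    df_not_clustered = pd.DataFrame({
        "semantic_scholar_keywords": [["nlp"], ["vision", "cnn"]],
        "cso_enhanced": [["natural language processing"], ["computer vision"]],
    })
    gold = ["0", "1"]  # aligned with the 0-based index of df_not_clustered
    print("classification (acc, mrr, abs):",
          classification_acc_ranking(gold, cluster2words_freq_dist, "sem_scholar",
                                     cluster2indices, df_not_clustered))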