-
Notifications
You must be signed in to change notification settings - Fork 5
/
similarity.py
125 lines (101 loc) · 3.53 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
def compute_jaccard_sim(test_sent, train_sent):
    """Return a word-overlap similarity score between two sentences.

    Both sentences are lower-cased and whitespace-split into word sets;
    the score is |test ∩ train| / (|test| + 1).

    NOTE(review): this is not the standard Jaccard index (which divides
    by the size of the *union*) — the denominator here is the test-set
    size plus one, so the score is asymmetric and never reaches 1.0.
    Downstream thresholds appear tuned to this variant, so behavior is
    deliberately preserved.
    """
    test_words = set(test_sent.lower().split())
    train_words = set(train_sent.lower().split())
    overlap = test_words & train_words
    return len(overlap) / float(len(test_words) + 1)
def pre_proccess(sentence):
    """Tokenize a sentence, drop English stopwords, then stem + lemmatize.

    Each surviving token is first reduced with the Porter stemmer and the
    result is then lemmatized with WordNet (same stem-then-lemmatize order
    as before).

    Parameters
    ----------
    sentence : str
        Raw input sentence.

    Returns
    -------
    list of str
        Processed tokens, in their original order.
    """
    # Hoist the relatively expensive NLTK object construction out of the
    # per-word loop — the original rebuilt both the lemmatizer and the
    # stemmer on every single token.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(sentence.lower())
    return [lemmatizer.lemmatize(stemmer.stem(w))
            for w in words
            if w not in stop_words]
def lesk_context_bag(context_sentence):
    """Build the flat gloss bag for a preprocessed context sentence.

    For each token, every WordNet synset's definition (gloss) is run
    through ``pre_proccess`` and all resulting gloss tokens are gathered
    into one flat list, duplicates included.

    Parameters
    ----------
    context_sentence : iterable of str
        Tokens of an already-preprocessed sentence.

    Returns
    -------
    list of str
        All gloss words across all senses of all tokens.
    """
    bag = []
    for token in context_sentence:
        for sense in wn.synsets(token):
            bag.extend(pre_proccess(str(sense.definition())))
    return bag
def compute_lesk_score(sentence1, sentence2):
    """Lesk-style gloss-overlap score between two raw sentences.

    Both sentences are preprocessed, expanded into WordNet gloss bags,
    and scored as |bag1 ∩ bag2| / (1 + |bag1|), where each bag is the
    set of unique gloss words.  The ``+ 1`` in the denominator guards
    against division by zero when sentence1 yields no gloss words.

    Returns
    -------
    float
        Overlap score in [0, 1).
    """
    bag1 = set(lesk_context_bag(pre_proccess(sentence1)))
    bag2 = set(lesk_context_bag(pre_proccess(sentence2)))
    # Cleanup vs. the original: the unused `lesk_scores` dict is gone and
    # the sets are built exactly once instead of being re-wrapped inside
    # the expression — behavior is unchanged.
    return 1.0 * len(bag1 & bag2) / (1 + len(bag1))
# Import-time smoke check: score one example query pair with both
# similarity measures.  `sent1`, `sent2`, `c` (lesk) and `j` (jaccard)
# remain available as module-level names.
sent1 = "what is market rate of onion"
sent2 = "pesticide rate of onion is 5"
j = compute_jaccard_sim(sent1, sent2)
c = compute_lesk_score(sent1, sent2)
def test_answer():
    """Evaluate answer-level similarity over metric_test.csv.

    Reads columns '1' and '3' (string headers in the CSV — presumably
    reference vs. predicted answers; TODO confirm schema), scores each
    pair with both the Jaccard-style and Lesk-style measures, then
    prints two lines: pairs with any overlap, and pairs clearing the
    hand-tuned thresholds (lesk > 0.95, jaccard > 0.8).
    """
    df = pd.read_csv('metric_test.csv')
    test_question = list(df['1'])
    predicted_question = list(df['3'])
    list_jaccard = []
    list_lesk = []
    count_jaccard = 0
    count_jaccard_threshold = 0
    count_lesk = 0
    count_lesk_threshold = 0
    for test, pred in zip(test_question, predicted_question):
        # The original aliased `pred` to `pred_ans` for no effect; the
        # alias is removed here.
        js = compute_jaccard_sim(test, pred)
        lesk_s = compute_lesk_score(test, pred)
        list_jaccard.append(js)
        list_lesk.append(lesk_s)
        if lesk_s > 0.95:
            count_lesk_threshold += 1
        if lesk_s > 0:
            count_lesk += 1
        if js > 0.8:
            count_jaccard_threshold += 1
        if js > 0:
            count_jaccard += 1
    # print() with a single formatted argument works on both Python 2
    # and 3; the original bare print statements are a SyntaxError on 3.
    print("{} {}".format(count_lesk, count_jaccard))
    print("{} {}".format(count_lesk_threshold, count_jaccard_threshold))
def test_question():
    """Evaluate question-level similarity over metric_test.csv.

    Reads columns '0' and '2' (string headers in the CSV — presumably
    reference vs. predicted questions; TODO confirm schema), scores each
    pair with both measures, then prints the ground-truth positive count
    from metric_result.csv followed by the overlap and threshold counts
    (lesk > 0.95, jaccard > 0.55).
    """
    df = pd.read_csv('metric_test.csv')
    test_question = list(df['0'])
    predicted_question = list(df['2'])
    list_jaccard = []
    list_lesk = []
    count_jaccard = 0
    count_jaccard_threshold = 0
    count_lesk = 0
    count_lesk_threshold = 0
    for test, pred in zip(test_question, predicted_question):
        js = compute_jaccard_sim(test, pred)
        lesk_s = compute_lesk_score(test, pred)
        list_jaccard.append(js)
        list_lesk.append(lesk_s)
        if lesk_s > 0.95:
            count_lesk_threshold += 1
        if lesk_s > 0:
            count_lesk += 1
        if js > 0.55:
            count_jaccard_threshold += 1
        if js > 0:
            count_jaccard += 1
    df_result = pd.read_csv('metric_result.csv')
    # sum() replaces the original manual accumulation loop.
    count_ground_truth = sum(list(df_result['Truth']))
    # print() with a single formatted argument works on both Python 2
    # and 3; the original bare print statements are a SyntaxError on 3.
    print("{}".format(count_ground_truth))
    print("{} {}".format(count_lesk, count_jaccard))
    print("{} {}".format(count_lesk_threshold, count_jaccard_threshold))