-
Notifications
You must be signed in to change notification settings - Fork 0
/
LabelledSentences.py
136 lines (111 loc) · 4.87 KB
/
LabelledSentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from nltk.tokenize import word_tokenize
import pickle
# Answers accepted at the per-word prompt; anything else is asked again.
_VALID_WORD_LABELS = ('o', 'q', 'u', 'n', "stop", "qq")


def _label_words(line):
    """Interactively label every token of ``line``.

    Returns ``(wordLabels, label)`` where ``wordLabels`` is a list of
    ``(word, tag)`` pairs with tags ``'o'``, ``'Bu'``, ``'Iu'``, ``'Bn'`` or
    ``'In'``, and ``label`` is the label state after the last token (``""``
    when the line has no tokens). The caller checks it for ``'qq'``/``"stop"``.
    """
    wordLabels = []
    label = ""
    for word in word_tokenize(line):
        # Re-display the sentence (pushed down to clear the screen) unless the
        # user has already skipped the rest of it with 'q' / 'qq'.
        if label not in ('q', 'qq'):
            print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + line)
        # 'q', 'qq' and "stop" are sticky: once entered they apply to every
        # remaining word of the sentence without prompting again.
        if label not in ('q', 'qq', "stop"):
            label = ''
            while label not in _VALID_WORD_LABELS:
                label = input(word + " ")
        if label in ('n', 'u'):
            # Continue the span ('I') when the previous tag carries the same
            # letter, otherwise begin a new one ('B'). The first word of a
            # sentence has no predecessor and therefore always begins a span.
            previous_tag = wordLabels[-1][1] if wordLabels else ""
            label = ('I' if label in previous_tag else 'B') + label
            wordLabels.append((word, label))
        else:
            # 'o', 'q', 'qq' and "stop" all tag the word as outside.
            wordLabels.append((word, 'o'))
    return wordLabels, label


def _collect_spans(wordLabels, tag):
    """Return ``(text, start_index)`` for every ``B<tag>`` span and its ``I<tag>`` tail."""
    begin_tag = 'B' + tag
    inside_tag = 'I' + tag
    spans = []
    for start, (_, word_tag) in enumerate(wordLabels):
        if word_tag != begin_tag:
            continue
        end = start + 1
        while end < len(wordLabels) and wordLabels[end][1] == inside_tag:
            end += 1
        text = " ".join(word for word, _ in wordLabels[start:end])
        spans.append((text, start))
    return spans


def _relationship_features(u, n, wordLabels):
    """Build the feature dict for one candidate (size span, object span) pair."""
    features = {}
    features['size'] = u[0]
    features['object'] = n[0]
    # 'order' records which span starts first; the words in between are taken
    # from just after the first span's start index up to the second span's
    # start index.
    if u[1] < n[1]:
        features['order'] = 'size'
        words_between = wordLabels[u[1] + 1:n[1]]
    else:
        features['order'] = 'obj'
        words_between = wordLabels[n[1] + 1:u[1]]
    features['num_words_between'] = str(len(words_between))
    # Sorted so that the same set of words always yields the same string;
    # plain set iteration order changes with string hash randomisation.
    features['u_words_between'] = str(sorted(set(words_between)))
    return features


def getLabelledSentences(candidate_sentences_file: str, labelled_sentences_file: str) -> None:
    """Interactively label candidate sentences and the relationships in them.

    Each line of ``candidate_sentences_file`` is tokenised and the user is
    prompted once per word with one of:

    * ``o``    - word is outside any span
    * ``u``    - word belongs to a span reported as the ``size`` feature
    * ``n``    - word belongs to a span reported as the ``object`` feature
    * ``q``    - tag this word and the rest of the sentence as ``o``
    * ``qq``   - as ``q``, and additionally discard the sentence
    * ``stop`` - as ``q``, then save everything and exit after this sentence

    For every kept sentence, each (``u`` span, ``n`` span) pair is shown and
    the user answers ``r`` (relationship) or anything else (recorded as ``o``).

    Output files (``base`` is ``labelled_sentences_file`` minus its last four
    characters, i.e. a three-letter extension is assumed; ``rbase`` is ``base``
    with ``"sentences"`` replaced by ``"features"`` anywhere in the path, plus
    ``"_relationships"``):

    * ``labelled_sentences_file`` - one ``str()`` of the word/tag list per line
    * ``rbase + ".txt"``          - one ``str((features, label))`` per line
    * ``base + ".pickle"``        - pickled list of all word/tag lists
    * ``rbase + ".pickle"``       - pickled list of all ``(features, label)``

    Raises:
        SystemExit: after saving, when the user entered ``stop``.
    """
    base_path = labelled_sentences_file[:-4]
    relationships_base = base_path.replace("sentences", "features") + "_relationships"

    sentenceLabels = []
    rLabels = []
    stop_requested = False

    # The context manager closes all three files even if a prompt is
    # interrupted (Ctrl-C / EOF) or tokenisation fails.
    with open(candidate_sentences_file, "r", encoding="utf-8") as inputFile, \
            open(labelled_sentences_file, "w+", encoding="utf-8") as outputFile, \
            open(relationships_base + ".txt", "w+", encoding="utf-8") as routputFile:
        for line in inputFile:
            wordLabels, label = _label_words(line)

            # A 'qq' sentence is dropped entirely: nothing is written for it
            # and no relationship prompts are shown.
            if label != 'qq':
                outputFile.write(str(wordLabels) + '\n')
                sentenceLabels.append(wordLabels)

                ulist = _collect_spans(wordLabels, 'u')
                nlist = _collect_spans(wordLabels, 'n')
                for u in ulist:
                    for n in nlist:
                        print("\nRelationship?\n\n" + line + "\n" + str(u) + " --- " + str(n) + "\n")
                        r = input("Label: ")
                        features = _relationship_features(u, n, wordLabels)
                        example = (features, 'r' if r == "r" else 'o')
                        routputFile.write(str(example) + "\n")
                        rLabels.append(example)

            if label == "stop":
                stop_requested = True
                break

    with open(base_path + '.pickle', 'wb+') as f:
        pickle.dump(sentenceLabels, f)
    with open(relationships_base + '.pickle', 'wb+') as f:
        pickle.dump(rLabels, f)

    if stop_requested:
        # Same effect as the interactive-only quit() helper, without relying
        # on the site module having injected it into builtins.
        raise SystemExit