-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_conll.py
108 lines (94 loc) · 3.66 KB
/
load_conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import codecs
import numpy as np
def parse_conll_file(file, multiple=False):
    """Parse one CoNLL-style file object into a list of sentences.

    Each non-blank line holds tab-separated fields for one token; a blank
    line terminates the current sentence.

    Args:
        file: An open text-file-like object (iterable of lines).
        multiple: When True, keep six columns per token
            (token + five label columns) instead of three
            (token, argument label, rhetorical label).

    Returns:
        list: sentences, each a list of per-token field lists (str).
    """
    tokens = []
    sentences = []
    for line in file:  # iterate lazily instead of materializing readlines()
        # Bug fix: strip the newline so the last field of each token no
        # longer carries a trailing "\n".
        line = line.rstrip("\n")
        if not line:
            # Blank line: close off the sentence collected so far.
            if tokens:
                sentences.append(tokens)
                tokens = []
            else:
                print("That should not happen.")
        else:
            parts = line.split("\t")
            width = 6 if multiple else 3
            tokens.append(parts[:width])
    if tokens:
        # Bug fix: flush the final sentence even when the file does not
        # end with a trailing blank line (previously it was dropped).
        sentences.append(tokens)
    return sentences
def parse_conll_files(path, multiple=False):
    """Recursively walk *path* and parse every file found as CoNLL data.

    Args:
        path: Root directory to search.
        multiple: Forwarded to ``parse_conll_file`` (six label columns
            per token when True).

    Returns:
        list: one entry per file, each entry being that file's sentences.
    """
    parsed = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            full_path = os.path.join(root, name)
            with codecs.open(full_path, "r", "utf8") as handle:
                parsed.append(parse_conll_file(handle, multiple=multiple))
    return parsed
def transform_to_model_input(sentences):
    """Split parsed sentences into token and label arrays.

    Args:
        sentences: list of sentences, each a list of three-field tokens
            (token, argument label, rhetorical label).

    Returns:
        tuple: (x, y_arg, y_rhet), each an ndarray of per-sentence
        ndarrays holding the corresponding column.
    """
    x, y_arg, y_rhet = [], [], []
    for sentence in sentences:
        # Extract each column of the sentence in one comprehension pass.
        x.append(np.array([token[0] for token in sentence]))
        y_arg.append(np.array([token[1] for token in sentence]))
        y_rhet.append(np.array([token[2] for token in sentence]))
    return np.array(x), np.array(y_arg), np.array(y_rhet)
def transform_to_model_input_multiple(sentences):
    """Split parsed six-column sentences into token and label arrays.

    Args:
        sentences: list of sentences, each a list of six-field tokens
            (token, arg, rhet, aspect, summary, citation).

    Returns:
        tuple: (x, y_arg, y_rhet, y_aspect, y_summary, y_citation),
        each an ndarray of per-sentence ndarrays for that column.
    """
    # One accumulator per token column, in output order.
    columns = [[], [], [], [], [], []]
    for sentence in sentences:
        for idx, column in enumerate(columns):
            column.append(np.array([token[idx] for token in sentence]))
    return tuple(np.array(column) for column in columns)
def load_data(path="./../annotations_conll"):
    """Load every CoNLL file under *path* and build model inputs.

    Args:
        path: Directory tree containing the annotation files.

    Returns:
        tuple: (x, y_arg, y_rhet) arrays over all sentences of all files.
    """
    per_file = parse_conll_files(path)
    # Flatten the per-file grouping into a single sentence list.
    all_sentences = [sentence for file_sentences in per_file for sentence in file_sentences]
    x, y_arg, y_rhet = transform_to_model_input(all_sentences)
    print("Data size: " + str(len(x)))
    return x, y_arg, y_rhet
def load_data_multiple(path=""):
    """Load six-column CoNLL files under *path* and build model inputs.

    Args:
        path: Directory tree containing the annotation files.

    Returns:
        tuple: (x, y_arg, y_rhet, y_aspect, y_summary, y_citation)
        arrays over all sentences of all files.
    """
    per_file = parse_conll_files(path, multiple=True)
    # Flatten the per-file grouping into a single sentence list.
    all_sentences = [sentence for file_sentences in per_file for sentence in file_sentences]
    x, y_arg, y_rhet, y_aspect, y_summary, y_citation = transform_to_model_input_multiple(all_sentences)
    print("Data size: " + str(len(x)))
    return x, y_arg, y_rhet, y_aspect, y_summary, y_citation
def main():
    """Demo entry point: parse ./annotations_conll and build model inputs."""
    print("Process started")
    parsed = parse_conll_files("./annotations_conll")
    flat = [sentence for file_sentences in parsed for sentence in file_sentences]
    x, y_arg, y_rhet = transform_to_model_input(flat)
    print("Process ended")


if __name__ == "__main__":
    main()