-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset.py
88 lines (70 loc) · 2.79 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Convert the dataset into a trainable format
from constant import POS_DATA, CWS_DATA, Final
from collections import defaultdict, Counter
def _label_characters(word):
    """Return ``(char, label)`` pairs tagging *word* for word segmentation.

    A one-character word is tagged ``'S'``. For longer words the first
    character is ``'B'``, the last is ``'E'`` and every character in
    between is ``'M'``. An empty word yields an empty list.
    """
    if len(word) == 1:
        return [(word, 'S')]
    last = len(word) - 1
    return [
        (char, 'B' if i == 0 else 'E' if i == last else 'M')
        for i, char in enumerate(word)
    ]


def processing_pos_data(path: str = POS_DATA['Train']):
    """Parse a POS-tagged corpus into segmentation labels and a tag table.

    Each line of the file is treated as one sentence made of
    whitespace-separated ``word/tag`` tokens. The token is split on its
    LAST ``/`` so that words which themselves contain a slash stay intact.

    Args:
        path: Path of the tagged corpus file to read.

    Returns:
        A ``(dataset_list, pos_knowledge_graph)`` tuple.

        ``dataset_list`` holds one list per input line; each inner list is
        a sequence of ``(character, label)`` tuples with labels from
        ``B``/``M``/``E``/``S``, e.g.::

            [('我', 'S'), ('是', 'S'), ('大', 'B'), ('非', 'M'), ('洲', 'E')]

        ``pos_knowledge_graph`` is a ``defaultdict(Counter)`` mapping each
        word to a count of the tags it was seen with.
    """
    dataset_list = []
    pos_knowledge_graph = defaultdict(Counter)

    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the corpus encoding and pin it here.
    with open(path, 'r') as data_file:
        # Stream line by line instead of materialising the whole file.
        for raw_sentence in data_file:
            sentence_list = []
            for raw_word_tag in raw_sentence.split():
                word, slash, pos = raw_word_tag.rpartition('/')
                if not slash:
                    # Malformed token without any '/' (e.g. a stray '$$_'
                    # or a split-off English word): keep the whole token
                    # as the word and record an empty tag.
                    word, pos = raw_word_tag, ''

                sentence_list.extend(_label_characters(word))
                pos_knowledge_graph[word][pos] += 1
            dataset_list.append(sentence_list)

    return dataset_list, pos_knowledge_graph
def get_raw_article_from_cws_data(path: str = CWS_DATA['Train'], output_path: str = ''):
    """Return the text of the segmented corpus at *path* with spaces removed.

    Only the plain space character is stripped; newlines and any other
    whitespace are left untouched. When *output_path* is non-empty, the
    resulting text is also written to that file.
    """
    with open(path, 'r') as source:
        spaced_text = source.read()

    # Drop every ' ' separator to recover the unsegmented article.
    raw_article = spaced_text.replace(' ', '')

    if not output_path:
        return raw_article

    with open(output_path, 'w') as sink:
        sink.write(raw_article)
    return raw_article
def get_raw_article(path: str = Final['RawArticle']):
    """Read the file at *path* and return its full contents as a string."""
    with open(path, 'r') as handle:
        contents = handle.read()
    return contents
if __name__ == "__main__":
    # Manual smoke test: parse the training corpus and show a few samples.
    sentences, knowledge_graph = processing_pos_data(POS_DATA['Train'])
    print(sentences[0])

    # Show the first word in the table together with its most frequent tag.
    first_word = next(iter(knowledge_graph), None)
    if first_word is not None:
        print(first_word, knowledge_graph[first_word].most_common()[0][0])

    # Number of distinct words in the training set (the author recorded 21247).
    print(len(knowledge_graph))

    get_raw_article_from_cws_data(CWS_DATA['Train'], 'Data/raw_article.txt')