'''
Convert tagged sentence pairs into YAML (custom) templates for generation.
Example token syntax: The quick brown fox|NN|@animal|#wild jumps
(a plain word|POS token with no further tags reverts to the bare word)
This goes back to my pre-YAML idea for a basic template data format - now it's a meta-template of sorts.
'''
import collections
import yaml # useful for debugging as well as error messages
from utility import LANGUAGES
import yaml_reader
FORBIDDEN_YAML_DATA = Exception('FORBIDDEN_YAML_DATA') # just anything that will raise an error if you try to write it to YAML
TAG_SEPARATOR = '|'
INPUT_FILE = 'template_input.yml'
OUTPUT_FILE = 'datasets/templates/custom_postedited.yml'
# also need to specify symbol names, since you can't just go in order!
# TODO: does this belong in data.py? but this is like a METAdata format...
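# A sketch of the expected shape of template_input.yml - the top-level key
# ('greeting' here) is illustrative, not taken from the real data. Each entry
# maps language codes (see LANGUAGES) to one post-edited tagged sentence:
#
#   greeting:
#     en: The quick brown fox|NN|@animal|#wild jumps
#     zh: <the corresponding tagged Chinese sentence>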
def find_symbol(tags):
    candidates = [t for t in tags if t.startswith('@')]
    assert(len(candidates) == 1) # exactly one @symbol tag per edited token
    return candidates[0][1:]
def find_hashtags(tags):
    return [t[1:] for t in tags if t.startswith('#')]
def find_formattags(tags):
    return [t[1:] for t in tags if t.startswith('%')]
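# Illustrative behavior of the three tag finders above, on the tag list of a
# hypothetical post-edited token fox|NN|@animal|#wild|%capitalize (the tag
# names are made up for the example):
#   find_symbol(['@animal', '#wild', '%capitalize'])     -> 'animal'
#   find_hashtags(['@animal', '#wild', '%capitalize'])   -> ['wild']
#   find_formattags(['@animal', '#wild', '%capitalize']) -> ['capitalize']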
def get_type_from_pos(pos, lang):
    #return POS_TO_TYPE_MAPPING[lang][pos] # meh, this would actually be less DRY - can't "combine" switch statements
    if lang == 'en':
        if pos in ['JJ', 'JJR', 'JJS']:
            return 'adjective'
        elif pos in ['NN', 'NNS']:
            return 'noun'
        elif pos in ['NNP', 'NNPS']:
            return 'name'
        elif pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            return 'verb'
        else:
            raise RuntimeError('Unsupported English POS tag: ' + pos)
    elif lang == 'zh':
        if pos in ['JJ', 'VA']:
            return 'adjective'
        elif pos == 'NR':
            return 'name'
        elif pos == 'NN':
            return 'noun'
        elif pos == 'VV':
            return 'verb'
        else:
            raise RuntimeError('Unsupported Chinese POS tag: ' + pos)
    else:
        raise RuntimeError('Unsupported language: ' + lang)
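# The POS inventories above appear to be Penn Treebank (en) and Chinese
# Treebank (zh) tags. For example:
#   get_type_from_pos('NNS', 'en') -> 'noun'
#   get_type_from_pos('VA', 'zh')  -> 'adjective'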
# TODO: DRY out dict accessors...
def process_datum(datum):
    data_per_lang = { lang: process_tagged_sentence(datum[lang], lang) for lang in datum if lang in LANGUAGES }
    # this is much cleaner if you write things from a data-centric perspective/style
    symbols_per_lang = {
        lang: {
            sym: data_per_lang[lang]['symbols'][sym]['type'] for sym in data_per_lang[lang].get('symbols', [])
        } for lang in data_per_lang
    }
    assert('en' in symbols_per_lang)
    symbol_types = symbols_per_lang['en']
    symbols = set(symbol_types.keys())
    # symbol declarations and types must match for all languages
    if ( any(set(symbols_per_lang[lang].keys()) != symbols for lang in data_per_lang) or
            any(symbols_per_lang[lang][sym] != symbol_types[sym] for sym in symbols for lang in data_per_lang)):
        raise RuntimeError('symbol declaration or type mismatch\n{}'.format(yaml.dump(symbols_per_lang)))
    # establish basic backbone - code after this just writes into it.
    # TODO: could (and probably should) read a blank template in from YAML instead...
    # but that would require having blank template PIECES for each symbol and each lang.
    # meh, with decent indentation, it resembles the YAML sufficiently
    result = {
        'symbols': {
            sym: {
                'type': symbol_types[sym],
                'options': {
                    # TODO: number?
                    'tags': []
                }
            } for sym in symbols
        },
        'langs': {
            lang: {
                'template': FORBIDDEN_YAML_DATA, # placeholder - must get filled in below
                'tags': {}
            } for lang in data_per_lang
        }
    }
    for lang in data_per_lang:
        result['langs'][lang]['template'] = data_per_lang[lang]['template']
        for sym in symbols:
            hashtags = data_per_lang[lang]['symbols'][sym]['hashtags']
            assert(type(hashtags) is list)
            if hashtags:
                # skip tags already declared by another language, so the list stays duplicate-free
                result['symbols'][sym]['options']['tags'] += [t for t in hashtags if t not in result['symbols'][sym]['options']['tags']]
            formattags = data_per_lang[lang]['symbols'][sym]['formattags']
            assert(type(formattags) is list)
            if formattags:
                result['langs'][lang]['tags'][sym] = formattags
    # TODO: for particular types, specify literal forms
    #print(yaml.dump(data_per_lang))
    # TODO: handle POS tags - depending on language and depending on POS, might want a literal form
    # can probably do this pretty easily by using a mapping from POS tags to literal forms
    # but should also check POS tags as to whether or not to add lang-specific forms
    # TODO: store the original word in the template somehow
    return result #data_per_lang
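# For a single-language datum built from a made-up sentence, e.g.
#   {'en': 'The quick brown fox|NN|@animal|#wild|%capitalize jumps'},
# process_datum returns:
#   {'symbols': {'animal': {'type': 'noun', 'options': {'tags': ['wild']}}},
#    'langs': {'en': {'template': 'The quick brown animal jumps',
#                     'tags': {'animal': ['capitalize']}}}}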
def process_tagged_sentence(sentence, lang):
    assert(type(sentence) is str)
    tokens = sentence.split()
    result = collections.defaultdict(dict) # defaultdict saves allocating the 'symbols' dict on first use
    template_words = []
    for tok in tokens:
        if TAG_SEPARATOR in tok: # tagged
            tok_pieces = tok.split(TAG_SEPARATOR)
            word, pos = tok_pieces[0:2]
            assert(pos) # not empty
            assert(pos[0] not in '#@%')
            if len(tok_pieces) == 2: # just word|POS - no post-editing was done - just revert to untagged word
                template_words.append(word)
            else:
                tags = tok_pieces[2:]
                symbol = find_symbol(tags)
                template_words.append(symbol)
                hashtags = find_hashtags(tags)
                formattags = find_formattags(tags)
                # TODO: number? (just use NN/NNS?)
                assert(1 + len(hashtags) + len(formattags) == len(tags))
                assert(symbol not in result['symbols'])
                result['symbols'][symbol] = {
                    'hashtags': hashtags,
                    'formattags': formattags,
                    'original word': word,
                    'POS': pos,
                    'type': get_type_from_pos(pos, lang) }
        else: # bare word
            template_words.append(tok)
    # TODO: convert punctuation to a separate field? or just eat it for now? (custom templates, after all)
    # this guards against long sentences recovered from clean-corpus.perl
    assert(len(template_words) <= 100)
    assert('template' not in result)
    result['template'] = ' '.join(template_words) # hey, this works! you can override the defaultdict default with a different data type
    return dict(result)
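# Traced on the same made-up sentence as the process_datum sketch above:
#   process_tagged_sentence('The quick brown fox|NN|@animal|#wild|%capitalize jumps', 'en')
# returns:
#   {'symbols': {'animal': {'hashtags': ['wild'], 'formattags': ['capitalize'],
#                           'original word': 'fox', 'POS': 'NN', 'type': 'noun'}},
#    'template': 'The quick brown animal jumps'}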
if __name__ == '__main__':
    input_data = yaml_reader.read_file(INPUT_FILE)
    output_data = {}
    for (key, value) in input_data.items():
        try:
            output_data[key] = process_datum(value)
        except RuntimeError as e:
            #assert(type(e.args[0]) is str)
            raise Exception('Error processing {} - key {} - {}'.format(INPUT_FILE, key, e.args[0]))
            #import pdb; pdb.set_trace()
        except AssertionError:
            raise Exception('Error processing {} - key {} - assertion failed (e.g. sentence too long)'.format(INPUT_FILE, key))
    yaml_reader.write_file(OUTPUT_FILE, output_data)