forked from RikVN/AMR
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpostprocess_AMRs.py
215 lines (152 loc) · 7.06 KB
/
postprocess_AMRs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
# -*- coding: utf8 -*-
'''Script that tests given seq2seq model on given test data, also restoring and wikifying the produced AMRs
Input should either be a produced AMR -file or a folder to traverse. Outputs .restore, .pruned, .coref and .all files'''
import sys
import re
import argparse
import os
from amr_utils import *
import wikify_file
from multiprocessing import Pool
def create_arg_parser():
    '''Build and parse the command-line arguments for the post-processing pipeline.

    Returns the parsed argparse.Namespace.'''
    ### If using -fol, -f and -s are directories. In that case the filenames of the sentence file and output file should match (except extension)
    ### If not using -fol, -f and -s are single files ###
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', required = True ,help="File or folder to be post-processed")
    parser.add_argument('-s', default = '' ,help="Sentence file or folder, necessary for Wikification")
    parser.add_argument('-fol', action = 'store_true' ,help="Whether -f is a folder")
    parser.add_argument('-sent_ext', default = '.sent' ,help="Sentence file extension, necessary for Wikification - only needed when doing a folder")
    parser.add_argument('-out_ext', default = '.seq.amr' ,help="Extension of the produced AMR files - only needed when doing a folder")
    parser.add_argument('-t', default = 16, type = int ,help="Maximum number of parallel threads")
    parser.add_argument('-c', default = 'dupl', action='store', choices=['dupl','index','abs'], help='How to handle coreference - input was either duplicated/indexed/absolute path')
    parser.add_argument('-no_wiki', action='store_true', help='Not doing Wikification, since it takes a long time')
    args = parser.parse_args()
    return args
def check_valid(restore_file, rewrite):
    '''Check whether the AMRs in a file (one per line) are valid.

    restore_file: path to a file with one AMR per line
    rewrite:      if True, overwrite the file in place, replacing every
                  invalid AMR with the default AMR; if False, only report

    Invalid lines are counted and reported either way.'''
    warnings = 0
    all_amrs = []
    # Close the input file deterministically (original left it to the GC)
    with open(restore_file, 'r') as in_f:
        for idx, line in enumerate(in_f, 1):
            if valid_amr(line):
                all_amrs.append(line)
            else:
                print('Error or warning in line {0}, write default\n'.format(idx))
                warnings += 1
                all_amrs.append(get_default_amr())  # add default when error
    if warnings == 0:
        print('No badly formed AMRs!\n')
    elif rewrite:
        print('Rewriting {0} AMRs with error to default AMR\n'.format(warnings))
        with open(restore_file, 'w') as out_f:
            for line in all_amrs:
                out_f.write(line.strip() + '\n')
        # removed redundant out_f.close(): the with-block already closes it
    else:
        print('{0} AMRs with warning - no rewriting to default\n'.format(warnings))
def add_wikification(in_file, sent_file):
    '''Add wiki-links to produced AMRs.

    in_file:   file with AMRs, one per line
    sent_file: matching sentence file (same number of lines expected)

    Returns (wiki_file, success). On a line-count mismatch the broken output
    is renamed to *.failed_wiki and success is False.'''
    wiki_file = in_file + '.wiki'
    print('Doing Wikification...')
    if not os.path.isfile(wiki_file):  # check if wiki file doesn't exist already
        wikify_file.wikify_file(in_file, sent_file)
        # Count lines with the files properly closed (original leaked both handles)
        with open(sent_file, 'r') as f:
            num_sents = sum(1 for _ in f)
        with open(wiki_file, 'r') as f:
            num_wiki = sum(1 for _ in f)
        if num_sents != num_wiki:
            # Bug fix: the original never called .format(), so the literal
            # placeholders {0}/{1} were printed instead of the line counts
            print('Wikification failed for some reason (length {0} instead of {1})\n\tSave file as backup with wrong extension, no validating\n'.format(num_wiki, num_sents))
            os.system('mv {0} {1}'.format(wiki_file, wiki_file.replace('.wiki', '.failed_wiki')))
            return wiki_file, False
        else:
            print('Validating Wikified AMRs...\n')
            check_valid(wiki_file, True)
            return wiki_file, True
    else:
        print('Wiki file already exists, skipping...')
        return wiki_file, True
def add_coreference(in_file, ext):
    '''Restore coreference for every concept that occurs more than once.

    Delegates to restore_duplicate_coref.py; returns the coref file path
    (in_file + ext), whether or not the script actually ran.'''
    print('Adding coreference...\n')
    target = in_file + ext
    if os.path.isfile(target):
        print('Coref file already exists, skipping...')
    else:
        os.system('python restore_duplicate_coref.py -f {0} -output_ext {1}'.format(in_file, ext))
    return target
def do_pruning(in_file):
    '''Prune duplicate material from the produced AMRs.

    Delegates to prune_amrs.py and validates its output; returns the path
    of the pruned file (in_file + '.pruned').'''
    print('Pruning...\n')
    pruned = in_file + '.pruned'
    if os.path.isfile(pruned):
        print('Prune file already exists, skipping')
    else:
        os.system('python prune_amrs.py -f {0}'.format(in_file))
        print('Validating pruned AMRs...\n')
        check_valid(pruned, True)
    return pruned
def restore_amr(in_file, out_file, coref_type):
    '''Restore variables in the output AMR via restoreAMR/restore_amr.py.

    coref_type selects an extra flag: 'index' -> -index, 'abs' -> -abs,
    anything else (e.g. 'dupl') -> no flag. Returns out_file.
    Skips the work entirely when out_file already exists.'''
    print('Restoring variables...')
    if os.path.isfile(out_file):
        print('Restore file already exists, skipping...')
        return out_file
    # Map coreference mode to the extra command-line flag (none for 'dupl')
    extra_flag = {'index': ' -index', 'abs': ' -abs'}.get(coref_type, '')
    os.system('python restoreAMR/restore_amr.py -f {0} -o {1}{2}'.format(in_file, out_file, extra_flag))
    print('Validating restored AMRs...\n')
    check_valid(out_file, True)
    return out_file
def process_file(input_list):
    '''Run the full post-processing pipeline on one AMR file.

    input_list: [amr_file, sent_file] (a list so it can go through Pool.map).
    Relies on the module-level `args` set in __main__ (inherited by forked
    workers). Exits the process with status 1 if either file is missing.'''
    f = input_list[0]
    sent_file = input_list[1]
    if not os.path.isfile(sent_file) or not os.path.isfile(f):
        print('Something is wrong, sent-file or amr-file does not exist')
        # Bug fix: exit non-zero on a fatal error (original exited with 0,
        # signalling success to the calling shell)
        sys.exit(1)
    if os.path.getsize(f) > 0:  # check if file has content
        restore_file = f + '.restore'
        restore_file = restore_amr(f, restore_file, args.c)
        prune_file = do_pruning(restore_file)
        if args.c == 'dupl':  # coreference by duplication is done in separate script
            coref_file = add_coreference(restore_file, '.coref')
        if not args.no_wiki:  # sometimes we don't want to do Wikification because it takes time
            wiki_file, success = add_wikification(restore_file, sent_file)
            # then add all postprocessing steps together, starting at the pruning
            print('Do all postprocessing steps...\n')
            wiki_file_pruned, success = add_wikification(prune_file, sent_file)
            if success:
                if args.c == 'dupl':
                    coref_file_wiki_pruned = add_coreference(wiki_file_pruned, '.coref.all')
                else:  # we already did coreference in restore file, still call the output-file .coref.all to not get confused in evaluation, just copy previous file
                    os.system("cp {0} {1}".format(wiki_file_pruned, wiki_file_pruned + '.coref.all'))
            else:
                print('Wikification failed earlier, not trying again here\n')
    print('Done processing!')
def match_files_by_name(amr_files, sent_files):
    '''Pair AMR files with sentence files that share a basename.

    A file /home/user/folder/name.ext is keyed by 'name' (basename up to the
    first dot). For each AMR file, the first sentence file with the same key
    is paired with it; AMR files without a match are dropped.
    Returns a list of [amr_file, sent_file] pairs.'''
    # Index sentence files by key, keeping only the first file per key
    # (mirrors the original's first-match-wins scan)
    first_sent_for = {}
    for sent in sent_files:
        key = sent.split('/')[-1].split('.')[0]
        if key not in first_sent_for:
            first_sent_for[key] = sent
    pairs = []
    for amr in amr_files:
        key = amr.split('/')[-1].split('.')[0]
        if key in first_sent_for:
            pairs.append([amr, first_sent_for[key]])
    return pairs
if __name__ == "__main__":
    args = create_arg_parser()
    if not args.fol:
        # Single-file mode: -f and -s are files
        print('Process single file\n')
        process_file([args.f, args.s])
    else:
        # Folder mode: pair AMR files with sentence files by basename
        sent_files = get_files_by_ext(args.s, args.sent_ext)
        amr_files = get_files_by_ext(args.f, args.out_ext)
        matching_files = match_files_by_name(amr_files, sent_files)
        # Process files in parallel here
        print('Processing {0} files, doing max {1} in parallel'.format(len(matching_files), args.t))
        pool = Pool(processes=args.t)
        try:
            pool.map(process_file, matching_files)
        finally:
            # Bug fix: the original never closed the pool, leaving worker
            # processes unreaped after map() returned
            pool.close()
            pool.join()