Commit 188f6e9

USim with files or sentences

1 parent 84c3697 commit 188f6e9

File tree

1 file changed: +338 -0 lines changed

USim.py

Lines changed: 338 additions & 0 deletions
@@ -0,0 +1,338 @@
import os
import sys
import scipy
import argparse
from multiprocessing import Pool
import multiprocessing
from subprocess import call, Popen
import pickle
from functools import reduce
import operator
import platform
import re

TUPA_DIR = '/cs/labs/oabend/borgr/tupa/'
UCCA_DIR = TUPA_DIR + 'ucca'
sys.path.append(UCCA_DIR)
sys.path.append(UCCA_DIR + '/scripts/distances')
sys.path.append(UCCA_DIR + '/ucca')

from ucca.ioutil import file2passage
import codecs

import align
from ucca.ioutil import passage2file
from ucca.convert import from_text

POOL_SIZE = multiprocessing.cpu_count()
full_rerank = True

from tupa.parse import Parser
from tupa.config import Config
Config("")

PARSER = None
PARSER_PATH = None
SENTENCE_ID_FILENAME = "sentenceIds.pkl"
PARSED_FILE = "parsed"

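# Note: TUPA_DIR above is hard-coded to a lab-specific path. A minimal,
# hypothetical portability tweak (not part of this commit) would let an
# environment variable override it:
#   TUPA_DIR = os.environ.get('TUPA_DIR', '/cs/labs/oabend/borgr/tupa/')
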
def main(args):
    if args.source_sentences is not None:
        ucca_parse_sentences(
            args.source_sentences + args.reference_sentences, args.parse_dir, args.parser_path)
        source_sentences, reference_sentences = args.source_sentences, args.reference_sentences
        res = [str(USim(s, r, args.parse_dir)) + "\n"
               for s, r in zip(source_sentences, reference_sentences)]
    else:
        ucca_parse_files(args.source_files + args.reference_files,
                         args.parse_dir, args.parser_path)

        source_files = []
        source_sentences = []
        for source_file in args.source_files:
            with open(source_file) as fl:
                for line in fl:
                    # source_sentences.append(line.strip())
                    source_files.append(source_file)  # one entry per line
        reference_sentences = []
        reference_files = []
        for reference_file in args.reference_files:
            with open(reference_file) as fl:
                for line in fl:
                    # reference_sentences.append(line.strip())
                    reference_files.append(reference_file)
        # source_sentences stays empty (the appends above are commented out),
        # so the reference id passed to USim is effectively i as well
        res = [str(USim(s, r, args.parse_dir, i, i + len(source_sentences))) + "\n"
               for i, (s, r) in enumerate(zip(source_files, reference_files))]

    with open(args.output_file, "w") as fl:
        fl.writelines(res)
# a lot of code duplication because pooling doesn't interact well with
# passing different lambdas as an argument


def normalize_sentence(s):
    s = re.sub(r"\W+", r" ", s)
    s = re.sub(r"(\s[a-zA-Z])\s([a-zA-Z]\s)", r"\1\2", s)
    s = s.lower()
    s = s.strip()
    return s

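# For intuition, hypothetical examples of what normalize_sentence does:
# punctuation collapses to whitespace, whitespace-separated single letters
# are joined pairwise, and the result is lower-cased and stripped.
#   normalize_sentence("Hello,   World!")  ->  "hello world"
#   normalize_sentence("The U S A team")   ->  "the us a team"
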
def rerank_by_uccasim(gamma=0.27):
    # NOTE: relies on names defined elsewhere and not imported in this file
    # (ASSESS_DIR, m2scorer, get_roro_packed, ucca_parse,
    # referece_less_full_rerank, referece_less_oracle)
    data_dir = ASSESS_DIR + "data" + os.sep
    # only used to extract source sentences
    first_nucle = data_dir + "references/" + "NUCLEA.m2"
    k_best_dir = data_dir + "K-best/"
    system_file = k_best_dir + "conll14st.output.1.best100"
    calculations_dir = "calculations_data/uccasim_rerank/"
    ucca_parse_dir = calculations_dir + "/ucca_parse/"
    full = "full" if full_rerank else ""
    output_file = full + str(gamma) + "_" + "uccasim_rank_results"
    out_text_file = calculations_dir + output_file
    out_res_file = calculations_dir + "score_" + output_file

    if not os.path.isfile(out_text_file):
        gold_file = first_nucle  # only used to extract source sentences
        print("acquiring source")
        source_sentences, _ = m2scorer.load_annotation(gold_file)

        # load system hypotheses
        fin = m2scorer.smart_open(system_file, 'r')
        system_sentences = [line.strip() for line in fin.readlines()]
        fin.close()

        packed_system_sentences = get_roro_packed(system_sentences)

        print("parsing")
        ucca_parse(reduce(operator.add, packed_system_sentences) +
                   source_sentences, ucca_parse_dir)

        print("reranking")
        # find top ranking
        pool = Pool(POOL_SIZE)
        assert len(packed_system_sentences) == len(source_sentences)
        if full_rerank:
            results = pool.starmap(referece_less_full_rerank, zip(
                source_sentences, packed_system_sentences,
                [ucca_parse_dir] * len(packed_system_sentences),
                [gamma] * len(packed_system_sentences)))
        else:
            results = pool.starmap(referece_less_oracle, zip(
                source_sentences, packed_system_sentences,
                [ucca_parse_dir] * len(packed_system_sentences),
                [gamma] * len(packed_system_sentences)))
        pool.close()
        pool.join()
        results = list(results)
        if full_rerank:
            results = [x for y in results for x in y]
        sentences = "\n".join(list(zip(*results))[0])
        results = list(zip(*results))[1]
        results = "\n".join([str(x) for x in results])

        print("writing to " + out_text_file)
        with codecs.open(out_text_file, "w+", "utf-8") as fl:
            fl.write(sentences)
        with open(out_res_file, "w+") as fl:
            fl.write(results)


def parse_location(output_dir, filename, sentence_num=None):
    filename = os.path.splitext(os.path.basename(filename))[0]
    cur_dir = os.path.join(output_dir, filename)
    if sentence_num is None:
        return cur_dir
    return os.path.join(cur_dir, str(sentence_num) + ".xml")

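# The layout parse_location implies: one sub-directory per parsed file,
# one XML passage per sentence (paths here are hypothetical):
#   parse_location("/parses", "data/src.txt")     -> "/parses/src"
#   parse_location("/parses", "data/src.txt", 3)  -> "/parses/src/3.xml"
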
def get_parser(model_path):
    global PARSER
    global PARSER_PATH
    # compare with != rather than `is not`: identity checks on strings are unreliable
    if PARSER_PATH != model_path or PARSER is None:
        PARSER_PATH = model_path
        PARSER = Parser(model_path)
    return PARSER

def ucca_parse_sentences(sentences, output_dir, model_path, clean=False, normalize_sentence=normalize_sentence):
    sentences = list(set([normalize_sentence(sentence)
                          for sentence in sentences]))
    output_dir = os.path.realpath(output_dir)
    to_parse = get_parsed_subdirs(sentences, output_dir)
    to_parse = [sent for sent, loc in zip(sentences, to_parse) if loc is None]
    if to_parse:
        i = 0
        out_path = os.path.join(output_dir, "parse_batch" + str(i))
        while os.path.isfile(os.path.join(out_path, SENTENCE_ID_FILENAME)):
            i += 1
            out_path = os.path.join(output_dir, "parse_batch" + str(i))
        if not os.path.isdir(out_path):
            os.makedirs(out_path)
        print("Output folder:", out_path)

        for i, sentence in enumerate(to_parse):
            # adds sentences to sentence ids memory
            tmp = get_sentence_id(sentence, out_path, True, normalize_sentence)
            assert tmp == i, (tmp, i)
        print(to_parse)
        print("Parsing", len(to_parse), "sentences.",
              len(sentences) - len(to_parse), "sentences already parsed.")
        _ucca_parse_text(to_parse, out_path, "", clean,
                         normalize_sentence, model_path)
    else:
        print("All", len(sentences), "sentences already parsed")


def ucca_parse_files(filenames, output_dir, model_path, clean=False, normalize_sentence=lambda x: x):
    output_dir = os.path.realpath(output_dir)
    if filenames:
        for filename in filenames:
            cur_output_dir = parse_location(output_dir, filename)
            if os.path.isdir(cur_output_dir):
                print("File already parsed in", cur_output_dir)
            else:
                os.makedirs(cur_output_dir)
                with open(filename, "r") as fl:
                    text = fl.readlines()
                _ucca_parse_text(text, output_dir, filename,
                                 clean, normalize_sentence, model_path)


def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence, model_path):
    text = [normalize_sentence(x) for x in text]
    text = from_text(text, split=True, one_per_line=True)
    text = list(text)
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # create an empty file announcing that parsing finished successfully
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        for leftover in os.listdir(output_dir):
            if leftover.endswith(".txt"):
                # was os.remove(os.path.join(output_dir, item)): `item` was undefined
                os.remove(os.path.join(output_dir, leftover))


_id_dics = {}


def get_parsed_subdirs(sentences, parse_dir):
    res = [None] * len(sentences)
    parse_dir = os.path.realpath(parse_dir)
    for parse_subdir, dirs, files in os.walk(parse_dir):
        if PARSED_FILE in files:
            for i, sentence in enumerate(sentences):
                if res[i] is None:  # avoid repeated lookups once the sentence was found
                    try:
                        get_sentence_id(sentence, parse_subdir, False)
                        res[i] = parse_subdir
                    except KeyError:
                        pass
    return res


def get_parsed_subdir(sentence, parse_dir):
    parse_dir = os.path.realpath(parse_dir)
    for parse_subdir, dirs, files in os.walk(parse_dir):
        if PARSED_FILE in files and any(fl.endswith(SENTENCE_ID_FILENAME) for fl in files):
            try:
                get_sentence_id(sentence, parse_subdir, False)
                return parse_subdir
            except KeyError:
                pass


def get_sentence_id(sentence, parse_dir, graceful=True, normalize_sentence=normalize_sentence):
    """Return the sentence's id in parse_dir.

    If graceful is True, a new id is assigned when the sentence does not
    exist in the id list; otherwise a KeyError is raised."""
    parse_dir = os.path.realpath(parse_dir)
    filename = SENTENCE_ID_FILENAME
    max_id = "max"
    sentence = normalize_sentence(sentence)
    if parse_dir in _id_dics:
        id_dic = _id_dics[parse_dir]
    elif not os.path.isfile(os.path.join(parse_dir, filename)):
        print("creating a new id list for file", os.path.join(parse_dir, filename))
        id_dic = {max_id: -1}
        _id_dics[parse_dir] = id_dic
    else:
        with open(os.path.join(parse_dir, filename), "rb") as fl:
            id_dic = pickle.load(fl)
        _id_dics[parse_dir] = id_dic
    if graceful and sentence not in id_dic:
        id_dic[max_id] += 1
        id_dic[sentence] = id_dic[max_id]
        with open(os.path.join(parse_dir, filename), "wb+") as fl:
            pickle.dump(id_dic, fl)
    return id_dic[sentence]

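# The persisted id map is a plain pickled dict: normalized sentences map to
# integer ids, and the reserved "max" key tracks the highest id assigned.
# A hypothetical sentenceIds.pkl after two sentences were registered:
#   {"max": 1, "the cat sat on the mat": 0, "a dog barked": 1}
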
def parsed_sentence2xml(sentence, parse_dir, sent_id=None, normalize_sentence=normalize_sentence):
    if sent_id is None:
        location = get_parsed_subdir(sentence, parse_dir)
        filename = parse_location(location, "", get_sentence_id(
            sentence, location, False, normalize_sentence))
        # print("reading parse from ", filename)
        # with open(filename) as fl:
        #     print("sentence:", sentence)
        #     print("xml first lines:", fl.readlines()[:30])
        return file2passage(filename)
    else:
        return file2passage(parse_location(parse_dir, sentence, sent_id))


def USim(source, sentence, parse_dir, source_id=None, sentence_id=None, normalize_sentence=normalize_sentence):
    """Also accepts a filename instead of a source/sentence, together with a
    source id/sentence id for locating the parsed file."""
    if align.regularize_word(source) == "":
        if align.regularize_word(sentence) == "":
            return 1
        else:
            return 0
    elif align.regularize_word(sentence) == "":
        return 0
    source_xml = parsed_sentence2xml(
        source, parse_dir, source_id, normalize_sentence)
    sentence_xml = parsed_sentence2xml(
        sentence, parse_dir, sentence_id, normalize_sentence)
    return align.fully_aligned_distance(source_xml, sentence_xml)

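# A minimal programmatic sketch, assuming this file is importable as the
# module `USim` and a trained TUPA model at the hypothetical path below;
# sentences are parsed and cached once, then USim reads the cached XML:
#   from USim import ucca_parse_sentences, USim
#   src = "The cat sat on the mat."
#   ref = "A cat was sitting on the mat."
#   ucca_parse_sentences([src, ref], "parses/", "/path/to/tupa_model")
#   print(USim(src, ref, "parses/"))
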
def announce_finish():
    if sys.platform == "linux":
        # note: platform.linux_distribution() was removed in Python 3.8
        if set(("debian", "Ubuntu")) & set(platform.linux_distribution()):
            call(['speech-dispatcher'])  # start speech dispatcher
            call(['spd-say', '"your process has finished"'])
        else:
            # perhaps works only on Ubuntu?
            a = Popen(
                ('play --no-show-progress --null --channels 1 synth %s sine %f' % (300, 2)).split())
    elif sys.platform == "darwin":
        call('say "your process has finished"'.split())
    else:
        import winsound
        winsound.Beep(300, 2)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract USim scores')
    parser.add_argument(
        'parse_dir', help="Name of the directory to save and look for parsed sentences")
    parser.add_argument('output_file')
    parser.add_argument('-sf', '--source_files', nargs='+')
    parser.add_argument('-rf', '--reference_files', nargs='+')
    parser.add_argument('-ss', '--source_sentences', nargs='+')
    parser.add_argument('-rs', '--reference_sentences', nargs='+')
    parser.add_argument('-p', "--parser_path")

    args, unknown = parser.parse_known_args()
    PARSER_PATH = args.parser_path
    if not ((args.source_files is not None and args.reference_files is not None)
            or (args.source_sentences is not None and args.reference_sentences is not None)):
        print("please provide sources and references as files or as sentences.")
        sys.exit(1)  # cannot proceed without both sides
    main(args)
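
# A hypothetical command-line invocation (file and model paths illustrative):
#   python USim.py parses/ scores.txt -sf source.txt -rf reference.txt \
#       -p /path/to/tupa_model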
