|
import os
import sys
import scipy
import argparse
from multiprocessing import Pool
import multiprocessing
from subprocess import call, Popen
import pickle
from functools import reduce
import operator
import platform
import re

# Location of the tupa parser checkout; the bundled ucca package and its
# helper scripts are made importable by extending sys.path below.
TUPA_DIR = '/cs/labs/oabend/borgr/tupa/'
UCCA_DIR = TUPA_DIR + 'ucca'
sys.path.append(UCCA_DIR)
sys.path.append(UCCA_DIR + '/scripts/distances')
sys.path.append(UCCA_DIR + '/ucca')

from ucca.ioutil import file2passage
import codecs

import align
from ucca.ioutil import passage2file
from ucca.convert import from_text

# Number of worker processes used by the reranking Pool.
POOL_SIZE = multiprocessing.cpu_count()
# When True, rerank_by_uccasim reranks the full k-best list; otherwise it
# uses the oracle variant (see rerank_by_uccasim).
full_rerank = True

from tupa.parse import Parser
from tupa.config import Config
Config("")  # NOTE(review): presumably initializes tupa's global config with empty args — confirm

# Cached parser instance and the model path it was loaded from (see get_parser).
PARSER = None
PARSER_PATH = None
# Pickled dict mapping normalized sentences -> xml file numbers in a parse dir.
SENTENCE_ID_FILENAME = "sentenceIds.pkl"
# Empty marker file written once a directory has been parsed successfully.
PARSED_FILE = "parsed"
| 38 | + |
| 39 | + |
def main(args):
    """Compute one USim score per source/reference pair and write the scores,
    one per line, to args.output_file.

    Inputs come either as sentences (args.source_sentences /
    args.reference_sentences, parsed on the fly and looked up by content)
    or as files (args.source_files / args.reference_files, looked up by
    filename and per-line index).
    """
    if args.source_sentences is not None:
        # parse sources and references together so one batch covers both
        ucca_parse_sentences(
            args.source_sentences + args.reference_sentences, args.parse_dir, args.parser_path)
        source_sentences, reference_sentences = args.source_sentences, args.reference_sentences
        res = [str(USim(s, r, args.parse_dir)) + "\n" for s,
               r in zip(source_sentences, reference_sentences)]
    else:
        ucca_parse_files(args.source_files + args.reference_files,
                         args.parse_dir, args.parser_path)

        # one entry per line of each file: filenames are repeated once per
        # sentence they contain, so zip() below pairs lines up positionally
        source_files = []
        source_sentences = []
        for source_file in args.source_files:
            with open(source_file) as fl:
                for line in fl:
                    # source_sentences.append(line.strip())
                    source_files.append(source_file)
        reference_sentences = []
        reference_files = []
        for reference_file in args.reference_files:
            with open(reference_file) as fl:
                for line in fl:
                    # reference_sentences.append(line.strip())
                    reference_files.append(reference_file)
        # NOTE(review): source_sentences stays empty (its append above is
        # commented out), so the reference id is i + 0. Confirm the intended
        # offset — USim receives (filename, filename, source_id, sentence_id)
        # in this branch.
        res = [str(USim(s, r, args.parse_dir, i, i + len(source_sentences))) + "\n" for i, (s,
               r) in enumerate(zip(source_files, reference_files))]

    with open(args.output_file, "w") as fl:
        fl.writelines(res)
| 70 | +# a lot of code duplication because pooling doesn't react well to passing |
| 71 | +# different lambdas as an argument |
| 72 | + |
| 73 | + |
def normalize_sentence(s):
    """Return a canonical form of *s*: runs of non-word characters collapsed
    to a single space, adjacent single letters glued together, lower-cased
    and stripped of surrounding whitespace."""
    collapsed = re.sub(r"\W+", r" ", s)
    glued = re.sub(r"(\s[a-zA-Z])\s([a-zA-Z]\s)", r"\1\2", collapsed)
    return glued.lower().strip()
| 80 | + |
| 81 | + |
def rerank_by_uccasim(gamma=0.27):
    """Rerank the conll14st 100-best system outputs by ucca similarity.

    Writes the chosen sentences and their scores under
    calculations_data/uccasim_rerank/; skipped entirely when the output
    text file already exists.

    NOTE(review): ASSESS_DIR, m2scorer, get_roro_packed, ucca_parse,
    referece_less_full_rerank and referece_less_oracle are not defined or
    imported in this file — presumably provided by a sibling module; confirm
    before calling this function.
    """
    data_dir = ASSESS_DIR + "data" + os.sep
    # only used to extract source sentences
    first_nucle = data_dir + "references/" + "NUCLEA.m2"
    k_best_dir = data_dir + "K-best/"
    system_file = k_best_dir + "conll14st.output.1.best100"
    calculations_dir = "calculations_data/uccasim_rerank/"
    ucca_parse_dir = calculations_dir + "/ucca_parse/"
    full = "full" if full_rerank else ""
    output_file = full + str(gamma) + "_" + "uccasim_rank_results"
    out_text_file = calculations_dir + output_file
    out_res_file = calculations_dir + "score_" + output_file

    if not os.path.isfile(out_text_file):
        gold_file = first_nucle  # only used to extract source sentences
        print("acquiring source")
        source_sentences, _ = m2scorer.load_annotation(gold_file)

        source_sentences = source_sentences
        # load system hypotheses
        fin = m2scorer.smart_open(system_file, 'r')
        system_sentences = [line.strip() for line in fin.readlines()]
        fin.close()

        # presumably groups the flat hypothesis list into one k-best list per
        # source sentence (length is asserted below) — confirm
        packed_system_sentences = get_roro_packed(system_sentences)

        print("parsing")
        # parse all hypotheses plus the sources in one batch
        ucca_parse(reduce(operator.add, packed_system_sentences) +
                   source_sentences, ucca_parse_dir)

        print("reranking")
        # find top ranking
        pool = Pool(POOL_SIZE)
        assert(len(packed_system_sentences) == len(source_sentences))
        if full_rerank:
            results = pool.starmap(referece_less_full_rerank, zip(source_sentences, packed_system_sentences, [
                ucca_parse_dir] * len(packed_system_sentences), [gamma] * len(packed_system_sentences)))
        else:
            results = pool.starmap(referece_less_oracle, zip(source_sentences, packed_system_sentences, [
                ucca_parse_dir] * len(packed_system_sentences), [gamma] * len(packed_system_sentences)))
        pool.close()
        pool.join()
        results = list(results)
        if full_rerank:
            # full rerank yields one list per source; flatten to (sentence,
            # score) pairs before unzipping below
            results = [x for y in results for x in y]
        sentences = "\n".join(list(zip(*results))[0])
        results = list(zip(*results))[1]
        results = "\n".join([str(x) for x in results])

        print("writing to " + out_text_file)
        with codecs.open(out_text_file, "w+", "utf-8") as fl:
            fl.write(sentences)
        with open(out_res_file, "w+") as fl:
            fl.write(results)
| 136 | + |
| 137 | + |
def parse_location(output_dir, filename, sentence_num=None):
    """Return where the parse of *filename* lives under *output_dir*.

    Without *sentence_num* this is the per-file directory; with it, the
    path of that sentence's xml file inside the directory.
    """
    base, _ext = os.path.splitext(os.path.basename(filename))
    subdir = os.path.join(output_dir, base)
    if sentence_num is not None:
        return os.path.join(subdir, str(sentence_num) + ".xml")
    return subdir
| 144 | + |
| 145 | + |
def get_parser(model_path):
    """Return a tupa Parser loaded from *model_path*, caching it in the
    module globals so repeated calls with the same path reuse the already
    loaded model instead of reloading it from disk."""
    global PARSER
    global PARSER_PATH
    # bug fix: 'is not' compared string identity, so an equal-but-distinct
    # path object spuriously reloaded the model; compare by value instead
    if PARSER_PATH != model_path or PARSER is None:
        PARSER_PATH = model_path
        PARSER = Parser(model_path)
    return PARSER
| 153 | + |
| 154 | + |
def ucca_parse_sentences(sentences, output_dir, model_path, clean=False, normalize_sentence=normalize_sentence):
    """Parse *sentences* with tupa, skipping any sentence that a previous
    batch under *output_dir* already parsed.

    New sentences go into a fresh "parse_batchN" subdirectory and are
    registered in that batch's pickled sentence-id list before parsing.

    :param sentences: iterable of raw sentences (duplicates are dropped)
    :param output_dir: root directory holding the parse_batch* subdirectories
    :param model_path: tupa model to load (see get_parser)
    :param clean: forwarded to _ucca_parse_text (removes *.txt leftovers)
    :param normalize_sentence: canonicalization applied before any lookup
    """
    sentences = list(set([normalize_sentence(sentence)
                          for sentence in sentences]))
    output_dir = os.path.realpath(output_dir)
    to_parse = get_parsed_subdirs(sentences, output_dir)
    # keep only the sentences no finished batch already contains
    to_parse = [sent for sent, loc in zip(sentences, to_parse) if loc is None]
    if to_parse:
        # pick the first parse_batch<i> directory with no id list yet
        i = 0
        out_path = os.path.join(output_dir, "parse_batch" + str(i))
        while os.path.isfile(os.path.join(out_path, SENTENCE_ID_FILENAME)):
            i += 1
            out_path = os.path.join(output_dir, "parse_batch" + str(i))
        if not os.path.isdir(out_path):
            os.makedirs(out_path)
        print("Output folder:", out_path)

        for i, sentence in enumerate(to_parse):
            # adds sentences to sentence ids memory
            tmp = get_sentence_id(sentence, out_path, True, normalize_sentence)
            # a fresh batch hands out consecutive ids, so they must line up
            # with the parse output numbering produced by _ucca_parse_text
            assert tmp == i, (tmp, i)
        print(to_parse)
        print("Parsing", len(to_parse), "sentences.", len(
            sentences) - len(to_parse), "sentences already parsed.")
        _ucca_parse_text(to_parse, out_path, "", clean,
                         normalize_sentence, model_path)
    else:
        print("All", len(sentences), "sentences already parsed")
| 182 | + |
| 183 | + |
def ucca_parse_files(filenames, output_dir, model_path, clean=False, normalize_sentence=lambda x: x):
    """Parse each file (one sentence per line) with tupa into the directory
    returned by parse_location(output_dir, filename).

    NOTE(review): when the per-file directory already exists only a message
    is printed, yet the file is still read and re-parsed below — confirm
    whether the intent was to skip already-parsed files.
    """
    output_dir = os.path.realpath(output_dir)
    if filenames:
        for filename in filenames:
            cur_output_dir = parse_location(output_dir, filename)
            if os.path.isdir(cur_output_dir):
                print("File already parsed in", cur_output_dir)
            else:
                os.makedirs(cur_output_dir)
            with open(filename, "r") as fl:
                text = fl.readlines()
            _ucca_parse_text(text, output_dir, filename,
                             clean, normalize_sentence, model_path)
| 197 | + |
| 198 | + |
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence, model_path):
    """Parse *text* (a list of sentences) with tupa and write one xml passage
    file per sentence under parse_location(output_dir, filename).

    A PARSED_FILE marker is written once everything succeeded, so other
    helpers can tell a finished parse directory from a partial one.

    :param clean: when True, remove leftover *.txt files from output_dir
    """
    text = [normalize_sentence(x) for x in text]
    text = from_text(text, split=True, one_per_line=True)
    text = list(text)
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(
            output_dir, filename, i))
    # create an empty file announcing that parsing finished successfully
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # bug fix: the original removed os.path.join(output_dir, item) where
        # 'item' was undefined (NameError on the first .txt found); it also
        # shadowed the 'filename' parameter with the loop variable
        for leftover in os.listdir(output_dir):
            if leftover.endswith(".txt"):
                os.remove(os.path.join(output_dir, leftover))
| 219 | + |
| 220 | + |
# In-memory cache: realpath(parse_dir) -> its sentence-id dict, so each
# pickled id list is read from disk at most once per process (see
# get_sentence_id).
_id_dics = {}
| 222 | + |
| 223 | + |
def get_parsed_subdirs(sentences, parse_dir):
    """For every sentence, return the finished parse subdirectory that
    already contains it, or None when no batch under *parse_dir* does."""
    locations = [None] * len(sentences)
    root = os.path.realpath(parse_dir)
    for subdir, _dirs, files in os.walk(root):
        if PARSED_FILE not in files:
            continue
        for idx, sentence in enumerate(sentences):
            # a sentence found once needs no further lookups
            if locations[idx] is not None:
                continue
            try:
                get_sentence_id(sentence, subdir, False)
            except KeyError:
                continue
            locations[idx] = subdir
    return locations
| 237 | + |
| 238 | + |
def get_parsed_subdir(sentence, parse_dir):
    """Return the first finished parse subdirectory whose id list contains
    *sentence*, or None when it was never parsed under *parse_dir*."""
    root = os.path.realpath(parse_dir)
    for subdir, _dirs, files in os.walk(root):
        finished = PARSED_FILE in files
        has_id_list = any(name.endswith(SENTENCE_ID_FILENAME) for name in files)
        if not (finished and has_id_list):
            continue
        try:
            get_sentence_id(sentence, subdir, False)
        except KeyError:
            continue
        return subdir
| 248 | + |
| 249 | + |
def get_sentence_id(sentence, parse_dir, graceful=True, normalize_sentence=normalize_sentence):
    """Return the sentence id in the parse_dir.

    If graceful is true, assigns (and persists) a new id when the sentence
    is not yet in the ids list; otherwise an unknown sentence raises
    KeyError via the final dict lookup.
    """
    parse_dir = os.path.realpath(parse_dir)
    filename = SENTENCE_ID_FILENAME
    # reserved key tracking the highest id handed out so far
    max_id = "max"
    sentence = normalize_sentence(sentence)
    if parse_dir in _id_dics:
        # in-memory cache hit: avoid re-reading the pickle
        id_dic = _id_dics[parse_dir]
    elif not os.path.isfile(parse_dir + os.sep + filename):
        # no id list on disk yet: start a fresh one (written on first add)
        print("creating a new id list for file", parse_dir + os.sep + filename)
        id_dic = {max_id: -1}
        _id_dics[parse_dir] = id_dic
    else:
        with open(parse_dir + os.sep + filename, "rb") as fl:
            id_dic = pickle.load(fl)
        _id_dics[parse_dir] = id_dic
    if graceful and not sentence in id_dic:
        id_dic[max_id] += 1
        id_dic[sentence] = id_dic[max_id]
        # persist immediately so later runs can reload the updated mapping
        with open(parse_dir + os.sep + filename, "wb+") as fl:
            pickle.dump(id_dic, fl)
    return id_dic[sentence]
| 275 | + |
| 276 | + |
def parsed_sentence2xml(sentence, parse_dir, sent_id=None, normalize_sentence=normalize_sentence):
    """Load the ucca passage previously parsed for *sentence*.

    With *sent_id* given, *sentence* is treated as a filename and *sent_id*
    as the sentence number inside it; otherwise the sentence text is looked
    up in the id lists under *parse_dir*.
    """
    if sent_id is not None:
        return file2passage(parse_location(parse_dir, sentence, sent_id))
    location = get_parsed_subdir(sentence, parse_dir)
    sentence_id = get_sentence_id(
        sentence, location, False, normalize_sentence)
    return file2passage(parse_location(location, "", sentence_id))
| 289 | + |
| 290 | + |
def USim(source, sentence, parse_dir, source_id=None, sentence_id=None, normalize_sentence=normalize_sentence):
    """ accepts filename instead of sentence\source and a sentence id\source_sentence id for locating the file"""
    source_blank = align.regularize_word(source) == ""
    sentence_blank = align.regularize_word(sentence) == ""
    if source_blank or sentence_blank:
        # two blank inputs are trivially identical; exactly one blank is a
        # complete mismatch
        return 1 if (source_blank and sentence_blank) else 0
    source_xml = parsed_sentence2xml(
        source, parse_dir, source_id, normalize_sentence)
    sentence_xml = parsed_sentence2xml(
        sentence, parse_dir, sentence_id, normalize_sentence)
    return align.fully_aligned_distance(source_xml, sentence_xml)
| 305 | + |
| 306 | + |
def announce_finish():
    """Best-effort audible notification that the run has finished.

    Uses speech-dispatcher on debian/Ubuntu, `play` elsewhere on linux,
    `say` on macOS, and winsound otherwise. Purely a side-effect helper.
    """
    if sys.platform == "linux":
        # bug fix: platform.linux_distribution() was removed in Python 3.8,
        # so calling it unconditionally raised AttributeError; fall back to
        # an empty result (-> the sound branch) on modern interpreters
        distro_fn = getattr(platform, "linux_distribution", None)
        distro = distro_fn() if distro_fn is not None else ()
        if set(("debian", "Ubuntu")) & set(distro):
            call(['speech-dispatcher'])  # start speech dispatcher
            call(['spd-say', '"your process has finished"'])
        else:
            # perhaps works only in ubuntu?
            Popen(
                ('play --no-show-progress --null --channels 1 synth %s sine %f' % (300, 2)).split())
    elif sys.platform == "darwin":
        call('say "your process has finished"'.split())
    else:
        import winsound
        winsound.Beep(300, 2)
| 321 | + |
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract USim scores')
    parser.add_argument(
        'parse_dir', help="Name of the directory to save and look for parsed sentences")
    parser.add_argument('output_file')
    parser.add_argument('-sf', '--source_files', nargs='+')
    parser.add_argument('-rf', '--reference_files', nargs='+')
    parser.add_argument('-ss', '--source_sentences', nargs='+')
    parser.add_argument('-rs', '--reference_sentences', nargs='+')
    parser.add_argument('-p', "--parser_path")

    args, unknown = parser.parse_known_args()
    PARSER_PATH = args.parser_path
    # inputs must arrive either as file pairs or as sentence pairs
    if not((args.source_files is not None and args.reference_files is not None) or (args.source_sentences is not None and args.reference_sentences is not None)):
        print("please provide sources and references as files or as sentences.")
        # bug fix: previously execution fell through to main(args) after the
        # error message and crashed on the missing arguments
        sys.exit(1)
    main(args)
0 commit comments