|
import os
import sys
import scipy
import argparse
from multiprocessing import Pool
import multiprocessing
from subprocess import call, Popen
import pickle
from functools import reduce
import operator
import platform
import re

# Location of the tupa parser checkout; the bundled ucca package and its
# helper scripts are made importable by extending sys.path below.
TUPA_DIR = '/cs/labs/oabend/borgr/tupa/'
UCCA_DIR = TUPA_DIR + 'ucca'
sys.path.append(UCCA_DIR)
sys.path.append(UCCA_DIR + '/scripts/distances')
sys.path.append(UCCA_DIR + '/ucca')

from ucca.ioutil import file2passage
import codecs

import align
from ucca.ioutil import passage2file
from ucca.convert import from_text

# Number of worker processes used by the reranking Pool.
POOL_SIZE = multiprocessing.cpu_count()
# When True, rerank_by_uccasim reranks the full k-best list; otherwise it
# uses the oracle variant (see rerank_by_uccasim).
full_rerank = True

from tupa.parse import Parser
from tupa.config import Config
Config("")  # NOTE(review): presumably initializes tupa's global config with empty args — confirm

# Cached parser instance and the model path it was loaded from (see get_parser).
PARSER = None
PARSER_PATH = None
# Pickled dict mapping normalized sentences -> xml file numbers in a parse dir.
SENTENCE_ID_FILENAME = "sentenceIds.pkl"
# Empty marker file written once a directory has been parsed successfully.
PARSED_FILE = "parsed"
| 38 | + |
| 39 | + |
def main(args):
    """Compute one USim score per source/reference pair and write the scores,
    one per line, to args.output_file.

    Inputs come either as sentences (args.source_sentences /
    args.reference_sentences, parsed on the fly and looked up by content)
    or as files (args.source_files / args.reference_files, looked up by
    filename and per-line index).
    """
    if args.source_sentences is not None:
        # parse sources and references together so one batch covers both
        ucca_parse_sentences(
            args.source_sentences + args.reference_sentences, args.parse_dir, args.parser_path)
        source_sentences, reference_sentences = args.source_sentences, args.reference_sentences
        res = [str(USim(s, r, args.parse_dir)) + "\n" for s,
               r in zip(source_sentences, reference_sentences)]
    else:
        ucca_parse_files(args.source_files + args.reference_files,
                         args.parse_dir, args.parser_path)

        # one entry per line of each file: filenames are repeated once per
        # sentence they contain, so zip() below pairs lines up positionally
        source_files = []
        source_sentences = []
        for source_file in args.source_files:
            with open(source_file) as fl:
                for line in fl:
                    # source_sentences.append(line.strip())
                    source_files.append(source_file)
        reference_sentences = []
        reference_files = []
        for reference_file in args.reference_files:
            with open(reference_file) as fl:
                for line in fl:
                    # reference_sentences.append(line.strip())
                    reference_files.append(reference_file)
        # NOTE(review): source_sentences stays empty (its append above is
        # commented out), so the reference id is i + 0. Confirm the intended
        # offset — USim receives (filename, filename, source_id, sentence_id)
        # in this branch.
        res = [str(USim(s, r, args.parse_dir, i, i + len(source_sentences))) + "\n" for i, (s,
               r) in enumerate(zip(source_files, reference_files))]

    with open(args.output_file, "w") as fl:
        fl.writelines(res)
| 70 | +# a lot of code duplication because pooling doesn't react well to passing |
| 71 | +# different lambdas as an argument |
| 72 | + |
| 73 | + |
def normalize_sentence(s):
    """Return a canonical form of *s*: runs of non-word characters collapsed
    to a single space, adjacent single letters glued together, lower-cased
    and stripped of surrounding whitespace."""
    collapsed = re.sub(r"\W+", r" ", s)
    glued = re.sub(r"(\s[a-zA-Z])\s([a-zA-Z]\s)", r"\1\2", collapsed)
    return glued.lower().strip()
| 80 | + |
| 81 | + |
def rerank_by_uccasim(gamma=0.27):
    """Rerank the conll14st 100-best system outputs by ucca similarity.

    Writes the chosen sentences and their scores under
    calculations_data/uccasim_rerank/; skipped entirely when the output
    text file already exists.

    NOTE(review): ASSESS_DIR, m2scorer, get_roro_packed, ucca_parse,
    referece_less_full_rerank and referece_less_oracle are not defined or
    imported in this file — presumably provided by a sibling module; confirm
    before calling this function.
    """
    data_dir = ASSESS_DIR + "data" + os.sep
    # only used to extract source sentences
    first_nucle = data_dir + "references/" + "NUCLEA.m2"
    k_best_dir = data_dir + "K-best/"
    system_file = k_best_dir + "conll14st.output.1.best100"
    calculations_dir = "calculations_data/uccasim_rerank/"
    ucca_parse_dir = calculations_dir + "/ucca_parse/"
    full = "full" if full_rerank else ""
    output_file = full + str(gamma) + "_" + "uccasim_rank_results"
    out_text_file = calculations_dir + output_file
    out_res_file = calculations_dir + "score_" + output_file

    if not os.path.isfile(out_text_file):
        gold_file = first_nucle  # only used to extract source sentences
        print("acquiring source")
        source_sentences, _ = m2scorer.load_annotation(gold_file)

        source_sentences = source_sentences
        # load system hypotheses
        fin = m2scorer.smart_open(system_file, 'r')
        system_sentences = [line.strip() for line in fin.readlines()]
        fin.close()

        # presumably groups the flat hypothesis list into one k-best list per
        # source sentence (length is asserted below) — confirm
        packed_system_sentences = get_roro_packed(system_sentences)

        print("parsing")
        # parse all hypotheses plus the sources in one batch
        ucca_parse(reduce(operator.add, packed_system_sentences) +
                   source_sentences, ucca_parse_dir)

        print("reranking")
        # find top ranking
        pool = Pool(POOL_SIZE)
        assert(len(packed_system_sentences) == len(source_sentences))
        if full_rerank:
            results = pool.starmap(referece_less_full_rerank, zip(source_sentences, packed_system_sentences, [
                ucca_parse_dir] * len(packed_system_sentences), [gamma] * len(packed_system_sentences)))
        else:
            results = pool.starmap(referece_less_oracle, zip(source_sentences, packed_system_sentences, [
                ucca_parse_dir] * len(packed_system_sentences), [gamma] * len(packed_system_sentences)))
        pool.close()
        pool.join()
        results = list(results)
        if full_rerank:
            # full rerank yields one list per source; flatten to (sentence,
            # score) pairs before unzipping below
            results = [x for y in results for x in y]
        sentences = "\n".join(list(zip(*results))[0])
        results = list(zip(*results))[1]
        results = "\n".join([str(x) for x in results])

        print("writing to " + out_text_file)
        with codecs.open(out_text_file, "w+", "utf-8") as fl:
            fl.write(sentences)
        with open(out_res_file, "w+") as fl:
            fl.write(results)
| 136 | + |
| 137 | + |
def parse_location(output_dir, filename, sentence_num=None):
    """Return where the parse of *filename* lives under *output_dir*.

    Without *sentence_num* this is the per-file directory; with it, the
    path of that sentence's xml file inside the directory.
    """
    base, _ext = os.path.splitext(os.path.basename(filename))
    subdir = os.path.join(output_dir, base)
    if sentence_num is not None:
        return os.path.join(subdir, str(sentence_num) + ".xml")
    return subdir
| 144 | + |
| 145 | + |
def get_parser(model_path):
    """Return a tupa Parser loaded from *model_path*, caching it in the
    module globals so repeated calls with the same path reuse the already
    loaded model instead of reloading it from disk."""
    global PARSER
    global PARSER_PATH
    # bug fix: 'is not' compared string identity, so an equal-but-distinct
    # path object spuriously reloaded the model; compare by value instead
    if PARSER_PATH != model_path or PARSER is None:
        PARSER_PATH = model_path
        PARSER = Parser(model_path)
    return PARSER
| 153 | + |
| 154 | + |
def ucca_parse_sentences(sentences, output_dir, model_path, clean=False, normalize_sentence=normalize_sentence):
    """Parse *sentences* with tupa, skipping any sentence that a previous
    batch under *output_dir* already parsed.

    New sentences go into a fresh "parse_batchN" subdirectory and are
    registered in that batch's pickled sentence-id list before parsing.

    :param sentences: iterable of raw sentences (duplicates are dropped)
    :param output_dir: root directory holding the parse_batch* subdirectories
    :param model_path: tupa model to load (see get_parser)
    :param clean: forwarded to _ucca_parse_text (removes *.txt leftovers)
    :param normalize_sentence: canonicalization applied before any lookup
    """
    sentences = list(set([normalize_sentence(sentence)
                          for sentence in sentences]))
    output_dir = os.path.realpath(output_dir)
    to_parse = get_parsed_subdirs(sentences, output_dir)
    # keep only the sentences no finished batch already contains
    to_parse = [sent for sent, loc in zip(sentences, to_parse) if loc is None]
    if to_parse:
        # pick the first parse_batch<i> directory with no id list yet
        i = 0
        out_path = os.path.join(output_dir, "parse_batch" + str(i))
        while os.path.isfile(os.path.join(out_path, SENTENCE_ID_FILENAME)):
            i += 1
            out_path = os.path.join(output_dir, "parse_batch" + str(i))
        if not os.path.isdir(out_path):
            os.makedirs(out_path)
        print("Output folder:", out_path)

        for i, sentence in enumerate(to_parse):
            # adds sentences to sentence ids memory
            tmp = get_sentence_id(sentence, out_path, True, normalize_sentence)
            # a fresh batch hands out consecutive ids, so they must line up
            # with the parse output numbering produced by _ucca_parse_text
            assert tmp == i, (tmp, i)
        print(to_parse)
        print("Parsing", len(to_parse), "sentences.", len(
            sentences) - len(to_parse), "sentences already parsed.")
        _ucca_parse_text(to_parse, out_path, "", clean,
                         normalize_sentence, model_path)
    else:
        print("All", len(sentences), "sentences already parsed")
| 182 | + |
| 183 | + |
def ucca_parse_files(filenames, output_dir, model_path, clean=False, normalize_sentence=lambda x: x):
    """Parse each file (one sentence per line) with tupa into the directory
    returned by parse_location(output_dir, filename).

    NOTE(review): when the per-file directory already exists only a message
    is printed, yet the file is still read and re-parsed below — confirm
    whether the intent was to skip already-parsed files.
    """
    output_dir = os.path.realpath(output_dir)
    if filenames:
        for filename in filenames:
            cur_output_dir = parse_location(output_dir, filename)
            if os.path.isdir(cur_output_dir):
                print("File already parsed in", cur_output_dir)
            else:
                os.makedirs(cur_output_dir)
            with open(filename, "r") as fl:
                text = fl.readlines()
            _ucca_parse_text(text, output_dir, filename,
                             clean, normalize_sentence, model_path)
| 197 | + |
| 198 | + |
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence, model_path):
    """Parse *text* (a list of sentences) with tupa and write one xml passage
    file per sentence under parse_location(output_dir, filename).

    A PARSED_FILE marker is written once everything succeeded, so other
    helpers can tell a finished parse directory from a partial one.

    :param clean: when True, remove leftover *.txt files from output_dir
    """
    text = [normalize_sentence(x) for x in text]
    text = from_text(text, split=True, one_per_line=True)
    text = list(text)
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(
            output_dir, filename, i))
    # create an empty file announcing that parsing finished successfully
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # bug fix: the original removed os.path.join(output_dir, item) where
        # 'item' was undefined (NameError on the first .txt found); it also
        # shadowed the 'filename' parameter with the loop variable
        for leftover in os.listdir(output_dir):
            if leftover.endswith(".txt"):
                os.remove(os.path.join(output_dir, leftover))
| 219 | + |
| 220 | + |
# In-memory cache: realpath(parse_dir) -> its sentence-id dict, so each
# pickled id list is read from disk at most once per process (see
# get_sentence_id).
_id_dics = {}
| 222 | + |
| 223 | + |
def get_parsed_subdirs(sentences, parse_dir):
    """For every sentence, return the finished parse subdirectory that
    already contains it, or None when no batch under *parse_dir* does."""
    locations = [None] * len(sentences)
    root = os.path.realpath(parse_dir)
    for subdir, _dirs, files in os.walk(root):
        if PARSED_FILE not in files:
            continue
        for idx, sentence in enumerate(sentences):
            # a sentence found once needs no further lookups
            if locations[idx] is not None:
                continue
            try:
                get_sentence_id(sentence, subdir, False)
            except KeyError:
                continue
            locations[idx] = subdir
    return locations
| 237 | + |
| 238 | + |
def get_parsed_subdir(sentence, parse_dir):
    """Return the first finished parse subdirectory whose id list contains
    *sentence*, or None when it was never parsed under *parse_dir*."""
    root = os.path.realpath(parse_dir)
    for subdir, _dirs, files in os.walk(root):
        finished = PARSED_FILE in files
        has_id_list = any(name.endswith(SENTENCE_ID_FILENAME) for name in files)
        if not (finished and has_id_list):
            continue
        try:
            get_sentence_id(sentence, subdir, False)
        except KeyError:
            continue
        return subdir
| 248 | + |
| 249 | + |
def get_sentence_id(sentence, parse_dir, graceful=True, normalize_sentence=normalize_sentence):
    """Return the sentence id in the parse_dir.

    If graceful is true, assigns (and persists) a new id when the sentence
    is not yet in the ids list; otherwise an unknown sentence raises
    KeyError via the final dict lookup.
    """
    parse_dir = os.path.realpath(parse_dir)
    filename = SENTENCE_ID_FILENAME
    # reserved key tracking the highest id handed out so far
    max_id = "max"
    sentence = normalize_sentence(sentence)
    if parse_dir in _id_dics:
        # in-memory cache hit: avoid re-reading the pickle
        id_dic = _id_dics[parse_dir]
    elif not os.path.isfile(parse_dir + os.sep + filename):
        # no id list on disk yet: start a fresh one (written on first add)
        print("creating a new id list for file", parse_dir + os.sep + filename)
        id_dic = {max_id: -1}
        _id_dics[parse_dir] = id_dic
    else:
        with open(parse_dir + os.sep + filename, "rb") as fl:
            id_dic = pickle.load(fl)
        _id_dics[parse_dir] = id_dic
    if graceful and not sentence in id_dic:
        id_dic[max_id] += 1
        id_dic[sentence] = id_dic[max_id]
        # persist immediately so later runs can reload the updated mapping
        with open(parse_dir + os.sep + filename, "wb+") as fl:
            pickle.dump(id_dic, fl)
    return id_dic[sentence]
| 275 | + |
| 276 | + |
def parsed_sentence2xml(sentence, parse_dir, sent_id=None, normalize_sentence=normalize_sentence):
    """Load the ucca passage previously parsed for *sentence*.

    With *sent_id* given, *sentence* is treated as a filename and *sent_id*
    as the sentence number inside it; otherwise the sentence text is looked
    up in the id lists under *parse_dir*.
    """
    if sent_id is not None:
        return file2passage(parse_location(parse_dir, sentence, sent_id))
    location = get_parsed_subdir(sentence, parse_dir)
    sentence_id = get_sentence_id(
        sentence, location, False, normalize_sentence)
    return file2passage(parse_location(location, "", sentence_id))
| 289 | + |
| 290 | + |
def USim(source, sentence, parse_dir, source_id=None, sentence_id=None, normalize_sentence=normalize_sentence):
    """ accepts filename instead of sentence\source and a sentence id\source_sentence id for locating the file"""
    source_blank = align.regularize_word(source) == ""
    sentence_blank = align.regularize_word(sentence) == ""
    if source_blank or sentence_blank:
        # two blank inputs are trivially identical; exactly one blank is a
        # complete mismatch
        return 1 if (source_blank and sentence_blank) else 0
    source_xml = parsed_sentence2xml(
        source, parse_dir, source_id, normalize_sentence)
    sentence_xml = parsed_sentence2xml(
        sentence, parse_dir, sentence_id, normalize_sentence)
    return align.fully_aligned_distance(source_xml, sentence_xml)
| 305 | + |
| 306 | + |
def announce_finish():
    """Best-effort audible notification that the run has finished.

    Uses speech-dispatcher on debian/Ubuntu, `play` elsewhere on linux,
    `say` on macOS, and winsound otherwise. Purely a side-effect helper.
    """
    if sys.platform == "linux":
        # bug fix: platform.linux_distribution() was removed in Python 3.8,
        # so calling it unconditionally raised AttributeError; fall back to
        # an empty result (-> the sound branch) on modern interpreters
        distro_fn = getattr(platform, "linux_distribution", None)
        distro = distro_fn() if distro_fn is not None else ()
        if set(("debian", "Ubuntu")) & set(distro):
            call(['speech-dispatcher'])  # start speech dispatcher
            call(['spd-say', '"your process has finished"'])
        else:
            # perhaps works only in ubuntu?
            Popen(
                ('play --no-show-progress --null --channels 1 synth %s sine %f' % (300, 2)).split())
    elif sys.platform == "darwin":
        call('say "your process has finished"'.split())
    else:
        import winsound
        winsound.Beep(300, 2)
| 321 | + |
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract USim scores')
    parser.add_argument(
        'parse_dir', help="Name of the directory to save and look for parsed sentences")
    parser.add_argument('output_file')
    parser.add_argument('-sf', '--source_files', nargs='+')
    parser.add_argument('-rf', '--reference_files', nargs='+')
    parser.add_argument('-ss', '--source_sentences', nargs='+')
    parser.add_argument('-rs', '--reference_sentences', nargs='+')
    parser.add_argument('-p', "--parser_path")

    args, unknown = parser.parse_known_args()
    PARSER_PATH = args.parser_path
    # inputs must arrive either as file pairs or as sentence pairs
    if not((args.source_files is not None and args.reference_files is not None) or (args.source_sentences is not None and args.reference_sentences is not None)):
        print("please provide sources and references as files or as sentences.")
        # bug fix: previously execution fell through to main(args) after the
        # error message and crashed on the missing arguments
        sys.exit(1)
    main(args)
0 commit comments