forked from koskenni/pytwolc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzerofilled2raw.py
143 lines (129 loc) · 5.32 KB
/
zerofilled2raw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""A script for converting zero-filled examples into examples with raw morphophonemes
Raw morphophonemes are named according to a set of principal forms
(principal parts) which are assumed to reflect all morphophonemic
alternations in a lexeme. The script assumes that the examples within
a lexeme occur all in the same order as for their form.
In order to apply this script to another language, one must revise the
principal_set and the feat2mphons values. The code still relies on
the assumption that the stem comes as the first morph (and thus some
minor modifications would be required in order to handle prefixing
languages).
© Kimmo Koskenniemi, 2018. This is free software under GPL 3 license.
"""
import argparse
argparser = argparse.ArgumentParser(
"python3 zerofilled2raw.py",
description="Forms raw morphophonemes out of zero-filled morphs")
argparser.add_argument(
"input",
default="ksk-zerofilled.csv",
help="zero-filled example words as a CSV file")
argparser.add_argument(
"output",
default="ksk-raw-examp.csv",
help="example word with raw morhpophonemes as a CSV file")
argparser.add_argument(
"affix_info",
default="demo-affix-info.csv",
help="principal forms and morphophonemic affixes as a CSV file")
argparser.add_argument(
"-s", "--morph-separator",
default=".",
help="separator between morphs in the word form")
argparser.add_argument(
"-d", "--csv-delimiter",
default=",",
help="delimiter between the fields")
argparser.add_argument(
"-n", "--name-separator",
default=" ",
help="separator between morpheme names in the morpheme list")
argparser.add_argument(
"-z", "--zero-symbol",
default="Ø",
help="symbol inserted in word forms to align them")
argparser.add_argument(
"-v", "--verbosity",
default=0,
type=int,
help="level of diagnostic and debugging output")
args = argparser.parse_args()
import re
import csv
from collections import OrderedDict
from orderedset import OrderedSet
principal_set = OrderedSet()
""""Set of principal forms or principal parts, i.e. the forms which
uniquely determine the morphophonemic variations that may occur within
the stem """
feat2mphons = {}
# Read in the feature combinations of principal forms and
# the morphophonemic representations of affix features
with open(args.affix_info, "r") as afffil:
affrdr = csv.reader(afffil, delimiter=args.csv_delimiter)
for row in affrdr:
if row[1] == '+':
principal_set.add(row[0])
else:
feat2mphons[row[0]] = row[1]
#print("principal_set =", principal_set)####
#print("feat2mphons =", feat2mphons)####
# Read in the morpheme names and the zero-filled morphs
stem_morpheme_data = OrderedDict()
"""Indexed by stem morpheme name, value is a list of the original data
for that stem morpheme. Each value consists of a tuple of fields
(MORPHEMES, MORPHS, ALIGNED) in the original data."""
with open(args.input, "r") as infil:
rdr = csv.DictReader(infil, delimiter=args.csv_delimiter)
for row in rdr:
names = row["MORPHEMES"].strip()
orig_morphs = row["MORPHS"].strip()
zerof_morphs = row["ZEROFILLED"].strip()
if (not names) or (not zerof_morphs):
continue
name_lst = names.split(args.name_separator, maxsplit=1)
stem_name = name_lst[0]
form_name = " ".join(name_lst[1:]) if len(name_lst) > 1 else ""
zerof_morph_lst = zerof_morphs.split(args.morph_separator, maxsplit=1)
if stem_name not in stem_morpheme_data:
stem_morpheme_data[stem_name] = []
stem_morpheme_data[stem_name].append((form_name, orig_morphs, zerof_morph_lst))
ofil = open(args.output, "w")
writer = csv.DictWriter(ofil, fieldnames=["MORPHEMES", "MORPHS", "ZEROFILLED", "RAW"])
writer.writeheader()
for stem_morpheme, data_lst in stem_morpheme_data.items():
princ_zstem_lst =[]
# select the principal forms of this stem morpheme
for data in data_lst:
form_name, orig_morphs, zerof_morph_lst = data
if form_name in principal_set:
princ_zstem_lst.append(zerof_morph_lst[0])
# print("principal zero-filled stems:[", stem_morpheme, "]", princ_zstem_lst)###
# form the raw morphophonemes by combining corresponding symbols
l = len(princ_zstem_lst[0])
zstem_rawsym_lst = []
for i in range(l):
lst = []
for princ_zstem in princ_zstem_lst:
lst.append(princ_zstem[i])
# print(stem_morpheme, i, lst)###
raw_seq = "".join(lst)
if re.match(r"^(.)(\1)*$", raw_seq):
raw_sym = raw_seq[0] # abbreviate if all identical
else:
raw_sym = "{" + raw_seq + "}"
zstem_rawsym_lst.append(raw_sym)
zstem_pairsym_str = " ".join(zstem_rawsym_lst)
# Output the data augmented with the representation with raw morphophonemes
for data in data_lst:
form_name, orig_morphs, zerof_morph_lst = data
row["MORPHEMES"] = (stem_morpheme + " " + form_name).strip()
row["MORPHS"] = orig_morphs
orig_zerof_morphs = args.morph_separator.join(zerof_morph_lst)
row["ZEROFILLED"] = orig_zerof_morphs
raw_lst = [zstem_pairsym_str]
feat_lst = form_name.split(" ")
for feat in feat_lst:
raw_lst.append(feat2mphons[feat])
row["RAW"] = " ".join(raw_lst)
writer.writerow(row)