#! /usr/bin/env python3
# -*- mode: Python; -*-

from datetime import datetime
from itertools import groupby
from queue import Queue
from subprocess import Popen, PIPE
from threading import Thread
import re, sys, traceback

from vrtargslib import trans_args, trans_main
from vrtargslib import BadData
from vrtnamelib import binxname, binxrest
from vrtnamelib import isbinnames as isnames
from vrtnamelib import binnamelist as namelist, nameindex, nameindices
from vrtnamelib import bininsertnames as insertnames
from vrtdatalib import binasrecord as asrecord
from vrtdatalib import binescape as escape, binunescape as unescape
from outsidelib import FINERCMD, HFSTENV

def parsearguments():
    description = '''
    Pass word forms of sentences in the input VRT document through
    the Finnish name-recognition tool FiNER.
    '''
    parser = trans_args(description = description)
    parser.add_argument('--word', '-w', metavar = 'name',
                        type = binxname, default = b'word',
                        help = 'input word field name (default word)')
    # so the defaults are not valid values of the type?
    parser.add_argument('--prefix', '-p', metavar = 'fix',
                        type = binxname, default = b'finer-',
                        help = '''
                        prefix to output field names
                        (default "finer-")
                        ''')
    parser.add_argument('--suffix', '-s', metavar = 'fix',
                        type = binxrest, default = b'',
                        help = 'suffix to output field names')
    parser.add_argument('--tagtools',
                        choices = ['1.3.2', '1.4.0', '1.5.0'],
                        default = '1.5.0',
                        help = 'tagtools version (default 1.5.0)')
    args = parser.parse_args()
    args.prog = parser.prog
    return args
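
# A hedged usage sketch (the file names and redirections are
# illustrative only, assuming FiNER and the finnish-tagtools are
# installed where outsidelib expects them):
#
#   vrt-old-finnish-nertag --tagtools 1.5.0 < input.vrt > output.vrt
#
# would add finer-lemma, finer-msd, finer-aux and finer-nertag fields
# after the word field of each token.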
def message(args, mess):
    print(args.prog + ':', mess, file = sys.stderr)

def terminate(proc):
    try:
        proc.terminate()
    except ProcessLookupError:
        pass
def main(args, ins, ous):
    with Popen([ FINERCMD.format(args.tagtools),
                 '--no-tokenize', '--show-analyses' ],
               env = HFSTENV,
               stdin = PIPE,
               stdout = PIPE,
               stderr = sys.stderr.buffer) as fine:

        copy = Queue()
        Thread(target = combine, args = (args, fine, copy, ous)).start()

        status = 1
        try:
            implement_main(args, ins, fine, copy)
            status = 0
        except BadData as exn:
            message(args, exn)
        except BrokenPipeError as exn:
            message(args, 'broken pipe in main thread')
        except KeyboardInterrupt as exn:
            message(args, 'keyboard interrupt in main thread')
        except Exception as exn:
            print(traceback.format_exc(), file = sys.stderr)

        if status:
            terminate(fine)
        else:
            fine.stdin.close()

        try:
            copy.join()
        except KeyboardInterrupt:
            message(args, 'keyboard interrupt in main thread')
            status = 1

        return status
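
# A rough picture of the data flow set up above (a descriptive note,
# not part of the original tool):
#
#   ins --> implement_main --> fine.stdin  (one token per line, with
#             |                             an empty line after each
#             |                             sentence)
#             +--> copy queue (alternating meta and data groups)
#
#   fine.stdout --> combine thread --> ous (analyses merged back in)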
def implement_main(args, ins, fine, copy):

    # each "word" goes to fine, multi-dash tokens now as a single
    # dash, with an empty line after each sentence; everything goes
    # to copy as alternating groups of meta and data, with the new
    # "lemma", "msd", "aux", "nertag" (or such) in names

    wordix = None

    def send(sentence):
        for k, record in enumerate(sentence, start = 1):
            token = unescape(record[wordix])
            if token.startswith(b'---') and len(set(token)) == 1:
                # underlying tool dumps core on a multi-dash token
                # (a component in tagtools 1.3.2, 1.4.0 at least)
                # https://jira.csc.fi/browse/KP-2704
                fine.stdin.write(b'-\n')
            elif re.fullmatch(br'[A-Z][a-z]+[.]', token):
                # underlying tool dumps core on data approximated by
                # this pattern, which is usually poorly tokenized but
                # occurred in an actual corpus; non-ASCII examples
                # have not been inspected yet, so the pattern may
                # still be too strict
                # (https://jira.csc.fi/browse/KP-2766)
                fine.stdin.write(token.rstrip(b'.'))
                fine.stdin.write(b'\n')
            else:
                fine.stdin.write(token)
                fine.stdin.write(b'\n')
        else:
            # the loop never breaks, so this always writes the
            # sentence-terminating empty line
            fine.stdin.write(b'\n')
        fine.stdin.flush()

    def setnames(line):
        nonlocal wordix
        if isnames(line):
            [wordix] = nameindices(namelist(line), args.word)
            return insertnames(line, args.word,
                               args.prefix + b'lemma' + args.suffix,
                               args.prefix + b'msd' + args.suffix,
                               args.prefix + b'aux' + args.suffix,
                               args.prefix + b'nertag' + args.suffix)
        return line

    def issome(line): return not line.isspace()
    def ismeta(line): return line.startswith(b'<')

    first = True
    groupismeta = False # in case the input is empty
    for groupismeta, group in groupby(filter(issome, ins), ismeta):

        if groupismeta:
            meta = tuple(map(setnames, group))
            copy.put(meta)
            first = False
            continue

        # groupisdata

        if first:
            # there shall always be previous meta
            copy.put(())
            first = False

        if wordix is None:
            raise BadData('error: token before field names')

        sentence = tuple(map(asrecord, group))
        copy.put(sentence)
        send(sentence)

    if not groupismeta:
        # there shall always be final meta
        copy.put(())
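
# An illustrative trace of the grouping above, for a one-sentence
# document (the token values are hypothetical):
#
#   <sentence>   -> meta group:  copy.put((b'<sentence>\n',))
#   Helsinki...  \
#   herasi...     > data group:  copy.put(sentence); send(sentence)
#   </sentence>  -> meta group:  copy.put((b'</sentence>\n',))
#
# so each sentence that the combine thread reads back from fine is
# preceded in the queue by its meta and by its own records.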
def kvpairs(args, pairs):
    '''Rewrite [k1=v1][k2=v2] as k1=v1,k2=v2, assuming that brackets
    and bars do not occur otherwise.
    '''
    return pairs.strip(b'[]').replace(b'][', b',')
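
# For example (an illustration; the keys are made up, not taken from
# actual FiNER output):
#
#   kvpairs(args, b'[Case=Gen][Num=Sg]') == b'Case=Gen,Num=Sg'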
def combine(args, fine, copy, out):
    '''Read finer output (word TAB lemma TAB msd TAB aux TAB nertag NL
    / NL) and flat vrt from the copy queue. Insert the analyses from
    finer into the vrt at the named position.

    This is run as a thread that consumes the finer process and syncs
    it with the copy queue. Preceding meta and corresponding data were
    put in the queue before the data was sent to finer, so they will
    always be there when a sentence is read out of finer.
    '''
    fail = True
    try:
        implement_combine(args, fine, copy, out)
        fail = False
    except BrokenPipeError:
        message(args, 'broken pipe in combine thread')
    except StopIteration:
        # not sure when this can happen now
        message(args, 'stop iteration in combine thread')
    except ValueError as exn:
        # a keyboard interrupt in the main thread has sometimes
        # produced here a readline of a closed file (or at least it
        # did at one stage of development)
        message(args, 'value error in combine thread ' + str(exn))
    finally:
        if fail: terminate(fine)
def implement_combine(args, fine, copy, out):
    '''Thread may find pipe closed.'''

    response = (tokens
                for isempty, tokens
                in groupby(fine.stdout, bytes.isspace)
                if not isempty)

    at = None # word field index, after which to insert the new fields
    for analyses in response:
        meta = copy.get_nowait()
        data = copy.get_nowait()
        copy.task_done()
        copy.task_done()

        for line in meta:
            if isnames(line):
                at = nameindex(namelist(line), args.word)

        if at is None:
            raise BadData('combine thread: data before names')

        shipmeta(meta, out)
        for new, old in zip(analyses, data):
            [form, lemma, msd, aux, nertag] = asrecord(new)
            # insert in reverse so that the new fields end up in the
            # order lemma, msd, aux, nertag after the word field
            old.insert(at + 1, escape(nertag.strip(b'<>')) or b'_')
            old.insert(at + 1, escape(kvpairs(args, aux)) or b'_')
            old.insert(at + 1, escape(kvpairs(args, msd)) or b'_')
            old.insert(at + 1, escape(lemma) or b'_')
        else:
            shipdata(data, out)

        # should this have a timeout? or could it be .get_nowait()?
        # added flush and made get get_nowait because a few processes
        # seemed to make no progress till they timed out - were they
        # stalled here?
        out.flush()

    shipmeta(copy.get_nowait(), out)
    copy.task_done()
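
# A sketch of one finer output record as consumed above (tab-separated
# fields; the values are illustrative, not verified tool output):
#
#   Helsinki  Helsinki  [POS=NOUN][PROP=PROP]  _  <EnamexLocPpl>
#
# from which the new vrt fields would become lemma "Helsinki", msd
# "POS=NOUN,PROP=PROP", aux "_" and nertag "EnamexLocPpl".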
def shipmeta(meta, out):
    for line in meta: out.write(line)
def shipdata(data, out):
    if len(data) > 2000:
        # even marmot appeared to behave badly with long "sentences"
        # (though only with far longer ones than trouble the parser;
        # the present limit of 2000 is taken out of thin air and
        # could probably be much larger, but then it only controls a
        # warning in stderr) [so this check is inherited from a
        # marmot tool]
        sys.stderr.buffer.write(b'shipping data at ')
        sys.stderr.buffer.write(datetime.now().isoformat().encode())
        sys.stderr.buffer.write(b' of len ')
        sys.stderr.buffer.write(str(len(data)).encode())
        sys.stderr.buffer.write(b'\n')
        sys.stderr.buffer.flush()
    for record in data:
        out.write(b'\t'.join(record))
        out.write(b'\n')
if __name__ == '__main__':
    trans_main(parsearguments(), main,
               in_as_text = False,
               out_as_text = False)
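
# To illustrate the overall effect on one positional-attributes line
# (hypothetical input with fields word and ref; the values are made
# up, and TABs are shown as <TAB>):
#
#   before:  Helsinki <TAB> 1
#   after:   Helsinki <TAB> Helsinki <TAB> POS=NOUN,PROP=PROP
#            <TAB> _ <TAB> EnamexLocPpl <TAB> 1
#
# with the four new field names declared in the name comment by
# insertnames.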