-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathhrt-udpipe
executable file
·175 lines (145 loc) · 5.52 KB
/
hrt-udpipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#! /usr/bin/env python3
# -*- mode: Python; -*-
'''Use UDPipe to parse near-plaintext within markup above sentence.
'''
import sys
from itertools import groupby
from queue import Empty as EmptyQueue
from subprocess import Popen, PIPE
from threading import Thread
from vrtargslib import trans_args, trans_main
from vrtdatalib import binescape, binasrecord
from vrtnamelib import binmakenames
from hrtlib import tokenize
from outsidelib import UDPIPE, UDPIPEMODEL
def parsearguments():
    '''Build the command-line parser and return the parsed arguments.

    The parser itself comes from trans_args; this function adds the
    --model option that picks the UDPipe language model. After
    parsing, in-place operation is switched off on the result
    (inplace False, backup None) and the parser's program name is
    stored as args.prog so that diagnostics can be prefixed with it.

    '''
    # The text inside the literals is kept flush left so that the
    # strings handed to the parser stay exactly as they were.
    summary = '''
Segment text data between any (non-sentence) meta into sentences
and tokens, tag and parse, using UDPipe with one of the UD2
models. WIP! Does not cope with UD m.n tokens.
'''
    known_models = [
        'finnish-tdt',
        'finnish-ftb',
        'swedish-talbanken',
        'english-ewt',
        'english-gum',
        # ADD MODELS
    ]

    parser = trans_args(description = summary, inplace = False)
    parser.add_argument(
        '--model',
        choices = known_models,
        default = 'finnish-tdt',
        help = '''
UDPipe language model (default finnish-tdt)
''')

    # output field names are fixed; there is no option to set them

    args = parser.parse_args()
    args.inplace = False
    args.backup = None
    args.prog = parser.prog
    return args
def main(args, inf, ouf):
    '''Run UDPipe as a child process and pipe the input through it.

    args -- parsed command-line arguments; args.model selects the
            model file through the UDPIPEMODEL template
    inf  -- input stream, passed on to tokenize
    ouf  -- output stream, passed on to tokenize

    The child's stdin is closed on the way out, whether or not
    tokenize raised.

    '''
    # Other --output formats that UDPipe offers (among others):
    #   conllu:     id, word, ..., misc (SpacesAfter, SpaceAfter)
    #   vertical:   word only
    #   horizontal: input with normalized spacing
    #   plaintext:  reconstructed input with original spacing
    #   matxin:     whatever it is, it _hangs_ (TO INVESTIGATE)
    #               (still not investigated, and the udpipe version
    #               has been updated since)
    command = [
        UDPIPE, '--immediate',
        '--tokenize', '--tag', '--parse',
        '--output=conllu',
        UDPIPEMODEL.format(args.model),
    ]
    proc = Popen(command, stdin = PIPE, stdout = PIPE, stderr = PIPE)

    # stderr is a pipe, so something has to keep reading it; a
    # watcher thread does that (rather redundant for this tool)
    watcher = Thread(target = watcherr, args = (args, proc))
    watcher.start()

    try:
        # NOTE(review): tokenize presumably feeds inf to proc.stdin
        # and runs the combiner against proc.stdout -- confirm in
        # hrtlib
        tokenize(inf, proc, combiner(args), ouf)
    finally:
        proc.stdin.close()
def combiner(args):
    '''Return a combine(proc, meta, sent, ouf) callable bound to args.

    The returned callable delegates to implement_combine and never
    lets an exception escape: any Exception is reported on stderr,
    prefixed with args.prog, and then dropped.

    '''

    def combine(proc, meta, sent, ouf):
        '''Read proc and meta in synch and write the result to ouf.

        Open question kept from the original author: on an
        exception, should this close proc.stdout, given that proc
        might not even be there any more?

        '''
        try:
            implement_combine(args, proc, meta, sent, ouf)
        except Exception as exn:
            if isinstance(exn, EmptyQueue):
                # the meta queue had nothing to give when asked
                print('{}: combine thread: empty queue'.format(args.prog),
                      file = sys.stderr)
            else:
                print('{}: combine thread:'.format(args.prog), exn,
                      file = sys.stderr)

    return combine
def implement_combine(args, proc, meta, sent, ouf):
    '''Interleave parser output from proc.stdout with queued meta.

    args -- not used here
    proc -- child process; its stdout is read line by line
    meta -- queue-like object (get_nowait, task_done) whose items
            are iterables of lines to be written as such
    sent -- bytes marker identifying a sentinel line
    ouf  -- binary output stream

    First a field-name line is written. Then the output of the
    parser is cut into blocks at whitespace-only lines. A block that
    has nothing but comment lines and lines containing the sentinel
    stands for the next item in meta; any other block is a sentence
    and goes to shipdata. When the parser output ends, one more item
    is taken from meta.

    Whatever meta.get_nowait raises on an empty queue is propagated.

    '''
    ouf.write(binmakenames(b'id word lemma upos xpos feat head rel aux misc'))

    for blank, lines in groupby(proc.stdout, bytes.isspace):
        if blank:
            # separator between blocks
            continue

        # the group must be reified: it is scanned here to recognize
        # a sentinel block and then possibly scanned again in shipdata
        block = tuple(lines)

        # Hope that UDPipe considers the sentinel a sentence of its
        # own, but it may still come with comments
        is_sentinel = all((line.startswith(b'#') or sent in line)
                          for line in block)
        if not is_sentinel:
            shipdata(block, ouf)
            continue

        shipmeta(meta.get_nowait(), ouf)
        meta.task_done()

    # the final meta, but is this a bit racy?
    shipmeta(meta.get_nowait(), ouf)
    meta.task_done()
def shipmeta(lines, ouf):
    '''Write each of the given lines to ouf, unchanged and in order.'''
    for metaline in lines:
        ouf.write(metaline)
def shipdata(lines, ouf):
    '''Write one parsed sentence to ouf inside sentence tags.

    lines -- the lines of one CoNLL-U sentence block (bytes), with
             ten tab-separated fields per token line
    ouf   -- binary output stream

    Comment lines (starting with #) are dropped. A token line is
    written as a tab-separated record, with word and lemma passed
    through binescape. A range line such as

        1-2  Ellei  _
        1    Ell    _
        2    ei     _

    (seen with finnish-ftb-ud) opens a multitoken element that
    carries the range, the misc field and the surface form as
    attributes; the element is closed after the token whose id is
    the end of the range.

    Raises whatever binasrecord or the unpacking raises when a line
    does not have exactly ten fields.

    '''
    ouf.write(b'<sentence>\n')

    # id of the token that closes the currently open multitoken;
    # b'0' is never a token id, so it means that none is open
    end = b'0'

    for line in lines:
        if line.startswith(b'#'):
            continue

        (jd, word, lemma, upos, xpos,
         feat, head, rel, aux, misc) = binasrecord(line)

        if b'-' in jd:
            # The surface form goes inside a double-quoted attribute
            # value, so a double quote in it must be escaped as
            # well. This is done after binescape so that the
            # ampersand of the entity is not escaped again.
            # NOTE(review): assumes binescape takes care of & < > --
            # confirm in vrtdatalib
            form = binescape(word).replace(b'"', b'&quot;')
            ouf.write(b''.join((b'<multitoken id="', jd,
                                b'" misc="', misc,
                                b'" word="', form,
                                b'">\n')))
            _, end = jd.split(b'-')
        else:
            ouf.write(b'\t'.join((jd,
                                  binescape(word),
                                  binescape(lemma),
                                  upos, xpos, feat,
                                  head, rel, aux, misc)))
            ouf.write(b'\n')
            if jd == end:
                ouf.write(b'</multitoken>\n')

    ouf.write(b'</sentence>\n')
def watcherr(args, proc):
    '''Copy the stderr of proc to our own stderr, line by line.

    Every line is written in binary, preceded by the bytes
    b'stderr: '. The args parameter is not used here.

    '''
    tag = b'stderr: '
    for message in proc.stderr:
        errout = sys.stderr.buffer
        errout.write(tag)
        errout.write(message)
if __name__ == '__main__':
    # Input and output are handled as bytes throughout this tool, so
    # ask for both streams in binary.
    arguments = parsearguments()
    trans_main(arguments, main,
               in_as_text = False,
               out_as_text = False)