forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhrt-tokenize-udpipe
executable file
·167 lines (137 loc) · 5.3 KB
/
hrt-tokenize-udpipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#! /usr/bin/env python3
# -*- mode: Python; -*-
'''Use UDPipe to segment the text between markup above the sentence
level into sentences and tokens.
'''
import sys
from itertools import groupby
from queue import Empty as EmptyQueue
from subprocess import Popen, PIPE
from threading import Thread
from vrtargslib import trans_args, trans_main
from vrtdatalib import binescape, binasrecord
from vrtnamelib import binmakenames
from hrtlib import tokenize
from outsidelib import UDPIPE
from outsidelib import UDPIPEMODEL as MODEL
def parsearguments():
    '''Parse the command line and return the argument namespace.

    The parser comes from trans_args; this tool adds --model to it.
    After parsing, inplace, backup and prog are set explicitly on the
    namespace before it is returned.

    '''
    models = [
        'finnish-tdt',
        'finnish-ftb',
        'swedish-talbanken',
        'english-ewt',
        'english-gum',
    ]

    # The text inside the literals below is kept flush left on purpose
    # so that the strings themselves carry no indentation.
    parser = trans_args(inplace=False, description='''
Segment text data between any (non-sentence) meta into sentences
and tokens using UDPipe with one of the UD2 models.
''')
    parser.add_argument('--model',
                        default='finnish-tdt',
                        choices=models,
                        help='''
UDPipe language model (default finnish-tdt)
''')

    # There are no options for setting the output field names (yet).

    parsed = parser.parse_args()
    # Fixed values: this tool does not work in place and keeps no backup.
    parsed.inplace = False
    parsed.backup = None
    parsed.prog = parser.prog
    return parsed
def main(args, inf, ouf):
    '''Start UDPipe as a tokenizer and run the input through it.

    UDPipe is started with the model selected by args.model and with
    all three standard streams as pipes. A thread running watcherr
    reads its stderr. The input stream, the process, the function
    made by combiner(args) and the output stream are then handed to
    tokenize. The stdin of the process is closed when tokenize
    returns or raises.

    '''
    # Notes on UDPipe --output formats (among others):
    #   conllu:     id, word, ..., misc (SpacesAfter, SpaceAfter)
    #   vertical:   word only
    #   horizontal: input with normalized spacing
    #   plaintext:  reconstructed input with original spacing
    #   matxin:     whatever it is, it _hangs_ (TO INVESTIGATE)
    #               (still not investigated and udpipe version updated)
    command = [
        UDPIPE,
        '--immediate',
        '--tokenize',
        '--output=conllu',
        MODEL.format(args.model),
    ]
    proc = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)

    # proc.stderr is a pipe, so something has to read it (even if
    # that is rather redundant for this particular tool).
    errwatcher = Thread(target=watcherr, args=[args, proc])
    errwatcher.start()

    try:
        tokenize(inf, proc, combiner(args), ouf)
    finally:
        proc.stdin.close()
def combiner(args):
    '''Return a combine function that is bound to the given args.

    The returned function runs implement_combine and reports any
    exception on sys.stderr, prefixed with args.prog, instead of
    letting the exception propagate.

    '''

    def combine(proc, meta, sent, ouf):
        '''Read proc and meta in synch and write to the output stream.

        Open question: on an exception, should this close
        proc.stdout? What if proc is not even there any more?

        '''
        try:
            implement_combine(args, proc, meta, sent, ouf)
        except EmptyQueue:
            # a meta block was expected but the queue had none
            message = '{}: combine thread: empty queue'.format(args.prog)
            print(message, file=sys.stderr)
        except Exception as exn:
            prefix = '{}: combine thread:'.format(args.prog)
            print(prefix, exn, file=sys.stderr)

    return combine
def implement_combine(args, proc, meta, sent, ouf):
    '''Interleave UDPipe output with the queued meta blocks.

    Writes a field-name line first. Then reads proc.stdout in runs of
    non-blank lines. A run that consists only of comment lines and
    lines containing the sentinel sent stands for a meta block, which
    is taken from the meta queue and written out. Any other run is
    written out as a sentence. One more meta block is taken from the
    queue after the output of the process ends.

    meta.get_nowait() raises if the queue is empty at that moment.

    '''
    # The token number is called "wid", not "id", so that the output
    # can be piped to a different tool that produces an "id" by
    # default (notably an UDPipe parser). Or should the number be
    # omitted here altogether?
    ouf.write(binmakenames(b'wid word spaces'))

    def is_sentinel(block):
        # Hope that UDPipe considers the sentinel a sentence of its
        # own, though it may still come with comment lines.
        return all(line.startswith(b'#') or sent in line
                   for line in block)

    for blank, run in groupby(proc.stdout, bytes.isspace):
        if blank:
            continue

        # The run is traversed twice (once to recognize a sentinel,
        # once to ship the data), so it must be reified.
        block = tuple(run)

        if is_sentinel(block):
            shipmeta(meta.get_nowait(), ouf)
            meta.task_done()
        else:
            shipdata(block, ouf)

    # The final meta block. But is this a bit racy?
    shipmeta(meta.get_nowait(), ouf)
    meta.task_done()
def shipmeta(lines, ouf):
    '''Write the given lines to ouf, in order and unchanged.'''
    ouf.writelines(lines)
def shipdata(lines, ouf):
    '''Write one sentence of UDPipe output to ouf as a sentence element.

    lines are the lines (bytes) of one sentence in the CoNLL-U output
    of UDPipe. Lines that start with # are skipped. Every other line
    is made a record with binasrecord, and its fields 0, 1 and 9 are
    used as the id, the word and the spaces.

    An ordinary token is written as a row of id, word and spaces,
    separated by tabs. A range id, such as 1-2, opens a multitoken
    element that carries the id, the spaces and the word as
    attributes. The element is closed after the token whose id is
    the end of the range.

    Double quotes in the attribute values are written as &quot; so
    that a quote in the word or in the spaces cannot end the
    attribute value early.

    '''

    def attribute(value):
        # binescape is applied as in the token rows; the quote is
        # escaped on top of that because the value goes between
        # double quotes here. (If binescape already escapes quotes,
        # the replace finds nothing to do.)
        return binescape(value).replace(b'"', b'&quot;')

    ouf.write(b'<sentence>\n')

    # The id that closes the open multitoken; no token is expected
    # to have the id 0, so this initial value means "none open".
    end = b'0'

    for line in lines:
        if line.startswith(b'#'):
            continue

        record = binasrecord(line)
        jd, word, spaces = record[0], record[1], record[9]

        if b'-' in jd:
            # A multiword token spans the tokens that follow it:
            #   1-2 Ellei _
            #   1   Ell   _
            #   2   ei    _
            #   ...
            #   6-7 miksei _
            #   ...
            # (with finnish-ftb-ud)
            ouf.write(b''.join((b'<multitoken id="', jd,
                                b'" spaces="', attribute(spaces),
                                b'" word="', attribute(word),
                                b'">\n')))
            _, end = jd.split(b'-')
        else:
            ouf.write(b''.join((jd, b'\t',
                                binescape(word), b'\t',
                                binescape(spaces), b'\n')))
            if jd == end:
                ouf.write(b'</multitoken>\n')

    ouf.write(b'</sentence>\n')
def watcherr(args, proc):
    '''Copy proc.stderr to sys.stderr, marking each line with "err: ".

    The lines are read and written as bytes. args is not used.

    '''
    for line in proc.stderr:
        sys.stderr.buffer.write(b'err: ' + line)
if __name__ == '__main__':
    # Both the input and the output are handled as bytes, not text.
    trans_main(
        parsearguments(),
        main,
        out_as_text=False,
        in_as_text=False,
    )