forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhrtlib.py
74 lines (60 loc) · 2.32 KB
/
hrtlib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from itertools import groupby
from queue import Queue
from random import choice
from string import ascii_letters
from threading import Thread
from vrtargslib import BadData
from vrtdatalib import binunescape
def tokenize(inf, proc, combine, ouf):
    '''Send the text blocks of inf to a tokenizer process and the markup
    lines to a Queue, while combine runs in a Thread to merge the two.

    Whitespace-only lines of inf are dropped. The remaining lines are
    grouped into alternating runs of markup lines (those that start
    with b'<') and text lines. Each text block is written to
    proc.stdin as the sentinel followed by two newlines, then its
    lines unescaped with binunescape, then one more newline. Each
    markup group is put as a tuple in a Queue (call it 'meta'), with
    a (possibly empty) markup group ensured before the first and
    after the last text block.

    Runs combine(proc, meta, sentinel, ouf) as a Thread that is
    supposed to combine and format to ouf the tokenized output of
    proc and the original markup lines.

    Arguments:

    inf -- iterable of bytes lines

    proc -- the external tokenizer process; only proc.stdin (write,
    close) is used here

    combine -- callable started in a Thread as described above; it
    presumably marks each markup group with meta.task_done(), because
    this function returns only after meta.join() does

    ouf -- passed on to combine as is

    Raises BadData if a markup line starts a sentence start or end
    tag. On any failure while feeding proc, proc.stdin is still
    closed before the exception propagates, so that proc is not left
    waiting for more input.

    '''

    # sentinel does not occur in inf, probably;
    # sentinel is not split by proc, hopefully
    sent = bytes(map(ord, (choice(ascii_letters) for _ in range(16))))

    meta = Queue()

    def send(block):
        # start with the sentinel: when combine can read a sentinel
        # group from proc, the preceding markup group is available in
        # meta because the markup group was put in meta before the
        # text block was sent to proc
        proc.stdin.write(sent)
        proc.stdin.write(b'\n\n')
        for line in map(binunescape, block):
            proc.stdin.write(line)
        proc.stdin.write(b'\n')

    def issome(line):
        return not line.isspace()

    def ismeta(line):
        return line.startswith(b'<')

    def checked(lines):
        # sentence tags are not allowed in the input markup
        if any(line.startswith((b'<sentence ',
                                b'<sentence>',
                                b'</sentence>'))
               for line in lines):
            raise BadData('sentence tag not allowed')
        return lines

    Thread(target=combine, args=(proc, meta, sent, ouf)).start()

    try:
        todo = groupby(filter(issome, inf), ismeta)

        # ensure meta group before first data group;
        # default empty meta group is for empty inf
        kind, group = next(todo, (True, ()))
        if kind:
            meta.put(checked(tuple(group)))
        else:
            meta.put(())
            send(group)

        # meta and data alternate
        for kind, group in todo:
            if kind:
                meta.put(checked(tuple(group)))
            else:
                send(group)

        # ensure meta group after last data group
        if not kind:
            meta.put(())
    except BaseException:
        # let proc see the end of its input even when feeding failed;
        # a secondary error from close must not mask the real one
        try:
            proc.stdin.close()
        except OSError:
            pass
        raise

    proc.stdin.close()
    meta.join()