forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhrt-encode-tags
executable file
·224 lines (189 loc) · 8.62 KB
/
hrt-encode-tags
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#! /usr/bin/env python3
# -*- mode: Python; -*-
"""
hrt-encode-tags
Encode XML tags and comments in the input HRT as special XML comments,
so that they can be passed through tokenization and restored with
vrt-decode-tags at the correct places, without disturbing the
tokenizer.
By default, the tags for "text" and "paragraph" are preserved
unencoded in the output if the tag begins a line (use
--preserve-tags=taglist to change the default). The tag comment for an
encoded tag is placed after the closest preceding preserved tag.
Consecutive tag comments are in the same order as the corresponding
tags were in the text.
The format of the tag comment is
<!-- #vrt structure-tag: <tag text>|<offset> <left context> <right context> -->
where each space is a single space (U+0020), and the fields have the
following meaning:
tag text: the text in the tag between < and >, including tag name,
attributes and / for an end tag or empty tag; may also contain
"|" (in attribute values or comment text)
offset: the position of the tag in the following text specified as
the number of non-space (content) characters following the list
of comments
left context: (up to) 10 non-space content characters preceding the
tag in the text (may be empty)
right context: (up to) 10 non-space content characters following the
tag in the text (may be empty)
Note: The contexts may contain literal "--" (but no ">"), so the
comment is not necessarily strictly an XML comment.
XML comments in the input are also encoded in the same way, even
though "structure-tag" is somewhat misleading for them.
In the text content from which tags have been removed, whitespace is
preserved, except on lines containing only tags and whitespace: such
lines are removed completely, including the final newline. All
trailing whitespace following a tag is always removed. For example,
the text
ab
<b><c>
<a>cd</a>
<a>ef</a>
</c> </b>
gh
becomes
ab
cd
ef
gh
"""
import re
from itertools import accumulate, groupby
from vrtargsoolib import InputProcessor
from vrtcommentlib import makevrtcomment
class TagEncoder(InputProcessor):

    """Encode XML tags in the input HRT as XML comments.

    Lines that start with a preserved tag (or a "#vrt" comment) are
    written out unchanged. Every other run of consecutive lines is
    joined, its tags and XML comments are cut out of the text and
    written first as "structure-tag" comments (built with
    makevrtcomment), followed by the remaining text.

    The command-line options are declared in ARGSPECS; main() reads
    them as args.preserve_tags, args.spaces and args.hyphens.
    """

    # Runtime text: kept exactly as it is, including the line breaks.
    DESCRIPTION = """
Encode XML tags in the input HRT as special XML comments, so that
they can be passed through tokenization and restored with
vrt-decode-tags.
"""
    ARGSPECS = [
        ('--preserve-tags = taglist "text paragraph"',
         'preserve tags of XML elements listed in taglist if they occur at the'
         ' beginning of a line; taglist is a list of element names separated'
         ' by spaces'),
        # CHECK: By default, the following also matches Unicode spaces,
        # including non-break space. Should it? It might depend on the
        # tokenizer what is most useful.
        ('--spaces|ignore-spaces = regexp "\\s"',
         'ignore characters matching regexp when computing the tag offset and'
         ' context strings; regexp is a Python string (Unicode) regular'
         ' expression'),
        # The following default matches ASCII hyphen-minus and Unicode HYPHEN.
        ('--hyphens|ignore-hyphens = regexp "[-\\u2010]"',
         'ignore characters matching regexp at the end of a word (when'
         ' followed by ignored space or a tag) but not as a word by itself;'
         ' regexp is a Python string (Unicode) regular expression'),
    ]

    def __init__(self):
        super().__init__()

    def main(self, args, inf, ouf):
        """Copy `inf` to `ouf`, replacing tags with structure-tag comments.

        Arguments:
          args: parsed options; the attributes preserve_tags, spaces
              and hyphens are read here
          inf: iterable of input lines as bytes (each line is stripped
              and matched against a bytes pattern, and the lines are
              joined and decoded with the default codec)
          ouf: output object whose write() is called with bytes
        """

        # Tags not to be encoded, including special VRT comments. The
        # element names are escaped so that characters that are valid
        # in XML names but special in regular expressions (such as
        # "." and "-") are matched literally.
        preserve_names = '|'.join(
            re.escape(name) for name in args.preserve_tags.split())
        preserve_tag_re = re.compile(
            b'< ( /? (' + preserve_names.encode() + br')'
            + br' | !-- \s \#vrt \s .* ) [>\s]', re.VERBOSE)
        # Also encode XML comments, even though they are not structural tags,
        # so "#vrt structure-tag" is somewhat misleading. This allows comments
        # in the middle of a line or a token. The resulting VRT comments are
        # not strictly XML comments, as they contain "--" in the middle of the
        # comment.
        # Should we also allow multi-line comments? They would need to be
        # converted to multiple single-line comments.
        # The comment body is matched non-greedily, so that two comments
        # on the same line are encoded separately instead of being merged
        # together with the text between them.
        # Group 1 is the tag text between < and >; group 2 is the
        # whitespace up to and including a newline directly after the
        # tag (empty if the tag is followed by something else).
        tag_re = re.compile(
            r'''< ( (?: (?: /\w+ | \w+ (?: \s [^>]+ )? /? )
                    | !-- .+? -- )
                  ) >
                ( (?: \s*? \n )? )''', re.VERBOSE)
        # Ignore spaces, and hyphens preceded by a non-space and followed by
        # space or end-of-string (a tag). All pattern fragments are raw
        # strings, so that "\Z" and "\S" reach the regular expression
        # compiler as written.
        spaces_pattern = r'(?:' + args.spaces + r')+'
        if args.hyphens:
            ignore_pattern = (
                r'((?<= \S) (?:' + args.hyphens + r')+ )?'
                + r'(?: \Z | ' + spaces_pattern + r')')
        else:
            ignore_pattern = spaces_pattern
        ignore_re = re.compile(ignore_pattern, re.VERBOSE)
        # Matches if the last character of a string is whitespace, also
        # when the string spans several lines
        trailing_space_re = re.compile(r'\s\Z')
        # Maximum number of content characters in each context string
        context_chars = 10

        def is_preservetag(line):
            """Return True if `line` (bytes) begins with a preserved tag."""
            line_strip = line.strip()
            return bool(line_strip.startswith(b'<')
                        and preserve_tag_re.match(line_strip))

        def encode_tags(lines):
            """Return `lines` with tags moved to comments at the front.

            `lines` is a list of bytes lines. If they contain no tags,
            the list is returned as is. Otherwise the result is a list
            of bytes: first one encoded comment for each tag, in the
            order of the tags, then the text between the tags.
            """
            text = b''.join(lines).decode()
            # tag_re has two groups, so the split result repeats the
            # pattern (text, tag, whitespace after tag) and ends with
            # a text part
            parts = tag_re.split(text)
            if len(parts) == 1:
                return lines
            notag_indices = range(0, len(parts), 3)
            tag_indices = range(1, len(parts), 3)
            # Text parts with the ignored characters removed
            notag_parts = [ignore_re.sub('', parts[i])
                           for i in notag_indices]
            notag_text_nospaces = ''.join(notag_parts)
            # tagpos[n] is the number of content characters preceding
            # tag number n
            tagpos = list(accumulate(len(notag_part)
                                     for notag_part in notag_parts))
            result = [
                encode_tag(parts[partnum], notag_text_nospaces,
                           tagpos[tagnum]).encode()
                for tagnum, partnum in enumerate(tag_indices)]
            # None until a non-empty text part has been output. (An
            # integer such as -1 would make parts[-1], the last text
            # part, pass for the previous one.)
            prev_nonempty_partnum = None
            for partnum in notag_indices:
                part = parts[partnum]
                if not part:
                    continue
                # If there is a previous non-empty text part that
                # does not end in whitespace (which includes a
                # newline, and which implies that the part contains
                # non-spaces), and if there is a newline following
                # any of the tags after the previous non-empty text
                # part, add a newline, so that removing the tags does
                # not join text from different lines.
                if prev_nonempty_partnum is not None:
                    prev_nonempty = parts[prev_nonempty_partnum]
                    if (not trailing_space_re.search(prev_nonempty)
                            and any(parts[nl_num].endswith('\n')
                                    for nl_num in range(
                                        prev_nonempty_partnum + 2,
                                        partnum, 3))):
                        result.append(b'\n')
                result.append(part.encode())
                prev_nonempty_partnum = partnum
            # Make sure that the output ends in a newline
            if not result[-1].endswith(b'\n'):
                result.append(b'\n')
            return result

        def encode_tag(tag, notag_text, tagpos):
            """Return a structure-tag comment for `tag` at `tagpos`.

            `notag_text` is the content text without tags and ignored
            characters; `tagpos` is the offset of the tag in it. The
            comment content is the tag text, "|", the offset and the
            left and right context, separated by single spaces.
            """
            # <!-- #vrt structure-tag: span type="add"|7 Ochfrån detta... -->
            return makevrtcomment(
                'structure-tag',
                (tag + '|' + str(tagpos) + ' '
                 + notag_text[max(tagpos - context_chars, 0) : tagpos] + ' '
                 + notag_text[tagpos : tagpos + context_chars]))

        def output_lines(lines):
            """Write each item of `lines` (bytes) to the output."""
            for line in lines:
                ouf.write(line)

        # Handle the input as alternating groups of consecutive lines
        # that begin with a preserved tag (output as they are) and of
        # other lines (tags encoded)
        for is_preservetag_group, group in groupby(inf, is_preservetag):
            lines = list(group)
            if not is_preservetag_group:
                lines = encode_tags(lines)
            output_lines(lines)
if __name__ == '__main__':
    # Executed as a script: create the encoder and call its run()
    # method (not defined in TagEncoder itself, so it comes from the
    # InputProcessor base class).
    encoder = TagEncoder()
    encoder.run()