#! /usr/bin/env python3
# -*- mode: Python; -*-

"""
vrt-fix-sentbreaks

Fix spurious sentence breaks added to VRT input by the UDPipe Finnish
tokenizer.

The logic of the code is taken from Jussi Piitulainen's unbreak.py for
the CEAL corpus, either directly or slightly modified.
"""
import re
import sys

from itertools import groupby, chain

import vrtdatalib
import vrtnamelib

from vrtargsoolib import InputProcessor


class SentenceBreakFixer(InputProcessor):

    """Fix spurious sentence breaks added by the UDPipe Finnish tokenizer."""

    _default_endpunct = '.!?:'
    """End-of-sentence punctuation marks"""

    _default_dashes = '-–—'
    """Dash characters (hyphen, en dash, em dash)"""

    _default_quotes = '"”“\'’‘«»'
    """Quotation marks: straight, typographic and guillemets"""

    DESCRIPTION = """
    Fix spurious sentence breaks in VRT input added by the UDPipe
    Finnish tokenizer.
    """
    ARGSPECS = [
        ('--comments',
         'output XML comments for removed breaks'),
        ('--verbose',
         'log output to stderr'),
        ('--word|w = name -> word_attr',
         'use positional attribute name as the word-form attribute;'
         ' alternatively, name may be a one-based integer denoting the index'
         ' of the word-form attribute, or a combination of the two, separated'
         ' by a "|" (the latter is used if the former is not found)',
         # The default contains a literal "|", so it cannot be specified
         # on the argspec line itself.
         dict(default="word|1")),
        ('--max-tokens=num :int =200',
         'do not remove a sentence break if the combined sentence would have'
         ' more than num tokens (0 = unlimited)'),
        ('--end-punctuation = chars -> endpunct',
         'consider chars as possible sentence-ending punctuation marks',
         dict(default=_default_endpunct)),
        ('--dashes = chars',
         'consider chars as dashes',
         dict(default=_default_dashes)),
        ('--quotes = chars',
         'consider chars as quotation marks',
         dict(default=_default_quotes)),
    ]
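    # For example, the default "word|1" means: use the positional
    # attribute named "word" if the input declares one, and otherwise
    # fall back to the first positional attribute.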

    def __init__(self):
        super().__init__()

    def main(self, args, inf, ouf):
        LESS_THAN = '<'.encode()[0]
        endpunct = args.endpunct
        dashes = args.dashes
        quotes = args.quotes
        max_token_count = args.max_tokens
        word_attrnum = 0
        sent_token_count = 0
        this_token_count = 0
        # Parts for different types of nobreak labels and comments
        comment_parts = {
            # (label prefix, label suffix, comment prefix, comment suffix)
            'noskip': ('', '', 'Removed ', ''),
            'skip': ('SKIP-', '', 'Skipped removing ', ' (long sentence)'),
            'sure': ('', '', '', ''),
            'unsure': ('', '-ALARM', '', ' (unsure)')
        }
        nobreak_labels = {}
        nobreak_comments = {}
        for skip in ['noskip', 'skip']:
            for sure in ['sure', 'unsure']:
                nobreak_labels[(skip, sure)] = (
                    comment_parts[skip][0] + comment_parts[sure][0]
                    + 'NOBREAK'
                    + comment_parts[sure][1] + comment_parts[skip][1])
                nobreak_comments[(skip, sure)] = (
                    nobreak_labels[(skip, sure)] + ': '
                    + comment_parts[skip][2] + comment_parts[sure][2]
                    + 'sentence break'
                    + comment_parts[sure][3] + comment_parts[skip][3])
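        # The loops expand to four label/comment pairs; for example,
        # ('noskip', 'sure') yields
        #     NOBREAK: Removed sentence break
        # and ('skip', 'unsure') yields
        #     SKIP-NOBREAK-ALARM: Skipped removing sentence break
        #     (unsure) (long sentence)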

        def isbreak(line):
            return (line[0] == LESS_THAN
                    and (line.startswith(b'<sentence')
                         or line.startswith(b'</sentence')))

        def get_wordform(line):
            return vrtdatalib.binlineref(line, word_attrnum).decode()

        def isword(line):
            # mainly all-alphabetic word forms, but also hyphenated
            # ones (alpha-alpha) and forms with 's (alpha's), both of
            # which occurred in the data (alpha-alpha's also matches,
            # if any occur)
            return re.fullmatch(R"\w+(-\w+)?('s)?", get_wordform(line))

        def isdash(line):
            return get_wordform(line) in dashes

        def isquote(line):
            return get_wordform(line) in quotes

        def isendpunct(line):
            wf = get_wordform(line)
            # The word form contains an end-of-sentence punctuation
            # mark and it is _not_ an upper-case letter followed by a
            # full stop, since "A." typically does not end a sentence.
            return (any((c in wf) for c in endpunct)
                    and not (len(wf) == 2 and wf[1] == '.' and wf[0].isupper()))
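
        # isendpunct is true e.g. for '.', '!?', 'etc.' and 'A.B.', but
        # false for the initial 'A.' and for forms with no end punctuation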

        def istitle(line):
            return get_wordform(line).istitle()

        def log(before, kind, after):
            # Log to stderr a few lines of context on each side of the
            # break, with the break kind label in between and "##"
            # marking the start or end of the data
            if args.verbose:
                print('#',
                      *chain((w.decode().rstrip('\n')
                              for w in ([b'##'] + before)[-4:]),
                             [kind],
                             (w.decode().rstrip('\n')
                              for w in (after + [b'##'])[:6])),
                      file=sys.stderr)

        def comment(text):
            if args.comments:
                ouf.write(b'<!-- ' + text.encode() + b' -->\n')

        def count_tokens(lines):
            # Token lines are the ones not beginning with "<"
            return sum(int(line[0] != LESS_THAN) for line in lines)

        def output_removed_break(broken, before, this, sure='sure'):
            # Output information for a removed break (possible comment
            # and log lines), or if the combined sentence would be
            # longer than the maximum length, output the break.
            nonlocal sent_token_count
            # comment('token count: ' + str(sent_token_count))
            if (max_token_count > 0
                    and sent_token_count + this_token_count > max_token_count):
                comment(nobreak_comments[('skip', sure)])
                output_lines(broken)
                log(before, nobreak_labels[('skip', sure)], this)
                sent_token_count = 0
            else:
                comment(nobreak_comments[('noskip', sure)])
                log(before, nobreak_labels[('noskip', sure)], this)

        def output_lines(lines):
            for line in lines:
                ouf.write(line)

        before = ['<outside>']
        broken = []
        before_first_sent = True
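        # groupby(inf, isbreak) alternates between groups of sentence
        # start and end tags (kind is true) and groups of all other
        # lines (kind is false); two or more consecutive sentence tags,
        # typically '</sentence>' immediately followed by
        # '<sentence ...>', form a candidate spurious break between the
        # surrounding token groups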
        for kind, group in groupby(inf, isbreak):
            if kind:
                broken = list(group)
                continue
            this = list(group)
            this_token_count = count_tokens(this)
            # comment('this token count: ' + str(this_token_count))
            if before_first_sent:
                # Find out which positional attribute is the word form,
                # based on args.word_attr and the possible positional-
                # attributes comment before the first sentence
                word_attrnum = vrtnamelib.extract_numnameindex(
                    this, args.word_attr)
                before_first_sent = False
            if len(broken) == 0:
                # before the first sentence
                pass
            elif len(broken) == 1:
                # either just '<sentence ...>' or just '</sentence>' at
                # a larger boundary, so not relevant: ship the break
                output_lines(broken)
                sent_token_count = 0
            elif isword(before[-1]) and isword(this[0]):
                # omit the sentence break for sure - grep out the
                # (NO)BREAK comments to get the final valid output
                output_removed_break(broken, before, this, 'sure')
            elif any(isendpunct(w) for w in before[-2:]):
                # appropriate punctuation before the break, so ship the
                # break without logging (no, there is no certainty of
                # anything ever, but one has to draw the line somewhere
                # - nothing to grep)
                output_lines(broken)
                sent_token_count = 0
            elif ((len(before) > 2 and
                   isdash(before[-2]) and isquote(before[-1]) and
                   istitle(this[0]))
                  or
                  (len(this) > 2 and
                   isdash(this[0]) and isquote(this[1]) and
                   istitle(this[2]))):
                # one side is a dash, a quote and something more, and
                # what follows the dash and quote is capitalized - keep
                # the break with ALARM
                comment(
                    'BREAK-ALARM: Kept sentence break near a dash and quote')
                output_lines(broken)
                log(before, 'BREAK-ALARM', this)
                sent_token_count = 0
            else:
                # omit the sentence break with ALARM - grep out the
                # (NO)BREAK comments to get the final valid output
                output_removed_break(broken, before, this, 'unsure')
            output_lines(this)
            sent_token_count += this_token_count
            # comment('token count: ' + str(sent_token_count))
            before = this


if __name__ == '__main__':
    SentenceBreakFixer().run()