forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvrt-dehype
executable file
·106 lines (89 loc) · 3.98 KB
/
vrt-dehype
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/env python3
# -*- mode: Python; -*-
# This script attempts to remove line-breaking hyphens (and page numbers)
# in plaintext within VRT markup. The input plaintext itself is not yet
# in the VRT form, only the markup around it is assumed to be.
# An important structural assumption is that tag lines start with < and
# other lines do not start with <. The intended mechanism to achieve that
# is to have < encoded as &lt; in content lines.
# Do not use re.IGNORECASE while we have Python 3 at 3.4 in Taito.
import argparse, os, re, sys
from html import unescape, escape
from itertools import groupby
import signal
# Reset SIGINT and SIGPIPE to the operating system's default action
# (signal.SIG_DFL) instead of the dispositions Python sets up at startup.
# NOTE(review): presumably so that Ctrl-C or a closed output pipe ends
# the script quietly rather than with a Python traceback -- confirm.
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
# Line-breaking hyphen: two letters, then a hyphen immediately followed
# by a newline, then two more letters. Only a-z, A-Z and ä/ö/Ä/Ö count
# as letters here. Group 1 is the letter pair before the hyphen and
# group 2 the pair after the newline; the hyphen and the newline are
# matched but not captured. Whitespace inside the pattern is ignored
# because of re.VERBOSE. dehyphenate passes each match to hyper or
# debug_hyper.
hyphen = re.compile(R'''
([a-zäöA-ZÄÖ]{2}) -\n
([a-zäöA-ZÄÖ]{2})
''', re.VERBOSE)
# Page number inside running text. Group 1 is two letters optionally
# followed by a comma or a hyphen; then come one or more newlines,
# a run of digits alone on its line (group 2), one or more newlines,
# and two more letters (group 3). Whitespace inside the pattern is
# ignored because of re.VERBOSE. When page-number removal is requested,
# dehyphenate passes each match to number or debug_number.
page = re.compile(R'''
([a-zäöA-ZÄÖ]{2} [,\-]?) \n+
(\d+) \n+
([a-zäöA-ZÄÖ]{2})
''', re.VERBOSE)
# Letters treated as vowels when deciding whether a hyphen is kept.
vowels = 'aeiouyäöAEIOUYÄÖ'


def hyper(hype):
    """Return the replacement text for one line-breaking hyphen match.

    hype is a match object whose groups 1 and 2 hold the two letters
    before and the two letters after the hyphen-newline. The newline is
    always dropped. The hyphen is kept only when the letters meeting at
    the break are one and the same vowel; otherwise the two halves are
    joined directly.
    """
    before = hype.group(1)
    after = hype.group(2)
    # Same letter on both sides of the break, and that letter is a vowel.
    if before[1] == after[0] and after[0] in vowels:
        return '{}-{}'.format(before, after)
    return '{}{}'.format(before, after)
def debug_hyper(hype):
    """Return a visibly marked replacement for one hyphen match.

    Makes the same decision as hyper, but wraps the result in HYP(..)
    markers: 'HYP(xx)-(yy)' when the hyphen is kept (the same vowel on
    both sides of the break), 'HYP(xx)(yy)' when it is removed.
    """
    before = hype.group(1)
    after = hype.group(2)
    # Same letter on both sides of the break, and that letter is a vowel.
    if before[1] == after[0] and after[0] in vowels:
        return 'HYP({})-({})'.format(before, after)
    return 'HYP({})({})'.format(before, after)
def number(numb):
    """Return the replacement text for one page-number match.

    numb is a match object with three groups: the text before the page
    number (two letters and an optional punctuation mark), the page
    number itself, and the two letters after it. The page number and
    the surrounding runs of newlines are replaced by a single newline
    between the first and the third group.
    """
    # Group 2 (the page number) is deliberately not read: it is dropped,
    # and binding it to a local named "page" would shadow the
    # module-level pattern of the same name.
    return '{}\n{}'.format(numb.group(1), numb.group(3))
def debug_number(numb):
    """Return a visibly marked replacement for one page-number match.

    Produces the same text as number (group 1, a newline, group 3)
    wrapped in a NUM(..) marker so that the removal can be spotted in
    the output.
    """
    # Group 2 (the page number) is deliberately not read: it is dropped,
    # and binding it to a local named "page" would shadow the
    # module-level pattern of the same name.
    return 'NUM({}\n{})'.format(numb.group(1), numb.group(3))
def dehyphenate(source, target, *, denum = False, debug = False):
    """Copy source to target, repairing the plaintext between tag lines.

    source is iterated line by line, and consecutive lines are grouped
    by whether they start with '<'. Runs of tag lines are written
    unchanged. Each run of other lines is joined into one string, so
    the patterns can match across line ends. If denum is true, page
    numbers are removed from it first; line-breaking hyphens are then
    removed in every case. With debug true the debug_* replacement
    functions are used, so removals are marked in the output.
    """
    def is_tag(line):
        return line.startswith('<')

    for tagged, lines in groupby(source, is_tag):
        text = ''.join(lines)
        if not tagged:
            if denum:
                text = page.sub(debug_number if debug else number, text)
            text = hyphen.sub(debug_hyper if debug else hyper, text)
        print(text, end = '', file = target)
def main():
    """Parse the command line and run dehyphenate on the chosen streams.

    Input is the optional FILE argument (default stdin) and output the
    --out/-o file (default stdout); files named on the command line are
    opened as UTF-8. --denum and --debug are passed on to dehyphenate.
    With --version a version indicator is printed and the process exits
    with status 0 without processing any input.
    """
    # argparse re-wraps the description when it prints the help, so the
    # line layout of this literal does not affect the output.
    parser = argparse.ArgumentParser(description = '''
Attempts to remove hyphens at line ends, and optionally
page numbers between lines, in plaintext blocks within VRT
markup. This may or may not be an improvement over the
input. Errors of omission and of commission are to be
expected. The question is how many they are.''')
    parser.add_argument('arg', metavar = 'FILE', nargs = '?',
                        type = argparse.FileType('r', encoding = 'UTF-8'),
                        default = sys.stdin,
                        help = 'input file (default stdin)')
    parser.add_argument('--out', '-o', metavar = 'outfile',
                        type = argparse.FileType('w', encoding = 'UTF-8'),
                        default = sys.stdout,
                        help = 'output file (default stdout)')
    parser.add_argument('--denum', action = 'store_true',
                        help = 'first attempt to remove some page numbers'
                        ' that occur within a sentence (even within a word)')
    parser.add_argument('--debug', action = 'store_true',
                        help = 'indicate removals with HYP(..)(..)'
                        ' and NUM(..\\n..)')
    parser.add_argument('--version', action = 'store_true',
                        help = 'print a version indicator and exit')
    args = parser.parse_args()

    if args.version:
        print('vrt-dehype 0.1a (FIN-CLARIN 2018)')
        # sys.exit rather than the bare exit(): the latter is injected
        # by the site module for interactive use and is not guaranteed
        # to exist (for example under "python -S").
        sys.exit(0)

    # Both streams are closed when the block is left.
    with args.arg as source, args.out as target:
        dehyphenate(source, target, debug = args.debug, denum = args.denum)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()