forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvrt-fix-smileys
executable file
·132 lines (104 loc) · 3.99 KB
/
vrt-fix-smileys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#! /usr/bin/env python3
# -*- mode: Python; -*-
from vrtargslib import trans_args, trans_main
from vrtnamelib import xname, namelist, nameindex, isnames
from vrtdatalib import unescape, asrecord
def parsearguments():
    '''Parse the command line of this tool and return the result.

    The parser comes from trans_args and is extended with two
    options, --word/-w and --spaces/-s, that name the word field and
    the spaces field. Both values are converted with xname. The name
    of the program, as known to the parser, is stored in the result
    as its prog attribute.
    '''
    word_help = 'name of the word field [word]'
    spaces_help = '''
name of the spaces field [spaces] that may
indicate that there was no space between the
tokens before tokenization; ignored if there
is no such field
'''

    argparser = trans_args(description = '''
Attempt to combine successive token forms (words) if they seem
intended to be a single "smiley" symbol.
''')
    argparser.add_argument('--word', '-w',
                           type = xname,
                           default = 'word',
                           help = word_help)
    argparser.add_argument('--spaces', '-s',
                           type = xname,
                           default = 'spaces',
                           help = spaces_help)

    parsed = argparser.parse_args()
    parsed.prog = argparser.prog
    return parsed
# Only binary splits are considered, and nothing guards against
# undesired fusing with further material, so this remains quite
# approximate. A smiley that was split in three, such as ':' '-' ')',
# is probably missed in a single pass; running the tool twice might
# work, because the first pass glues the partial ':-' and the second
# pass can then glue it with a following ')'. The real solution, of
# course, is to have the tokenizer handle these, and others like
# these. Incidentally, the UDPipe Finnish model seems to keep ':-'
# together already, so that a ':-)' got fixed in one go (and some of
# the basic smileys need no fixing, while some are harder).
# The known two-part splits of a smiley, as (first part, second part)
# pairs of unescaped word forms. The table below groups the second
# parts under their first part; the result is a flat set of pairs.
splits = {
    (first, second)
    for first, seconds in {
        '<': ('3',),
        ':': (')', '))', ')))', '-', '-)', '(',
              'D', 'DD', 'o', 'O', 'P'),
        ':)': (')', '))'),
        ':))': (')',),
        ':-': (')', '))', '('),
        ':-)': (')',),
        ';': ('-', ')', '))', '-)', 'D', 'DD', 'P'),
        ';)': (')',),
        '=': (')', 'D', 'E', 'f', 'O', 'P'),
        '=)': (')',),
    }.items()
    for second in seconds
}

# Every word form that occurs as the first part of some known split.
initials = { pair[0] for pair in splits }
def main(args, ins, ous):
    '''Copy the lines of ins to ous, joining split smileys.

    A token whose word form is the first part of a known split (see
    splits and initials) is joined with the immediately following
    token when the two word forms together make up such a split. The
    joined token has the concatenated word form and, when there is a
    spaces field, the spaces value of the second token; its other
    fields are those of the first token. All other lines are written
    as they were read, except that whitespace-only lines are dropped.

    args -- parsed arguments; args.word and args.spaces name the
            word field and the spaces field
    ins  -- iterable of input lines (str)
    ous  -- output stream that the lines are printed to

    The positions of the fields are only set when a field-names line
    is seen, so such a line has to come before the first token.
    '''

    # whitespace-only lines never reach the output
    lines = (line for line in ins if not line.isspace())

    # lines is also advanced inside the loop, to look one line ahead
    for line in lines:

        if isnames(line):
            # field-names line: (re)locate the fields of interest;
            # sp stays None when there is no spaces field
            names = namelist(line)
            wp = nameindex(names, args.word)
            sp = (nameindex(names, args.spaces)
                  if args.spaces in names
                  else None)
            print(line, end = '', file = ous)
            continue

        if line.startswith('<'):
            # any other line that starts with '<' is not a token
            print(line, end = '', file = ous)
            continue

        # line is a data line (a token)
        record = asrecord(line)

        if sp is not None and 'SpaceAfter=No' not in record[sp]:
            # the spaces field does not say that the next token
            # followed without a space, so do not join
            print(line, end = '', file = ous)
            continue

        if unescape(record[wp]) not in initials:
            # word is not the first part of any known split
            print(line, end = '', file = ous)
            continue

        # line may start a smiley, so look at the next line
        line2 = next(lines, None)

        if line2 is None:
            # the input ended with this token; the loop ends next
            print(line, end = '', file = ous)
            continue

        if line2.startswith('<'):
            # line2 may be a positional-attributes comment but that
            # is merely redundant and need not be examined again
            print(line, line2, sep = '', end = '', file = ous)
            continue

        record2 = asrecord(line2)
        if (unescape(record[wp]), unescape(record2[wp])) in splits:
            record[wp] += record2[wp]
            if sp is not None:
                # without a spaces field there is nothing to carry
                # over (indexing with None would raise TypeError)
                record[sp] = record2[sp]
            print(*record, sep = '\t', file = ous)
            continue

        # was not a split smiley
        print(line, line2, sep = '', end = '', file = ous)
if __name__ == '__main__':
    # run as a script: parse the command line, then hand the parsed
    # arguments and main over to trans_main
    arguments = parsearguments()
    trans_main(arguments, main)