forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvrt-report-element-size
executable file
·259 lines (215 loc) · 9.32 KB
/
vrt-report-element-size
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#! /usr/bin/env python3
# -*- mode: Python; -*-
from collections import Counter
import sys
from vrtargslib import trans_args, trans_main
from vrtargslib import BadData, BadCode
from vrtdatalib import lineref
from vrtnamelib import xname
from vrtnamelib import isnames, namelist, nameindex
def parsearguments():
    '''Parse the command line and return the resulting namespace.

    The parser comes from trans_args and is extended with the options
    --element, --unit, --word and --summary and with the mutually
    exclusive pair --label and --filename. The returned namespace also
    carries prog (copied from the parser), inplace = False and backup =
    None.

    '''

    # The texts are spelled out as adjacent literals so that they can
    # follow the indentation of the code without gaining any leading
    # whitespace of their own.
    summary = (
        '\n'
        'Report on the distribution of size of a chosen type of element in\n'
        'chosen units. Element can be text, paragraph, or sentence. Unit\n'
        'can be number of tokens, "words", paragraphs, sentences, any meta,\n'
        'comments; "word" is defined as a token that contains at least one\n'
        'letter or digit in the word field.\n'
    )

    # Option name and add_argument settings, in the order in which the
    # options are registered.
    options = (
        ('--element',
         dict(default = 'sentence',
              choices = [ 'sentence', 'paragraph', 'text' ],
              help = ('\n'
                      'type of element to report the length of\n'
                      '(default: sentence)\n'))),
        ('--unit',
         dict(default = 'token',
              choices = [ 'token', 'word',
                          'paragraph', 'sentence',
                          'meta', 'comment' ],
              help = ('\n'
                      'type of unit to count within segment\n'
                      '(default: token)\n'))),
        ('--word',
         dict(default = 'word',
              type = xname,
              help = ('\n'
                      'field to use as word when unit is word\n'))),
        # no default for --summary: None selects the full distribution
        ('--summary',
         dict(choices = [ 'H5', 'V5', 'H11', 'V11', 'H13', 'V13' ],
              help = ('\n'
                      'report the lengths at or just above the\n'
                      'quantile points, horizontally or vertically,\n'
                      'in TSV format: 5-number summaries consist of\n'
                      'quartile points, 11-number summaries of decile\n'
                      'points; 13-number summaries of decile points\n'
                      'with low and high quartile points; horizontal\n'
                      'summaries include also number of observations\n'
                      'and mean length\n'))),
    )

    parser = trans_args(description = summary, inplace = False)
    for option, settings in options:
        parser.add_argument(option, **settings)

    # A row label is either given literally or taken from the input
    # file name, never both.
    rowlabel = parser.add_mutually_exclusive_group()
    rowlabel.add_argument(
        '--label',
        help = ('\n'
                'start each row of the report with the label\n'))
    rowlabel.add_argument(
        '--filename',
        action = 'store_true',
        help = ('\n'
                'start each row with the name of the input\n'
                'file (requires explicit input file)\n'))

    parsed = parser.parse_args()

    # NOTE(review): prog, inplace and backup are presumably read by
    # trans_main; confirm in vrtargslib.
    parsed.prog = parser.prog
    parsed.inplace = False
    parsed.backup = None
    return parsed
def lengthcounts(args, ins):
    '''Return a Counter that maps each observed element size, in the
    chosen units, to the number of elements of that size in the input
    VRT stream.

    args.element selects the element whose size is measured (sentence,
    paragraph, text) and args.unit what is counted inside it (token,
    word, paragraph, sentence, meta, comment). args.word names the
    field that is inspected when the unit is word, and args.prog is
    used in the diagnostic for an unknown unit. ins is an iterable of
    lines.

    Lines that consist of whitespace only are skipped. The size is
    reset at every start tag of the element and recorded at every end
    tag of the element; any other line adds one to the size if it is a
    unit and nothing if it is not.

    Raise BadData if the unit is word and a data line is met before a
    line that isnames recognizes. Exit with status 2 if args.unit is
    none of the known units.

    '''

    begin, end = dict(
        sentence = (('<sentence>', '<sentence '), '</sentence>'),
        paragraph = (('<paragraph>', '<paragraph '), '</paragraph>'),
        text = (('<text>', '<text '), '</text>'),
    ) [args.element]

    needword, wordix = False, None

    if args.unit == 'token':
        def isunit(line):
            'Count every line that is not markup.'
            return not line.startswith('<')
    elif args.unit == 'word':
        needword = True
        def isunit(line):
            '''Count a data line with a letter or digit in the word
            field.'''
            # Markup is tested first: tags and comments are not data,
            # so they must not be reported as data before names.
            if line.startswith('<'):
                return False
            # wordix is rebound by the loop below when names are met.
            if wordix is None:
                raise BadData('data before names')
            return any(c.isalpha() or c.isdigit()
                       for c in lineref(line, wordix))
    elif args.unit == 'paragraph':
        def isunit(line):
            'Count paragraph start tags.'
            return line.startswith(('<paragraph>', '<paragraph '))
    elif args.unit == 'sentence':
        def isunit(line):
            'Count sentence start tags.'
            return line.startswith(('<sentence>', '<sentence '))
    elif args.unit == 'meta':
        def isunit(line):
            'Count every line that starts with <, comments included.'
            return line.startswith('<')
    elif args.unit == 'comment':
        def isunit(line):
            'Count comment lines.'
            return line.startswith('<!--')
    else:
        print(args.prog + ': this cannot happen:',
              'unit ==', repr(args.unit),
              file = sys.stderr)
        # sys.exit rather than the exit that the site module provides
        # for interactive use
        sys.exit(2)

    counts = Counter()
    size = 0
    for line in ins:
        if line.isspace():
            continue
        if needword and isnames(line):
            wordix = nameindex(namelist(line), args.word)
        if line.startswith(begin):
            size = 0
        elif line.startswith(end):
            counts[size] += 1
        else:
            # a bool counts as 0 or 1
            size += isunit(line)

    return counts
def quantilepoints(counts, number, proportions):
    '''Yield, for each proportion in turn, the smallest observed value
    at which the cumulative count reaches that proportion of number.

    counts maps each observed value to its number of occurrences and
    number is the total number of observations. The proportions are
    (numerator, denominator) pairs in increasing order of their ratio;
    they are compared by cross-multiplication, so no division is done.
    One value may be yielded for several consecutive proportions.

    Nothing is yielded if there are no proportions or no observations.

    '''

    remaining = iter(proportions)
    numerator, denominator = next(remaining, (None, None))
    if numerator is None:
        return

    seen = 0
    for size in sorted(counts):
        seen += counts[size]
        # seen / number >= numerator / denominator, in integers
        while numerator * number <= seen * denominator:
            yield size
            numerator, denominator = next(remaining, (None, None))
            if numerator is None:
                return
def shipfull(counts, ous, *, name = None):
    '''Write the full distribution in counts to ous in TSV format: a
    header row Size, Count and then one row for each observed size in
    increasing order.

    If name is given (and not empty), every row starts with name and
    the header row with Label, spelled as in the other report formats.

    '''

    if name:
        print('Label', end = '\t', file = ous)
    print('Size', 'Count', sep = '\t', file = ous)

    for length, count in sorted(counts.items()):
        if name:
            print(name, end = '\t', file = ous)
        print(length, count, sep = '\t', file = ous)
def shipcol(labels, aliases, points, ous, *, name = None):
    '''Write a vertical summary to ous in TSV format: a header row Pt,
    Aka, Size and then one row for each quantile point, made of its
    label, its alias and its value.

    Rows are written for as long as labels, aliases and points all
    last. If name is given (and not empty), every row starts with name
    and the header row with Label.

    '''

    heading = ('Pt', 'Aka', 'Size')
    if name:
        heading = ('Label',) + heading
    print(*heading, sep = '\t', file = ous)

    for label, alias, point in zip(labels, aliases, points):
        fields = (label, alias, point)
        if name:
            fields = (name,) + fields
        print(*fields, sep = '\t', file = ous)
def shiprow(labels, points, total, number, ous, name = None):
    '''Write a horizontal summary to ous in TSV format: a header row
    Total, Nobs, Mean followed by the labels, and a data row of total,
    number, their quotient, and the points.

    If name is given (and not empty), the data row starts with name and
    the header row with Label.

    When number is 0 there is no mean to compute, so the mean is
    reported as nan instead of failing on the division after the header
    row is already written.

    '''

    mean = total / number if number else float('nan')

    if name:
        print('Label', end = '\t', file = ous)
    print('Total', 'Nobs', 'Mean', *labels, sep = '\t', file = ous)

    if name:
        print(name, end = '\t', file = ous)
    print(total, number, mean, *points, sep = '\t', file = ous)
def main(args, ins, ous):
    '''Count the sizes of the chosen elements in ins and write the
    report to ous.

    Without args.summary the full distribution is written. A summary
    whose name starts with V is written vertically, one quantile point
    on each row; any other summary is written horizontally, on one row
    together with the total, the number of observations and the mean.
    The digits in the name of the summary choose the quantile points.

    Rows start with args.label, or with args.infile when args.filename
    is set, if either is given.

    Raise BadData if args.filename is set but there is no args.infile.

    '''

    if args.infile is None and args.filename:
        raise BadData('need input file to use filename as label')

    # Falsy when neither option is used, and then the label column is
    # left out of the report.
    rowlabel = args.label or (args.filename and args.infile)

    counts = lengthcounts(args, ins)
    total = sum(length * count for length, count in counts.items())
    number = sum(counts.values())

    if args.summary is None:
        # The full distribution is labelled like the summaries are.
        shipfull(counts, ous, name = rowlabel)
        return

    # Quantile points as (numerator, denominator), in increasing order;
    # the horizontal and vertical forms of a summary share them.
    proportions = {
        '5' : (# quartile points
            (0,4), (1,4), (2,4), (3,4), (4,4)),
        '11' : (# decile points
            (0,10), (1,10), (2,10), (3,10), (4,10),
            (5,10),
            (6,10), (7,10), (8,10), (9,10), (10,10)),
        '13' : (# decile points with low and high quartile points
            (0,10), (1,10), (2,10), (1,4), (3,10), (4,10),
            (5,10),
            (6,10), (7,10), (3,4), (8,10), (9,10), (10,10)),
    } [args.summary[1:]]

    points = quantilepoints(counts, number, proportions)

    if args.summary.startswith('V'):
        labels, aliases = dict(
            V5 = (('Q0', 'Q1', 'Q2', 'Q3', 'Q4'),
                  ('Min', 'LoQ', 'Med', 'HiQ', 'Max')),
            V11 = (('D0', 'D1', 'D2', 'D3', 'D4',
                    'D5',
                    'D6', 'D7', 'D8', 'D9', 'D10'),
                   ('Min', '_', '_', '_', '_',
                    'Med',
                    '_', '_', '_', '_', 'Max')),
            V13 = (('D0', 'D1', 'D2', 'Q1', 'D3', 'D4',
                    'D5',
                    'D6', 'D7', 'Q3', 'D8', 'D9', 'D10'),
                   ('Min', '_', '_', 'LoQ', '_', '_',
                    'Med',
                    '_', '_', 'HiQ', '_', '_', 'Max')),
        ) [args.summary]
        shipcol(labels, aliases, points, ous,
                name = rowlabel)
        return

    labels = dict(
        # the point at (3,4) is the third quartile: Q3, not Q2
        H5 = ('Min', 'Q1', 'Med', 'Q3', 'Max'),
        H11 = ('Min', 'D1', 'D2', 'D3', 'D4',
               'Med', 'D6', 'D7', 'D8', 'D9',
               'Max'),
        H13 = ('Min', 'D1', 'D2', 'LoQ', 'D3', 'D4',
               'Med', 'D6', 'D7', 'HiQ', 'D8', 'D9',
               'Max'),
    ) [args.summary]
    shiprow(labels, points, total, number, ous,
            name = rowlabel)
    return
# When run as a script, parse the command line and hand the parsed
# arguments, together with main, to trans_main.
# NOTE(review): trans_main presumably opens the input and output
# streams and calls main(args, ins, ous); confirm in vrtargslib.
if __name__ == '__main__':
    trans_main(parsearguments(), main)