-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvrt-pack
executable file
·271 lines (213 loc) · 8.5 KB
/
vrt-pack
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#! /usr/bin/env python3
# -*- mode: Python; -*-
from argparse import ArgumentTypeError
from itertools import groupby, count
from tempfile import mkstemp
import enum, os, re, sys, traceback # using enum?
import io, os, string, sys
from vrtargslib import version_args
from vrtargslib import BadData, BadCode
from vrtnamelib import isbinnames
def sizetype(text):
    '''Convert a command-line size argument to an integer.

    The text must be a positive decimal number without leading zeroes,
    optionally followed by k (thousands) or M (millions). Anything
    else raises ArgumentTypeError so that argparse reports a usage
    error.

    '''
    matched = re.fullmatch(r'([1-9][0-9]*)(k|M|)', text)
    if matched is None:
        raise ArgumentTypeError('bad size')

    digits, unit = matched.groups()
    multiplier = { '' : 1, 'k' : 1000, 'M' : 1000000 }[unit]
    return int(digits) * multiplier
def parsearguments():
    '''Build the command-line parser, parse the arguments and return
    them, with the program name of the parser stored as args.prog.

    Exactly one of --out/-o and --suffix/-s must be given. At most one
    of --lines/-l, --tokens/-t and --bytes/-b may be given, each
    parsed with sizetype.

    '''

    description = '''
Pack a directory tree of vrt documents (named *.vrt) to
predictably sized files (named *.vrf) in a new directory tree of
fragments sequences for processing, respecting the integrity of
sentence elements and field-name comments. The intention is that
eventual unpacking reproduces the original hierarchy with any new
annotations added to the tokens in the packed fragments.
'''

    parser = version_args(description = description)
    parser.add_argument('indir', help = 'input directory')

    # where the output goes: an explicit directory or indir + suffix
    destination = parser.add_mutually_exclusive_group(required = True)
    destination.add_argument('--out', '-o',
                             dest = 'outdir', metavar = 'outdir',
                             help = 'output directory')
    destination.add_argument('--suffix', '-s', metavar = '.fix',
                             help = 'output in indir.fix/')

    # An --element/-e option (metavar name, "element to keep whole
    # (default sentence)") was considered here; not sure if it belongs
    # to this tool or another tool.

    # how the size of an output file is measured
    sizing = parser.add_mutually_exclusive_group()
    for longopt, shortopt, helptext in (
            ('--lines', '-l',
             'number of lines to reach in each output file'),
            ('--tokens', '-t',
             'number of token lines to reach in each output file'),
            ('--bytes', '-b',
             'number of bytes to reach in each output file')):
        sizing.add_argument(longopt, shortopt, metavar = 'number',
                            type = sizetype,
                            help = helptext)

    parsed = parser.parse_args()
    parsed.prog = parser.prog
    return parsed
# A dirsource produces all *.vrt files found under a given directory,
# recursively, in a lexicographic order of their pathnames.

def dirsource(path, memberpath = ''):
    '''Yield a fragment producer for each *.vrt file under path in some
    sort of lexicographic order of their path names, memberpath stored in
    each fragment for eventual unpacking.

    Directories are descended recursively with their entries sorted by
    name. A regular file with the extension .vrt is opened in binary
    mode and yields one producer made by makesource. Anything else is
    silently ignored.

    The input file is open only while the consumer works on the
    yielded producer, so each producer must be used up before the next
    one is requested.

    '''
    if os.path.isdir(path):
        for name in sorted(os.listdir(path)):
            yield from dirsource(os.path.join(path, name),
                                 os.path.join(memberpath, name))
    elif os.path.isfile(path) and os.path.splitext(path)[1] == '.vrt':
        # The with-statement closes the file also when the consumer
        # raises or abandons the generator while it is suspended at
        # the yield, not only when iteration resumes normally.
        with open(path, mode = 'br') as inf:
            yield makesource(properlines(inf), memberpath)
def properlines(lines):
    '''Yield the given bytes lines without those that consist of
    whitespace only, making sure that each yielded line ends with a
    newline.

    '''
    newline = b'\n'
    for raw in lines:
        if raw.isspace():
            continue
        if not raw.endswith(newline):
            # typically a last line that was not terminated
            raw = raw + newline
        yield raw
def makesource(inf, path):
    '''Return a source of the fragments of the given file together with
    any field-name line found in the file (or None if not found yet -
    it should be in the first fragment, but any further fragment may
    end up in a different output file where the names need to be
    copied so that that file also carries the necessary information.)

    inf is an iterable of bytes lines, path is the member path (str)
    recorded in the source attribute of every fragment begin line.

    The result is a generator of (names, group) pairs. Each group is
    a lazy itertools.groupby group of the lines of one fragment: a
    begin line carrying the fragment number (counted from 1) and the
    path, the content lines up to and including a line that starts
    with </sentence>, and an end line. Lines that remain after the
    last such line form a final fragment. A group must be consumed
    before the next pair is requested.

    '''

    names = None

    # TODO fix any weird characters in source ... (like, sigh) or
    # maybe refuse to deal with any potentially foul play

    # The begin line is assembled from fixed parts around the fragment
    # number. (Substituting the number for a {} placeholder after the
    # path was already in place would also rewrite any {} that occurs
    # in the path itself.)
    beginhead = b'<... fragment="'
    begintail = b'" source="' + path.encode('UTF-8') + b'">\n'
    end = b'</...>\n'

    frak = 1
    def identify(line):
        # Key function for groupby: every line of a fragment gets the
        # same number, the end line included, after which the number
        # is stepped for the next fragment. Also remembers the latest
        # line that isbinnames accepts (presumably a field-name
        # comment in bytes form).
        nonlocal frak, names
        if line == end:
            frak += 1
            return frak - 1
        if isbinnames(line):
            names = line
        return frak

    def bracketed(lines):
        # Wrap the line stream in fragment brackets, closing a
        # fragment after each sentence end tag.
        k = count(start = 1)
        start = True
        for line in lines:
            if start:
                yield beginhead + str(next(k)).encode('UTF-8') + begintail
                start = False
            yield line
            if line.startswith(b'</sentence>'):
                yield end
                start = True
        if not start:
            # close the fragment that was left open
            yield end

    # names is looked up when each pair is made, so a pair carries
    # whatever field-name line has been seen by then
    return ((names, group)
            for kind, group in
            groupby(bracketed(inf), identify))
def membergen():
    '''Generate archive member names from a000/m000.vrf to
    z999/m999.vrf in order: a lowercase ASCII letter and a three-digit
    number name the directory, m and a three-digit number name the
    file in it.

    That is 26 000 directories of 1 000 files each, or 26 000 000
    names, after which the generator is exhausted. Surely that is more
    than will ever be needed? More likely in practice there will be
    only a few directories with the 1 000 files in each, or just one
    with a few dozen files.

    '''
    for letter in string.ascii_lowercase:
        for dirnum in range(1000):
            dirpart = '{}{:03}'.format(letter, dirnum)
            yield from ('{}/m{:03}.vrf'.format(dirpart, filenum)
                        for filenum in range(1000))
# TODO this should not be global
# The one shared generator of member names: dirmember takes the next
# name from it each time it starts a new output file.
membernames = membergen()
# A dirsink creates a directory where the packed fragments go in
# actual files in subdirectories.

def dirsink(args, dirobj, sentinel):
    '''Create the output directory dirobj and return a fragment
    consumer, made by makesink, that writes its members under that
    directory using dirmember.

    The directory must not exist yet: os.mkdir raises if it cannot
    create it.

    '''
    os.mkdir(dirobj)
    consumer = makesink(args, dirobj, dirobj, dirmember, sentinel)
    return consumer
def dirmember(dirobj, dirname, fieldnames):
    '''Start the next output file under dirname. Return the open
    binary output stream together with a procedure that finishes the
    file.

    The file is written under a temporary name in its final
    subdirectory. Calling the returned procedure closes the stream and
    renames the file to the member name that was taken from
    membernames. Neither dirobj nor fieldnames is used here.

    '''
    membername = next(membernames)
    finalpath = os.path.join(dirname, membername)
    subdir = os.path.dirname(finalpath)
    os.makedirs(subdir, exist_ok = True)

    # mkstemp is only used to reserve a unique temporary name
    handle, temppath = mkstemp(suffix = '.tmp',
                               prefix = os.path.basename(membername),
                               dir = subdir)
    os.close(handle)
    out = open(temppath, mode = 'bw')

    def end():
        out.close()
        os.rename(temppath, finalpath)

    # Nothing is written here: there must not be anything outside
    # fragment brackets, so the field names are left to be written
    # after the first fragment bracket of the member.

    return out, end
def makesink(args, target, targetname, member, sentinel):
    '''Return a consumer procedure that takes a field-name line (or
    None) and a fragment (an iterator of bytes lines) and writes the
    fragment to the current output member.

    Size is counted in bytes if args.bytes is set, else in lines if
    args.lines is set, else in lines that do not start with <. The
    limit is the first of args.bytes, args.lines and args.tokens that
    is set, or 100000. A member is finished after the fragment with
    which its size reaches the limit, so fragments are never split.

    member(target, targetname, fieldnames) is called to start each new
    member and must return an output stream and a procedure that
    finishes the member. Passing the sentinel object as the fragment
    finishes any member that is still open.

    '''
    if args.bytes:
        size_in_units = len
    elif args.lines:
        def size_in_units(line):
            return 1
    else:
        def size_in_units(line):
            # count token lines only, not tags or comments
            return 1 - line.startswith(b'<')

    size_limit = args.bytes or args.lines or args.tokens or 100000

    # state of the current member, if any
    out = finish = size = None

    def consumer(fieldnames, fragment):
        nonlocal out, finish, size

        if fragment is sentinel:
            # end of input: finish the member in progress, if any
            if out is not None:
                finish()
            return

        if out is None:
            out, finish = member(target, targetname, fieldnames)
            size = 0

        if size == 0 and fieldnames is not None:
            # insert field names in first fragment of member,
            # right after its begin line
            opening = next(fragment)
            out.write(opening)
            out.write(fieldnames)
            size += size_in_units(opening) + size_in_units(fieldnames)

        for line in fragment:
            out.write(line)
            size += size_in_units(line)

        if size >= size_limit:
            finish()
            out = finish = None

    return consumer
def main(args):
    '''Run implement_main with the parsed arguments. On a keyboard
    interrupt, report the interrupt in standard error and exit with
    status 1 instead of showing a stack trace.

    '''
    try:
        implement_main(args)
    except KeyboardInterrupt:
        print(args.prog + ': keyboard interrupt',
              file = sys.stderr)
        # sys.exit rather than the builtin exit, which the site module
        # provides for interactive use and which may not be there
        sys.exit(1)
def implement_main(args):
    '''Pack the *.vrt files under args.indir as fragments in files in
    a new output directory.

    The output directory is args.outdir if given, else a sibling of
    the input directory named by appending args.suffix to the name of
    the input directory. Exits with status 1, with a message in
    standard error, if the input is not a directory or if the output
    directory cannot be made.

    '''

    indir = os.path.realpath(args.indir)
    outdir = ( os.path.realpath(args.outdir)
               if args.outdir
               else os.path.join(os.path.dirname(indir),
                                 os.path.basename(indir) + args.suffix) )

    # sys.exit is used throughout rather than the builtin exit, which
    # the site module provides for interactive use and which may not
    # be there

    if os.path.isdir(indir):
        source = dirsource(indir)
    else:
        print('{}: error: not a directory: {}'
              .format(args.prog, args.indir),
              file = sys.stderr)
        sys.exit(1)

    # recognized by identity in the sink, never written out
    sentinel = [ b'*** sentinel line ***\n' ]

    try:
        sink = dirsink(args, outdir, sentinel)
    except Exception as exn:
        print('{}: error: could not make output directory'
              .format(args.prog),
              '{}: {}'.format(args.prog, exn),
              sep = '\n', file = sys.stderr)
        sys.exit(1)

    for producer in source:
        for names, fragment in producer:
            sink(names, fragment)

    # finish the last output file, if any
    sink(None, sentinel)
# When run as a script: parse the command line and pass the parsed
# arguments to main.
if __name__ == '__main__':
    main(parsearguments())