forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTDPipe
101 lines (76 loc) · 2.56 KB
/
TDPipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#! /bin/bash -e
# ===
# This version of the pipeline tears early, to also protect Marmot.
# ===
# This is *one* way to use VRT tools to build a Finnish pipeline.
# Best run as array jobs in batch, see bin/gamarr for that.
# For convenience, consider a separate log dir for each such job.
# Assuming
# (1) VRT tools in bin/ (e.g., a symbolic link to a repo clone)
# (2) data/ as produced by bin/vrt-pack, q.v.
# (3) default field names
# fills data/ with siblings to *.vrf found there,
# in successive stages of analysis.
# Note! If there be quite inordinate "sentence" lengths, even marmot
# needs protection! and lookup/fillup (omorfi) may need protection
# against (long) words that look like multiple compounds (there is a
# tool to REDACT such). See bin/vrt-report-element-size for sentence
# length distribution, esp. max, and something may be written to
# report word lengths (bin/vrt-keep -n word | wc -L may help ISWIM).
# Stop at the first failing stage.  The shebang already asks for -e, but
# shebang options are dropped when the script is started as
# "bash TDPipe ..." (e.g. from a batch wrapper), so request it explicitly
# as well.  This is a no-op when the shebang was honoured.
set -e

# Exactly one argument is accepted.  Otherwise report, on stderr, how
# many arguments were given and what each one was, and exit with status 2.
if [[ $# -ne 1 ]]; then
  >&2 echo "$0: want 1 argument: got $#:"
  for arg in "$@"; do
    >&2 echo "argument: $arg"
  done
  exit 2
fi
#######################################
# Print one progress line for a pipeline stage that has just finished:
#   YYYY-MM-DD HH:MM:SS IN <stage time> OF <total time> PAST <label>
# where both times are formatted as H:MM:SS.
# Globals:
#   SECONDS (read)     - bash's running-seconds counter
#   TICK (read, write) - value of SECONDS at the previous tick (or at
#                        the start of the run); updated to "now"
#   FULL (written)     - total elapsed time, H:MM:SS
#   TIME (written)     - time since the previous tick, H:MM:SS
# Arguments:
#   $1 - label of the stage that was just completed
# Outputs:
#   the progress line on stdout
#######################################
tick () {
  # Scratch values stay local so they do not leak into the script.
  local now=$SECONDS
  local add=$((now - TICK))
  local stamp

  printf -v FULL '%d:%02d:%02d' $((now / 3600)) $((now / 60 % 60)) $((now % 60))
  printf -v TIME '%d:%02d:%02d' $((add / 3600)) $((add / 60 % 60)) $((add % 60))

  # Only the fixed part goes through date's format string; the label is
  # printed verbatim, so a '%' in it is not taken as a date conversion.
  # A plain assignment keeps a failing date fatal under errexit.
  stamp=$(date '+%F %T')
  printf '%s IN %s OF %s PAST %s\n' "$stamp" "$TIME" "$FULL" "$1"

  TICK=$now
}
# Accept only a path of the shape data/a???/m???.vrf; anything else is
# reported on stderr and ends the run with status 2.
if [[ "$1" != data/a???/m???.vrf ]]; then
  echo "$0: want argument in data/a???/m???.vrf" >&2
  echo "$0: got argument: $1" >&2
  exit 2
fi

vrf=$1

# Start of the stopwatch that tick reports against.
TICK=$SECONDS

# Stage outputs are siblings of the input, named with letter-prefixed
# extensions (Atear, Blookup, ...): the letters make the lexicographic
# order of the names match the order in which they are produced, and
# the current notion of sibling extensions allows only letters.

# Tear first: tear/mend protect marmot/parser and (moderately) long
# "sentences" from each other, where a "sentence" can be most anything.
bin/vrt-simple-tear -I Atear "$vrf"
tick "tear"

# Tagging (lookup, marmot, fillup):
#   word -> marmot.in -> marmot.{pos,feat} -> lemma,pos,feat
# followed by parsing (parse):
#   pos,feat -> id + head + rel
# Each tool is given "-I <previous>/<next>" and the previous stage's
# sibling file (presumably it reads that one and writes the next one;
# see the individual tools to confirm).
prev=Atear
for step in lookup:Blookup marmot:Cmarmot fillup:Dfillup parse:Eparse; do
  tool=${step%%:*}
  next=${step##*:}
  "bin/vrt-tdp-alpha-$tool" -I "$prev/$next" "$vrf.$prev"
  tick "$tool"
  prev=$next
done

# Mend what was torn, on the parsed file itself (-i).
bin/vrt-conll09-mend -i "$vrf.Eparse"
tick "mend"

# Finish: rename id, head, rel to ref, dephead, deprel into the Final
# sibling, then drop/keep fields there so that ref (formerly id) ends
# up at the front, right after word.
bin/vrt-rename -I Eparse/Final \
  -m id=ref,head=dephead,rel=deprel \
  "$vrf.Eparse"
bin/vrt-drop -i --dots "$vrf.Final"
bin/vrt-keep -i -f word,ref --rest "$vrf.Final"
tick "finish"