-
Notifications
You must be signed in to change notification settings - Fork 6
/
onmt.tape
70 lines (54 loc) · 2.1 KB
/
onmt.tape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import "tapes/submitters.tape"
import "tapes/versioners.tape"
import "tapes/dummy.tape"
import "tapes/pipeline.tape"
# ==== pipeline starts here ====
# download all the data to the local directory
import "tapes/download.tape"
# prepare downloaded training data:
# - clean: remove empty lines, remove sentence pairs with large length ratio
# - sample: sample a certain number of lines of data
import "tapes/prepare_train.tape"
# prepare dev/test data, e.g. extract text from sgm:
# - extract dev/test from sgm format, if the wrapping exists
import "tapes/prepare_devtest.tape"
# merge multiple train/dev/test sets
# note that merging of train/dev is mandatory,
# while test is controlled by the branch point `MergeTest`
import "tapes/merge.tape"
# tasks related to tokenize
import "tapes/tokenize.tape"
# tasks related to truecase
import "tapes/truecase.tape"
# tasks related to subword processing
import "tapes/subword.tape"
# the binarize_data, train and decode tasks fit in here in the execution order;
# these are usually the tasks that need to be defined separately for each toolkit
import "tapes/onmt.tape"
# the order here is the reverse of the preprocessing
# usually: debpe -> detruecase -> detokenize
# if you need nist-bleu, you'll have to wrap up output into xml as well
import "tapes/postprocessing.tape"
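# BLEU scoring tasks: nist_bleu and multi_bleu
# NOTE(review): this comment used to list sacrebleu as "to be supported", but the
# `test` plan below already reaches `sacrebleu` -- confirm it is defined in bleu.tape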
import "tapes/bleu.tape"
# ==== pipeline ends here ====
# Plan `test`: the goal tasks are `sacrebleu` and `multi_bleu`, reached through the
# cross-product of the branch-point selections listed after `via`.
# - Most branch points are pinned to a single branch (e.g. SubwordMethod: bpe,
#   TrainDataSource: iwslt_deen_2014).
# - DevDataSource lists two branches and TestDataSource lists three; how they are
#   combined is decided by the imported tapes (presumably merge.tape, with test
#   merging switched on here via MergeTest: yes) -- TODO confirm.
# - `SentencePieceOps: *` uses the glob, i.e. it does not pin a single branch for
#   that branch point.
plan test {
reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) *
(UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) *
(DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) *
(DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) *
(TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) *
(TestMode: no) * (SentencePieceOps: *)
}
# The cleanup plan is still pending for onmt, so it is left commented out for now:
# plan cleanup {
#
# }
# Nuts and bolts: global ducttape settings for this workflow.
# All four flags switch on ducttape features that are marked experimental
# (packages, submitters, imports, multiproc).
# NOTE(review): this file relies on `import` statements and imports
# tapes/submitters.tape and tapes/versioners.tape, so the imports/submitters/packages
# flags are presumably required for it to parse and run -- confirm before removing any.
global {
ducttape_experimental_packages=true
ducttape_experimental_submitters=true
ducttape_experimental_imports=true
ducttape_experimental_multiproc=true
}