forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathprepare-iwslt17-multilingual.sh
133 lines (122 loc) · 4.28 KB
/
prepare-iwslt17-multilingual.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
SRCS=(
"de"
"fr"
)
TGT=en
ROOT=$(dirname "$0")
SCRIPTS=$ROOT/../../scripts
SPM_TRAIN=$SCRIPTS/spm_train.py
SPM_ENCODE=$SCRIPTS/spm_encode.py
BPESIZE=16384
ORIG=$ROOT/iwslt17_orig
DATA=$ROOT/iwslt17.de_fr.en.bpe16k
mkdir -p "$ORIG" "$DATA"
TRAIN_MINLEN=1 # remove sentences with <1 BPE token
TRAIN_MAXLEN=250 # remove sentences with >250 BPE tokens
URLS=(
"https://wit3.fbk.eu/archive/2017-01-trnted/texts/de/en/de-en.tgz"
"https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz"
)
ARCHIVES=(
"de-en.tgz"
"fr-en.tgz"
)
VALID_SETS=(
"IWSLT17.TED.dev2010.de-en IWSLT17.TED.tst2010.de-en IWSLT17.TED.tst2011.de-en IWSLT17.TED.tst2012.de-en IWSLT17.TED.tst2013.de-en IWSLT17.TED.tst2014.de-en IWSLT17.TED.tst2015.de-en"
"IWSLT17.TED.dev2010.fr-en IWSLT17.TED.tst2010.fr-en IWSLT17.TED.tst2011.fr-en IWSLT17.TED.tst2012.fr-en IWSLT17.TED.tst2013.fr-en IWSLT17.TED.tst2014.fr-en IWSLT17.TED.tst2015.fr-en"
)
# download and extract data
for ((i=0;i<${#URLS[@]};++i)); do
ARCHIVE=$ORIG/${ARCHIVES[i]}
if [ -f "$ARCHIVE" ]; then
echo "$ARCHIVE already exists, skipping download"
else
URL=${URLS[i]}
wget -P "$ORIG" "$URL"
if [ -f "$ARCHIVE" ]; then
echo "$URL successfully downloaded."
else
echo "$URL not successfully downloaded."
exit 1
fi
fi
FILE=${ARCHIVE: -4}
if [ -e "$FILE" ]; then
echo "$FILE already exists, skipping extraction"
else
tar -C "$ORIG" -xzvf "$ARCHIVE"
fi
done
echo "pre-processing train data..."
for SRC in "${SRCS[@]}"; do
for LANG in "${SRC}" "${TGT}"; do
cat "$ORIG/${SRC}-${TGT}/train.tags.${SRC}-${TGT}.${LANG}" \
| grep -v '<url>' \
| grep -v '<talkid>' \
| grep -v '<keywords>' \
| grep -v '<speaker>' \
| grep -v '<reviewer' \
| grep -v '<translator' \
| grep -v '<doc' \
| grep -v '</doc>' \
| sed -e 's/<title>//g' \
| sed -e 's/<\/title>//g' \
| sed -e 's/<description>//g' \
| sed -e 's/<\/description>//g' \
| sed 's/^\s*//g' \
| sed 's/\s*$//g' \
> "$DATA/train.${SRC}-${TGT}.${LANG}"
done
done
echo "pre-processing valid data..."
for ((i=0;i<${#SRCS[@]};++i)); do
SRC=${SRCS[i]}
VALID_SET=(${VALID_SETS[i]})
for ((j=0;j<${#VALID_SET[@]};++j)); do
FILE=${VALID_SET[j]}
for LANG in "$SRC" "$TGT"; do
grep '<seg id' "$ORIG/${SRC}-${TGT}/${FILE}.${LANG}.xml" \
| sed -e 's/<seg id="[0-9]*">\s*//g' \
| sed -e 's/\s*<\/seg>\s*//g' \
| sed -e "s/\’/\'/g" \
> "$DATA/valid${j}.${SRC}-${TGT}.${LANG}"
done
done
done
# learn BPE with sentencepiece
TRAIN_FILES=$(for SRC in "${SRCS[@]}"; do echo $DATA/train.${SRC}-${TGT}.${SRC}; echo $DATA/train.${SRC}-${TGT}.${TGT}; done | tr "\n" ",")
echo "learning joint BPE over ${TRAIN_FILES}..."
python "$SPM_TRAIN" \
--input=$TRAIN_FILES \
--model_prefix=$DATA/sentencepiece.bpe \
--vocab_size=$BPESIZE \
--character_coverage=1.0 \
--model_type=bpe
# encode train/valid
echo "encoding train with learned BPE..."
for SRC in "${SRCS[@]}"; do
python "$SPM_ENCODE" \
--model "$DATA/sentencepiece.bpe.model" \
--output_format=piece \
--inputs $DATA/train.${SRC}-${TGT}.${SRC} $DATA/train.${SRC}-${TGT}.${TGT} \
--outputs $DATA/train.bpe.${SRC}-${TGT}.${SRC} $DATA/train.bpe.${SRC}-${TGT}.${TGT} \
--min-len $TRAIN_MINLEN --max-len $TRAIN_MAXLEN
done
echo "encoding valid with learned BPE..."
for ((i=0;i<${#SRCS[@]};++i)); do
SRC=${SRCS[i]}
VALID_SET=(${VALID_SETS[i]})
for ((j=0;j<${#VALID_SET[@]};++j)); do
python "$SPM_ENCODE" \
--model "$DATA/sentencepiece.bpe.model" \
--output_format=piece \
--inputs $DATA/valid${j}.${SRC}-${TGT}.${SRC} $DATA/valid${j}.${SRC}-${TGT}.${TGT} \
--outputs $DATA/valid${j}.bpe.${SRC}-${TGT}.${SRC} $DATA/valid${j}.bpe.${SRC}-${TGT}.${TGT}
done
done