forked from Ensembl/ensembl-production-metazoa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mz_generic.sh
executable file
·428 lines (327 loc) · 14.3 KB
/
mz_generic.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#!/usr/bin/env bash
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Positional arguments:
#   $1 -- meta/config path of the species to process, or one of the keywords
#         "help" / "env_setup_only"
#   $2 -- optional special action (see the usage text below)
#   $3 -- optional argument of the special action (e.g. the "restore" pattern)
SPEC_PATH="$1"
SPECIAL_ACTION="$2"
SPECIAL_ACTION_ARG="$3"

# calling the script without any argument is treated as a request for help
if [[ -z "$SPEC_PATH" ]]; then
  SPEC_PATH="help"
fi

# "help" is accepted both as the first and as the second argument
if [[ "${SPECIAL_ACTION}" == "help" || "$SPEC_PATH" == "help" ]]; then
  SPECIAL_ACTION="help"
  {
    # double quotes here, so that $0 expands to the name of the script
    echo "usage: $0 meta/config/path|help|env_setup_only [optional args] "
    echo ' N.B. "help" and "|env_setup_only" can be used as optional argumets. see below'
    echo ' optional arguments:'
    echo ' help -- print help and exit'
    echo ' env_setup_only -- prepare environment and exit'
    echo ' restore [pattern] -- restore from the back up matching "pattern", removing later backups; or restore from the most recent one, if no "pattern" is given '
    echo ' stop_after_conf -- get data, validate, prepare cofiguration for the loader and stop'
    echo ' stop_after_load -- stop after the loader pipeline before running anything else'
    echo ' stop_before_xref -- stop before running xref helper'
    echo ' pre_final_dc -- run datachecks before creating final dump'
    echo ' finalise -- create "final" backup'
    echo ' patch_schema -- patch schema to the latest available'
  } >&2
  exit 0
fi
# From this point on: stop at the first failing command (errexit) and
# print every command before it is executed (xtrace).
set -o errexit -o xtrace

# "env_setup_only" may be given either as the first or as the second
# argument; both spellings end up in SPECIAL_ACTION.
if [[ "${SPECIAL_ACTION}" == "env_setup_only" || "$SPEC_PATH" == "env_setup_only" ]]; then
  SPECIAL_ACTION="env_setup_only"
fi
# load _mz.conf
# Lookup order (the first existing file is sourced):
#   1. the file named by $MZ_CONFIG
#   2. ./_mz.conf
#   3. ./ensembl-production-metazoa-private/conf/_mz.conf
# When none of them exists nothing is sourced and MZ_CONFIG keeps its value.
# The path is quoted when sourced, so a location containing whitespace
# is handled the same way the preceding "-f" test sees it.
if [[ -f "$MZ_CONFIG" ]]; then
  source "$MZ_CONFIG"
elif [[ -f "$(pwd)/_mz.conf" ]]; then
  MZ_CONFIG="$(pwd)/_mz.conf"
  source "$MZ_CONFIG"
elif [[ -f "$(pwd)/ensembl-production-metazoa-private/conf/_mz.conf" ]]; then
  MZ_CONFIG="$(pwd)/ensembl-production-metazoa-private/conf/_mz.conf"
  source "$MZ_CONFIG"
fi
# ENS_VERSION and MZ_RELEASE number
# Built-in fallbacks, applied only when the value is unset or empty
# (i.e. neither the environment nor _mz.conf provided one).
: "${MZ_RELEASE:=51}"
: "${ENS_VERSION:=104}"
# TODO: create a param or a config to load user specific options from
# SCRIPTS defaults to the current directory, WD to "data" inside the current
# directory; the working directory is created if it does not exist yet.
: "${SCRIPTS:=$(pwd)}"
: "${WD:=$(pwd)/data}"
mkdir -p "$WD"
# db server alias
# CMD is mandatory. The message below is single-quoted on purpose: it names
# the missing variable ("$CMD") rather than printing its (empty) value.
if [[ -z "$CMD" ]]; then
  echo 'no db server alias "$CMD" is provided' >> /dev/stderr
  exit 1
fi

# CMD_W falls back to the CMD alias with a "-w" suffix
# (presumably the read-write counterpart of CMD -- confirm)
: "${CMD_W:=${CMD}-w}"
# should we create a local copy of _mz.conf and fill / update it?

# Helper library. The step functions called further down are not defined in
# this file; presumably they all come from lib.sh.
MZ_SCRIPTS=${SCRIPTS}/ensembl-production-metazoa/scripts
source "${MZ_SCRIPTS}/lib.sh"

if [[ "${SPECIAL_ACTION}" == "env_setup_only" ]]; then
  # env_setup_only -- no meta needed
  echo 'not using meta information. "env_setup_only" passed...' >> /dev/stderr
else
  # picking META_FILE_RAW
  # (SPEC_PATH is quoted everywhere, so paths with blanks or glob characters
  # are passed to basename / dirname / realpath as a single argument)
  SPEC_SHORT="$(basename "${SPEC_PATH}")"
  spec_dir="$(dirname "${SPEC_PATH}")"
  abs_dir="$(dirname "$(realpath "${SPEC_PATH}")")"
  if [[ "$spec_dir" == "$abs_dir" ]]; then
    # the directory part is already the resolved absolute one: use as given
    META_FILE_RAW=${SPEC_PATH}
  else # relative paths
    if [[ "$spec_dir" == "." ]]; then
      # bare file name: look it up in the meta dir of the current ENS_VERSION
      META_FILE_RAW=$MZ_SCRIPTS/../meta/$ENS_VERSION/$SPEC_SHORT
    else # use current dir as base
      META_FILE_RAW=$(pwd)/$SPEC_PATH
    fi
  fi
fi # ! env_setup_only
# prepare / source ensembl.prod.${ENS_VERSION}
#   ${SCRIPTS}/ensembl.prod.${ENS_VERSION}
# The preparation runs under flock on a per-version lock file, so concurrent
# runs sharing the same ${SCRIPTS} dir do not set the environment up at the
# same time. get_ensembl_prod is not defined in this file (lib.sh is sourced
# inside the locked shell to provide it).
flock ${SCRIPTS}/ensembl.prod.${ENS_VERSION}.lock -c "
  source ${MZ_SCRIPTS}/lib.sh;
  get_ensembl_prod ${SCRIPTS}/ensembl.prod.${ENS_VERSION} $ENS_VERSION \
    $MZ_SCRIPTS/checkout_ensembl.20210208.sh $MZ_SCRIPTS/legacy/create_setup_script.sh
"

# a _FAILED marker in the environment dir means the setup did not succeed
# (presumably left there by get_ensembl_prod -- confirm in lib.sh)
if [[ -f "${SCRIPTS}/ensembl.prod.${ENS_VERSION}/_FAILED" ]]; then
  echo "setting up env failed... exiting" >> /dev/stderr
  exit 2
fi

# or copy
source "${SCRIPTS}/ensembl.prod.${ENS_VERSION}/setup.sh"

# Production db scripts live in the "ensembl-production" checkout below
# ENSEMBL_ROOT_DIR. (The former value, "ensembl.prod.${ENS_VERSION}uction",
# was a mangled search/replace of that name and pointed to a non-existent dir.)
export PROD_DB_SCRIPTS="${ENSEMBL_ROOT_DIR}/ensembl-production/scripts/production_database"
echo 'ENSEMBL_ROOT_DIR='"${ENSEMBL_ROOT_DIR}" >> /dev/stderr

# exit after env setup, if you wish to
if [[ "${SPECIAL_ACTION}" == "env_setup_only" ]]; then
  echo environment is set up. exiting... >> /dev/stderr
  exit 0
fi
# prepared data dir
# Per-species working area: $WD/<basename of the meta/config path>.
# populate_dirs is not defined in this file (presumably it comes from lib.sh
# and creates the directory layout used below -- TODO confirm).
export DATA_DIR=$WD/$SPEC_SHORT
populate_dirs $DATA_DIR
# exported, so child processes can see them as well
export DONE_TAGS_DIR=$DATA_DIR/done
export PIPELINE_OUT_DIR=$DATA_DIR/data/pipeline_out
echo DATA_DIR="'${DATA_DIR}'" >> /dev/stderr
# get data
# the assembly location is the ASM_URL entry of the raw meta file;
# ASM_DIR is expected below $DATA_DIR/data/raw after get_asm_ftp
# (presumably get_asm_ftp creates the "asm" subdir -- confirm)
ASM_URL=$(get_meta_conf $META_FILE_RAW ASM_URL)
get_asm_ftp "$ASM_URL" "$DATA_DIR/data/raw"
export ASM_DIR=$DATA_DIR/data/raw/asm
# get adhoc data
get_individual_files_to_asm $ASM_DIR $META_FILE_RAW
# run data preprocessing commands (DATA_INIT meta tag)
run_data_init $META_FILE_RAW $ASM_DIR
# get metadata from gbff
# the processed meta file used by the later steps is $DATA_DIR/metadata/meta
# (presumably written by prepare_metada -- confirm)
prepare_metada $META_FILE_RAW $ASM_DIR $ENSEMBL_ROOT_DIR $DATA_DIR/metadata
export META_FILE=$DATA_DIR/metadata/meta
# Optional early stop once the loader configuration is in place.
# A STOP_AFTER_CONF value that is already set (environment / _mz.conf) wins;
# otherwise the raw meta file is consulted.
if [[ -z "${STOP_AFTER_CONF}" ]]; then
  STOP_AFTER_CONF=$(get_meta_conf $META_FILE_RAW 'STOP_AFTER_CONF')
fi

# stop when STOP_AFTER_CONF is anything but empty / "NO",
# or when the "stop_after_conf" special action was requested
if [[ ( -n "${STOP_AFTER_CONF}" && "${STOP_AFTER_CONF}" != "NO" ) \
      || "${SPECIAL_ACTION}" == "stop_after_conf" ]]; then
  echo 'stopping after config generation (STOP_AFTER_CONF). see stats...' >> /dev/stderr
  exit 0
fi
# load using new-genome-loader
run_new_loader $CMD_W $MZ_RELEASE $DATA_DIR/metadata $ENSEMBL_ROOT_DIR \
$DATA_DIR/data/pipeline_out/new_load nopfx2
# DBNAME: name of the first directory matching new_load/*_core_*
# (find keeps the glob matches that are directories, head takes the first
# one, perl strips everything up to the last "/")
DBNAME=$(find $DATA_DIR/data/pipeline_out/new_load/*_core_* -maxdepth 0 -type d | grep -F _core_ | head -n 1 | perl -pe 's,^.*/,,')
# production and scientific names, read from $META_FILE via get_meta_str
SPECIES=$(get_meta_str $META_FILE "species.production_name")
SPECIES_SCI=$(get_meta_str $META_FILE "species.scientific_name")
# scientific name with every run of blanks / underscores turned into a single "_"
SPECIES_SCI_=$(echo $SPECIES_SCI | perl -pe 's/[ _]+/_/g')
echo "using DBNAME $DBNAME" >> /dev/stderr
echo "using SPECIES $SPECIES" >> /dev/stderr
echo "using SPECIES_SCI_ $SPECIES_SCI_" >> /dev/stderr
# backup_relink is called after every stage with a stage tag ("new_loader"
# here) and the backup dir; presumably it snapshots the db -- see lib.sh
backup_relink $DBNAME $CMD new_loader $DATA_DIR/bup
# "restore" special action: restore the db from a backup in $DATA_DIR/bup
# (the optional third script argument is passed on as the backup tag)
# and stop; nothing else of the pipeline is run.
if [[ "${SPECIAL_ACTION}" == "restore" ]]; then
  echo "!!! RESTORING DB (tag: '${SPECIAL_ACTION_ARG}')!!!" >> /dev/stderr
  restore $DBNAME $CMD_W $DATA_DIR/bup "$SPECIAL_ACTION_ARG"
  echo ok >> /dev/stderr
  exit 0
fi
# fill meta
fill_meta $CMD_W $DBNAME $META_FILE $DATA_DIR/data/pipeline_out/fill_meta
backup_relink $DBNAME $CMD with_meta $DATA_DIR/bup

# mark trans_spliced transcripts
# (only when the raw meta file has a TR_TRANS_SPLICED entry)
TR_TRANS_SPLICED="$(get_meta_conf $META_FILE_RAW 'TR_TRANS_SPLICED')"
if [[ -n "$TR_TRANS_SPLICED" ]]; then
  mark_tr_trans_spliced $CMD_W $DBNAME "$TR_TRANS_SPLICED"
  backup_relink $DBNAME $CMD tr_spliced_marks $DATA_DIR/bup
fi

# initial test, use "stop_after_load" as the second arg for the first run, if not sure
# runs the core stats and the datachecks with an "_initial" tag, then stops
if [[ "${SPECIAL_ACTION}" == "stop_after_load" ]]; then
  run_core_stats_new $CMD_W $DBNAME $SPECIES $DATA_DIR/data/pipeline_out/core_stats _initial
  run_dc $CMD_W $DBNAME $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/dc _initial
  exit 0
fi
# GFF_FILE (from the processed meta file) tells the later stages whether
# there is a gene annotation to work with
GFF_FILE=$(get_meta_conf $META_FILE 'GFF_FILE')

# getting data from repbase to upcast species name
# default RepBase species name: the configured one, or the scientific name
# with runs of blanks / underscores replaced by a single "_"
REPBASE_SPECIES_NAME="$(get_meta_conf $META_FILE REPBASE_SPECIES_NAME_RAW)"
if [[ -z "$REPBASE_SPECIES_NAME" ]]; then
  REPBASE_SPECIES_NAME=$(echo "$SPECIES_SCI" | perl -pe 's/[ _]+/_/g')
fi

REPBASE_OUT_DIR=$DATA_DIR/data/pipeline_out/repeat_lib/repbase
get_repbase_lib "$SPECIES_SCI" $CMD $DBNAME $REPBASE_OUT_DIR
REPBASE_FILE_NEW=$REPBASE_OUT_DIR/repbase.lib

# Name upcast is on unless DISABLE_REPBASE_NAME_UPCAST is "YES" or "1".
# (The previous test or-ed the two "!=" comparisons, which is always true,
# so the option was silently ignored and the else branch could never run.)
DISABLE_REPBASE_NAME_UPCAST=$(get_meta_conf $META_FILE_RAW DISABLE_REPBASE_NAME_UPCAST)
if [[ "${DISABLE_REPBASE_NAME_UPCAST}" != "YES" && "${DISABLE_REPBASE_NAME_UPCAST}" != "1" ]]; then
  # upcast: take the name stored next to the fetched RepBase library
  REPBASE_SPECIES_NAME=$(cat "$REPBASE_OUT_DIR/_repbase_species_name")
else
  # upcast disabled: keep our name, and do not use the fetched library
  # when it was produced for a different species name
  REPBASE_SPECIES_NAME_NEW=$(cat "$REPBASE_OUT_DIR/_repbase_species_name")
  if [[ "$REPBASE_SPECIES_NAME" != "$REPBASE_SPECIES_NAME_NEW" ]]; then
    REPBASE_FILE_NEW=
  fi
fi
# building repeat libraries
# REP_LIB from the raw meta file is used as is. Otherwise the library is
# either REP_LIB_RAW or a freshly constructed one, and then (optionally)
# filtered against the annotated CDS / peptides.
REP_LIB="$(get_meta_conf $META_FILE_RAW REP_LIB)"
if [[ -z "$REP_LIB" ]]; then
  # try to get raw lib
  REP_LIB="$(get_meta_conf $META_FILE_RAW REP_LIB_RAW)"
  if [[ -z "$REP_LIB" ]]; then
    # repeats harvesting
    construct_repeat_libraries $CMD_W $DBNAME $SPECIES \
      $DATA_DIR/data/pipeline_out/repeat_lib \
      "$(get_meta_conf $META_FILE_RAW REPEAT_MODELER_OPTIONS)"
    REP_LIB="$DATA_DIR/data/pipeline_out/repeat_lib/${SPECIES}.rm.lib"
    # nonref_unset_toplevel $CMD_W $DBNAME
  fi

  # repeats filtering stage
  REPBASE_FILTER="$(get_meta_conf $META_FILE_RAW REPBASE_FILTER)"
  REPBASE_FILE="$(get_meta_conf $META_FILE_RAW REPBASE_FILE)"
  # no gene models -- nothing to filter against
  if [[ -z "$GFF_FILE" ]]; then
    REPBASE_FILTER='NO'
  fi

  # get RepBase from repeat masker if nothing specified
  if [[ "$REPBASE_FILTER" != "NO" ]]; then
    if [[ -z "$REPBASE_FILE" ]]; then
      REPBASE_FILE="$REPBASE_FILE_NEW"
    fi
    if [[ -n "$REPBASE_FILE" && -f "$REPBASE_FILE" ]]; then
      # dump transcripts and translations
      DUMP_TR_DIR=$DATA_DIR/data/pipeline_out/repeat_lib/tr_tr
      dump_translations $CMD $DBNAME $DUMP_TR_DIR \
        ${SCRIPTS}/ensembl.prod.${ENS_VERSION} \
        "cds" # dump CDS. I.e. UTRs having repeats are valid models
      RM_CLEAN_RNA_FILE=${DUMP_TR_DIR}/tr.fna
      RM_CLEAN_PEP_FILE=${DUMP_TR_DIR}/pep.faa
      # grep -c exits with 1 when it counts zero lines; under errexit that
      # would kill the script before the check below. Accept status 1 (count
      # is "0"), still abort on a real grep error (status 2, e.g. no file).
      rm_clean_peps=$(grep -c '>' "$RM_CLEAN_PEP_FILE") || [[ $? -eq 1 ]]
      if [ "$rm_clean_peps" -gt "0" ]; then
        filter_repeat_library $REP_LIB \
          $REPBASE_FILE \
          $RM_CLEAN_PEP_FILE \
          $RM_CLEAN_RNA_FILE \
          "${SPECIES}.rm.filtered" \
          $DATA_DIR/data/pipeline_out/repeat_lib/filter
        REP_LIB="$DATA_DIR/data/pipeline_out/repeat_lib/filter/${SPECIES}.rm.filtered"
      fi # rm_clean_peps
    fi # -n REPBASE_FILE
  fi # REPBASE_FILTER != NO
fi # -z REPLIB
# checking rep lib size
# An empty library (no ">" header lines) is fatal, unless the raw meta file
# sets IGNORE_EMPTY_REP_LIB to something other than "NO" / "0"; in that case
# REP_LIB is replaced by the literal "NO".
if [[ -n "$REP_LIB" && "$REP_LIB" != "NO" ]]; then
  # grep -c prints "0" but exits with 1 when nothing matches; under errexit
  # the plain assignment would abort the script right here and the handling
  # below would never run. Status 1 is therefore accepted, anything else
  # (e.g. 2 -- unreadable file) still aborts.
  rep_lib_size=$(grep -c '>' "$REP_LIB") || [[ $? -eq 1 ]]
  if [ "$rep_lib_size" -lt "1" ]; then
    echo "empty repeat library ${REP_LIB}" >> /dev/stderr
    IGNORE_EMPTY_REP_LIB=$(get_meta_conf $META_FILE_RAW IGNORE_EMPTY_REP_LIB)
    if [[ -n "$IGNORE_EMPTY_REP_LIB" \
          && "$IGNORE_EMPTY_REP_LIB" != "NO" \
          && "$IGNORE_EMPTY_REP_LIB" != "0" ]]; then
      REP_LIB="NO"
      echo " ignoring. (IGNORE_EMPTY_REP_LIB=${IGNORE_EMPTY_REP_LIB})" >> /dev/stderr
    else
      echo " failing. set IGNORE_EMPTY_REP_LIB=1 to ignore" >> /dev/stderr
      fail
    fi
  fi
fi
# repeat masking
# REP_LIB is passed through as is: a library path, an empty string, or the
# literal "NO" (the value assigned when an empty library is ignored).
# The raw meta file is handed over as the last argument
# (presumably for further per-species options -- see run_repeat_masking).
run_repeat_masking $CMD_W $DBNAME $SPECIES "$REP_LIB" $DATA_DIR/data/pipeline_out/dna_features "$REPBASE_SPECIES_NAME" "$META_FILE_RAW"
# nonref_unset_toplevel $CMD_W $DBNAME
backup_relink $DBNAME $CMD repeat_masking $DATA_DIR/bup
# Everything in this section needs gene models, i.e. a non-empty GFF_FILE.
# Each RUN_* switch from the raw meta file enables its step unless it is
# set to "NO"; a backup is taken after every step that ran.
if [[ -n "$GFF_FILE" ]]; then
  # RNA features
  RUN_RNA_FEATURES="$(get_meta_conf $META_FILE_RAW RUN_RNA_FEATURES)"
  if [[ "$RUN_RNA_FEATURES" != "NO" ]]; then
    run_rna_features $CMD_W $DBNAME $SPECIES $ENSEMBL_ROOT_DIR \
      $DATA_DIR/data/pipeline_out/rna_features '_opt' "$(get_meta_conf $META_FILE_RAW RNA_FEAT_PARAMS)"
    backup_relink $DBNAME $CMD rna_features $DATA_DIR/bup
  fi

  # RNA genes
  # NOTE(review): the output dir is the same "rna_features" dir as above --
  # confirm that this is intended
  RUN_RNA_GENES=$(get_meta_conf $META_FILE_RAW RUN_RNA_GENES)
  if [[ "$RUN_RNA_GENES" != "NO" ]]; then
    run_rna_genes $CMD_W $DBNAME $SPECIES $ENSEMBL_ROOT_DIR \
      $DATA_DIR/data/pipeline_out/rna_features '_opt' "$(get_meta_conf $META_FILE_RAW RNA_GENE_PARAMS)"
    backup_relink $DBNAME $CMD rna_genes $DATA_DIR/bup
  fi

  if [[ "${SPECIAL_ACTION}" == "stop_before_xref" ]]; then
    echo "stopping before run_xref..." >> /dev/stderr
    exit 0
  fi

  # run xref pipelines
  RUN_XREF="$(get_meta_conf $META_FILE_RAW RUN_XREF)"
  if [[ "$RUN_XREF" != "NO" ]]; then
    run_xref $CMD_W $DBNAME $SPECIES $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/xrefs/all "$(get_meta_conf $META_FILE_RAW XREF_PARAMS)"
    backup_relink $DBNAME $CMD run_xref $DATA_DIR/bup
  fi

  # fix gene and transcript stable ids, update xrefs name
  # (opt-in: runs only when UPDATE_STABLE_IDS is set and is not "NO")
  UPDATE_STABLE_IDS="$(get_meta_conf $META_FILE_RAW 'UPDATE_STABLE_IDS')"
  UPDATE_STABLE_IDS_OPTIONS="$(get_meta_conf $META_FILE_RAW 'UPDATE_STABLE_IDS_OPTIONS')"
  if [[ -n "$UPDATE_STABLE_IDS" && "$UPDATE_STABLE_IDS" != "NO" ]]; then
    PREV_XREF_FILE="$DATA_DIR/data/pipeline_out/xrefs/all/prev_xrefs/${DBNAME}.ids_xref.txt"
    update_stable_ids $CMD_W $DBNAME $PREV_XREF_FILE "${UPDATE_STABLE_IDS_OPTIONS}" $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/update_stable_ids
    backup_relink $DBNAME $CMD update_stable_ids $DATA_DIR/bup
  fi

  # description projection
  # we can project descriptions only if we have compara data -- thus skipping

  # samples are set here only when the meta file has no sample.location_param
  if [[ -z "$(get_meta_str $META_FILE 'sample.location_param')" ]]; then
    SAMPLE_GENE="$(get_meta_conf $META_FILE 'SAMPLE_GENE')"
    set_core_random_samples $CMD_W $DBNAME "$SAMPLE_GENE" $ENSEMBL_ROOT_DIR
    backup_relink $DBNAME $CMD random_samples $DATA_DIR/bup
  fi
fi # "GFF_FILE"
# core stats
# (run for every species, with or without a GFF_FILE)
run_core_stats_new $CMD_W $DBNAME $SPECIES $DATA_DIR/data/pipeline_out/core_stats
backup_relink $DBNAME $CMD core_stats $DATA_DIR/bup
# run_dc $CMD_W $DBNAME $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/dc _pre_prod_sync
# update the production tables; the backup taken afterwards is tagged "prodsync_new"
update_prod_tables_new $CMD_W $DBNAME $SPECIES $DATA_DIR/data/pipeline_out/update_prod_tables
backup_relink $DBNAME $CMD prodsync_new $DATA_DIR/bup
# Optional closing steps, selected by the special action (second argument).
# Any other value (or none) falls through without doing anything.
case "${SPECIAL_ACTION}" in
  pre_final_dc)
    # datachecks before the final dump
    run_dc $CMD_W $DBNAME $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/dc _pre_final
    ;;
  finalise)
    # create the backup tagged "final"
    backup_relink $DBNAME $CMD final $DATA_DIR/bup
    echo done >> /dev/stderr
    ;;
  patch_schema)
    # NOTE(review): the message says "not backing them up", yet a
    # "patched_schema" backup is taken right after the patching -- confirm
    echo running additional steps. not backing them up >> /dev/stderr
    patch_db_schema $CMD_W $DBNAME \
      $ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/patch_schema
    backup_relink $DBNAME $CMD patched_schema $DATA_DIR/bup
    echo schema patched >> /dev/stderr
    ;;
esac
# additional stuff
# NOTE: the script always terminates at the `exit 0` below; every statement
# after it is unreachable (dead code) and never executed.
exit 0
# use conf instead???
# exit 0
# add metakey
# (unreachable) production tables update and datachecks with a "_fin" tag
DBNAME_FIN=$DBNAME
update_prod_tables_new $CMD_W $DBNAME_FIN $SPECIES \
$DATA_DIR/data/pipeline_out/update_prod_tables_fin _fin
run_dc $CMD_W $DBNAME_FIN \
$ENSEMBL_ROOT_DIR $DATA_DIR/data/pipeline_out/dc _fin
echo done additional >> /dev/stderr