-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.nf
432 lines (328 loc) · 14.8 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
#!/usr/bin/env nextflow
/* -----------------------------------------------
Auxilliary workflow to prepare database files used by SWAAT.
Author: "Houcemeddine Othman"
Credits: "Wits University H3Africa/GSK ADME collaboration"
Maintainer: "Houcemeddine Othman"
Email: "[email protected]"
-------------------------------------------------*/
log.info """
@@@###############################################################################################@@
*
(######## .### # ### */ ,** ( ################
####( #### ###. ,## (### / #(*.# , ### ################
###. #### ##### #### ( ** ##### .###
.###### #### /##,### ### . ( . * .., ###.### ###(
(######### ### ### ###/### ( , . .* /### #### *###
/#### #####, ###### ,**, ########### ###,
, ,### #### .### ( ,* , /(#, * ,, * ############## /###
############# ##* ## .( * .* ### ### ###.
###### # , * * #### (### (###
STRUCTURAL WORKFLOW FOR ANALYZING ADME TARGETS
"""
def helpMessage() {
log.info"""
Usage:
The typical command for running SWAAT:
nextflow run main.nf --dbhome /home/to/database --vcfhome /path/to/vcf/dir --outfolder output_folder --genelist ADME_genes_to_annotate.txt
Arguments:
--dbhome [folder] Path to database containing the dependency files for annotating the variants (Default False)
--vcfhome [folder] Path to folder containing VCF files split by annotated gene (e.g. CYP2D6.vcf) (Default False)
--outfolder [str] Where to output the plain text and the HTML report (Default: false)
--genelist [file] User can limit the annotation to the list of genes contained in a this text file (one line per gene) (Default False)
Other
--foldxexe [str] Specifies the name of the executable of FoldX software (Default foldx)
--encomexe [str] Specifies the name of the executable of build_encom (Default build_encom)
--freesasaexe [str] Specifies the name of the executable of freesasa software (Default freesasa)
--strideexe [str] Specifies the name of the executable of stride software (Default freesasa)
--rotabase [abs path] Path to rotabase.txt (Default PATH to foldx)
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
// home to script file
params.SCRIPTHOME = launchDir+"/scripts"
// home to PDBs
params.PDBFILESPATH = "$params.dbhome/PDBs/"
// Home to folder containing Blosum62, Grantham and Sneath matrices
params.MATRICES="$params.dbhome/matrices/"
// Path to the pickle file for Random forest prediction
params.RFMLM=launchDir+"/ML4swaat/swaat_rf.ML"
/*------------------------------------------------------------------------------------------
The following bloc assigns default executable names to
FoldX, build_encom, freesasa and stride.
If ijnstalled under other names, they need to be changed from the CLI or by modifying
the default values in main.nf
-------------------------------------------------------------------------------------------*/
params.foldxexe = "foldx"
params.encomexe = "build_encom"
params.freesasaexe = "freesasa"
params.strideexe = "stride"
/*--------------------------------------------------------------------------------------------
The following bloc check if essential paramters specified in options
are valid
/*------------------------------------------------------------------------------------------*/
// test if the gene file list exists
def testFile = new File(params.genelist)
if (!testFile.exists()) {
log.info "file ${params.genelist} does not exist "
exit 1 }
else {
log.info "~~~~~Gene list file exists"
}
// test if the dbhome exist
def testFolderDb = new File(params.dbhome)
if (!testFolderDb.exists()) {
log.info "file ${params.dbhome} does not exist "
exit 1 }
else {
log.info "~~~~~Path to database exists"
}
// generate a channel from the gene list and replace "\n"
genes = Channel.fromPath("$params.genelist").splitText() { it.replaceAll("\n", "") }
.ifEmpty { error "Cannot find genes in gene list" }
// check if VCF files exist
Channel.fromPath("${params.vcfhome}"+"/*.vcf")
.ifEmpty { error "Cannot find files with extension .vcf in ${params.vcfhome}" }
/*--------------------------------------------------------------------------------------------
The following bloc will assess if freesasa, build_sncom, stide and foldx are in the PATH
are valid
/*------------------------------------------------------------------------------------------*/
def checkToolInPath(tool)
{
Runtime myrt = Runtime.getRuntime();
Process myproc = myrt.exec("which "+tool)
int exitVal = myproc.waitFor()
if (exitVal != 0) {
log.info tool+" not in PATH"
exit 1
}
}
checkToolInPath(params.freesasaexe)
checkToolInPath(params.encomexe)
checkToolInPath(params.foldxexe)
checkToolInPath(params.strideexe)
/*--------------------------------------------------------------------------------------------
The following bloc will set params.rotabase to the same path as foldx if it is not
specified in CLI
/*------------------------------------------------------------------------------------------*/
if (!params.rotabase) {
log.info "~~~~~Setting the path to rotabase.txt from foldx location"
foldx_full_path = "which ${params.foldxexe}".execute().text
params.rotabase = "dirname ${foldx_full_path}".execute().text.replaceAll("\n", "")+"/rotabase.txt"
}
/*--------------------------------------------------------------------------------------------
SWAAT starts to run here
/*------------------------------------------------------------------------------------------*/
log.info """
=========================================
Path to VCFs : ${params.vcfhome}
Path to gene_list : ${params.genelist}
Path to database : ${params.dbhome}
PDBs : ${params.PDBFILESPATH}
Matrices : ${params.MATRICES}
Predictive ML : ${params.RFMLM}
FoldX Rotabase : ${params.rotabase}
========================================
"""
process generateSwaatInput {
publishDir "${params.outfolder}/$gene", mode:'copy'
input:
val(gene) from genes.flatMap()
output:
file "${gene}_var2prot.csv" into var2aa_report
file "${gene}.swaat" into swaat_input
script:
gene_output_dir = file("${params.outfolder}/$gene") // create subdirectories (gene names) in the output directory
gene_output_dir.mkdir()
"""
vcf4gene=\$(ls ${params.vcfhome}/${gene}.vcf)
map4gene=\$(ls ${params.dbhome}/maps/${gene}.tsv)
# gnerate missense variants report and swaat input
python ${params.SCRIPTHOME}/ParseVCF.py --vcf \$vcf4gene \
--map \$map4gene \
--output ${gene}_var2prot.csv \
--swaat ${gene}.swaat
"""
}
process filterVars {
//echo true
/* execution error is ignored if no variant is in the range of the
exons or, it is not covered by the protein structure. or
no *not_processed.tsv is generated
*/
input:
file(variants) from swaat_input
output:
file("${name}_retained.tsv") optional true into (retained_vars, retained)
file("${name}_not_processed.tsv") optional true into gaps_and_non_processed
//file("${name}_pointer.tsv") optional true into pointerfile
val name into bigreceiver
publishDir "${params.outfolder}/$name", mode:'copy'
script:
name = variants.baseName.replaceFirst(".swaat","")
"""
#!/usr/bin/env python
import glob
print("$name")
with open("$variants", "r") as input:
lines=input.readlines()
for datafile in glob.glob("${params.dbhome}/uniprot2PDBmap" + "/*.tsv"):
# supposes that the variant file contains only one gene
gene_name_in_vars = lines[0].split()[0]
with open(datafile, "r") as mydatafile:
datalines=mydatafile.readlines()
gene_name_in_map=datalines[0].split()[1]
if gene_name_in_map == gene_name_in_vars:
covered_residues=[line.split()[2] for line in datalines[2:]]
print(covered_residues)
try:
info_line = datalines[0].split()
fasta = info_line[1]+".fasta"
pdb = info_line[4]
with open("${name}_pointer.txt", "w") as pointerfile:
pointerfile.writelines(fasta+"\t"+pdb)
except:
pass
for var in lines:
print(var)
if (var.split()[2] in covered_residues) and (var.split()[3] != "_") :
with open("${name}_retained.tsv", "a+") as processed_var:
processed_var.writelines(var)
elif var.split()[3] == "_" :
with open("${name}_not_processed.tsv", "a+") as not_processed:
not_processed.writelines(var)
else:
with open("${name}_not_processed.tsv", "a+") as not_processed:
not_processed.writelines(var)
"""
}
vars = retained_vars.splitText() { it.replaceAll("\n", "") }
vars.into { vars4foldx; vars4encom ; vars4suffix }
process generateGuidingFile {
/* --------------------------------------------------------------------------------------
This process generates the guiding file (which sequence and which PDB for the mutation)
----------------------------------------------------------------------------------------*/
//echo true
input:
val variant from vars4suffix.flatMap()
output:
file "*.id" into id
file "*_whichPDB.tsv" into guidingFile
"""
id=\$(echo $variant |sed 's/ /-/g')
echo $variant > \$id.id
gene=\$(echo $variant| awk {'print \$1'} )
filename=\$(echo \${gene}_refseq.fa )
echo \$filename >mygene
python ${params.SCRIPTHOME}/whichPdb.py --fasta ${params.dbhome}/sequences//Refseq/\$filename \
--pdbpath ${params.PDBFILESPATH} --output \${id}_whichPDB.tsv
"""
}
process calculateStructuralFeatures {
/*----------------------------------------------------------------------------
The process calculates structural features using FoldX, ENCoM, stride
and assigns annotation from the database using parseOutput.py
-----------------------------------------------------------------------------*/
errorStrategy 'ignore'
echo true
cpus 2
input:
val variant from vars4foldx.flatMap()
file(my_id) from id
output:
file "${the_id}_swaat.csv" into calculated_parameters
script:
the_id = my_id.baseName.replaceFirst(".id","")
"""
echo $variant >seq_coord_${the_id}.txt
python ${params.SCRIPTHOME}/processFoldX.py --var $my_id \
--seq2chain ${params.dbhome}/Seq2Chain \
--output ${the_id} \
--map ${params.dbhome}/uniprot2PDBmap
mutation_suffix=\$(sed 's/;//' individual_list_${the_id}.txt |sed 's/,//')
pdbfile=\$(cat ${the_id}_pdb.txt )
Uniprot=\$(basename \$pdbfile .pdb)
touch \$Uniprot.pointer
ln -s ${params.PDBFILESPATH}/\$pdbfile
ln -s ${params.rotabase}
${params.foldxexe} --command=BuildModel --pdb=\$pdbfile --mutant-file=individual_list_${the_id}.txt >/dev/null
mv Dif_*.fxout ${the_id}_\${mutation_suffix}_suffixed.fxout
mv WT_*.pdb WT_${the_id}_repaired.pdb
mv *_1.pdb ${the_id}_mutant.pdb
${params.encomexe} -i ${the_id}_mutant.pdb -cov ${the_id}.cov -o ${the_id}.eigen >/dev/null
${params.freesasaexe} -n 200 ${the_id}_mutant.pdb >${the_id}_mut.sasa
${params.freesasaexe} -n 200 WT_${the_id}_repaired.pdb >${the_id}_wt.sasa
${params.freesasaexe} --format seq -n 200 WT_${the_id}_repaired.pdb >${the_id}_perAA.sasa
${params.freesasaexe} --format seq -n 200 ${the_id}_mutant.pdb >${the_id}_perAA_mut.sasa
${params.strideexe} -f ${the_id}_mutant.pdb -h >Hbond_${the_id}_mut.dat
${params.strideexe} -f WT_${the_id}_repaired.pdb -h >Hbond_${the_id}_wt.dat
gene_names=\$(awk {'print \$1'} $my_id)
coor_in_ref=\$(awk {'print \$1'} seq_coord_${the_id}.txt)
ID=\$(basename *.pointer .pointer )
eigefile=\$(echo \$ID.eige)
mapfile=\$(ls ${params.dbhome}/uniprot2PDBmap/\${gene_names}_*.tsv)
python ${params.SCRIPTHOME}/parseOutput.py --diff ${the_id}*.fxout \
--matrix ${params.MATRICES}/blosum62.txt \
--strideWT Hbond_${the_id}_mut.dat \
--strideMut Hbond_${the_id}_mut.dat \
--freesasaMut ${the_id}_mut.sasa \
--sneath ${params.MATRICES}/sneath.txt \
--grantham ${params.MATRICES}/grantham.txt \
--freesasaWT ${the_id}_wt.sasa \
--aasasa ${the_id}_perAA.sasa \
--aasasamut ${the_id}_perAA_mut.sasa \
--indiv individual_list_${the_id}.txt \
--pdbMut ${the_id}_mutant.pdb \
--pdbWT WT_${the_id}_repaired.pdb \
--modesMut ${the_id}.eigen \
--modesWT ${params.dbhome}/ENCoM/\$eigefile \
--pssm ${params.dbhome}/PSSMs/\$gene_names.pssm \
--genename \$gene_names \
--output ${the_id}_swaat.csv \
--map \$mapfile
"""
}
data_vars = calculated_parameters.collectFile(name: "./swaatall.csv" , newLine: false, skip: 1, keepHeader: true)
process predictVarEfectMl {
/*----------------------------------------------------------------------------
Predicts the effect of amino acid substitution with a random forset model
Output: 1 (Has effect), 0 (Neutral)
-----------------------------------------------------------------------------*/
input:
file(data4allAavars) from data_vars
output:
file "predicted_outcomes.csv" into predicted_outcomes
"""
python ${params.SCRIPTHOME}/deployML.py --inputdata ${data4allAavars} \
--modelpickle ${params.RFMLM} \
--output predicted_outcomes.csv
"""
}
process formatReport {
/*----------------------------------------------------------------------------
Generates the formatted CSV and HTML reports and output to params.outfolder
-----------------------------------------------------------------------------*/
input:
file(vars) from var2aa_report.collect()
file(outcomes) from predicted_outcomes
output:
file "${outcomes}"
file "*.html"
publishDir "${params.outfolder}/", mode:'copy'
"""
for treeFile in ${vars}
do
tail -n +2 \$treeFile >>allVariantsInOneFile.csv
done
python ${params.SCRIPTHOME}/formatOutput.py --prediction ${outcomes} \
--variants allVariantsInOneFile.csv \
--dataHome ${params.dbhome}
"""
}
workflow.onComplete {
println ( workflow.success ? "\nDone! Open the following report in your browser --> " : "Oops .. something went wrong" )
}