From 7684769e2bc4092b4b7be6827be9e5a68239fd7c Mon Sep 17 00:00:00 2001 From: skchronicles Date: Tue, 28 Nov 2023 17:56:32 -0500 Subject: [PATCH] Adding rules to aggregate blast results into a spreadsheet --- workflow/Snakefile | 5 ++ workflow/rules/paired-end.smk | 112 +++++++++++++++++++++++++- workflow/scripts/files2spreadsheet.py | 4 +- 3 files changed, 116 insertions(+), 5 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ae1716b..bb57c63 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -118,12 +118,17 @@ rule all: join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), name=pe_samples ), + provided( + [join(workpath,"Project","metaspades_blast.xlsx")], + pe_samples + ), # BLAST Megahit contigs against nt virsuses database, # @imported from rules/paired-end.smk expand( join(workpath,"{name}","output","{name}.megahit_blast.tsv"), name=samples ), + join(workpath,"Project","megahit_blast.xlsx"), # Krona, taxonomic classification report # @imported from rules/paired-end.smk expand( diff --git a/workflow/rules/paired-end.smk b/workflow/rules/paired-end.smk index 3d7e3bc..45d91e9 100644 --- a/workflow/rules/paired-end.smk +++ b/workflow/rules/paired-end.smk @@ -695,7 +695,7 @@ rule blast_metaspades_contigs: output: blast=join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), params: - rname='blastmetaspadest', + rname='blastmetaspades', blast_db=config['references']['blast_viral_db'], header=join(workpath,"{name}","output","{name}.metaspades_blast.header.tsv"), tmp=join(workpath,"{name}","output","{name}.metaspades_blast.tmp.tsv"), @@ -717,7 +717,7 @@ rule blast_metaspades_contigs: container: config['images']['blast'] shell: """ # BLAST metaspades contigs against viral database - echo -e "#{params.outfmt_tabs}" \\ + echo -e "{params.outfmt_tabs}" \\ > {params.header} blastn -query {input.contigs} \\ -db {params.blast_db} \\ @@ -733,6 +733,59 @@ rule blast_metaspades_contigs: """ +rule blast_metaspades_xlsx: + """ + Data-processing step to aggregate the blast-ed metaspades contigs results into + a single XLSX file. Within the output XLSX file, there will be a tab for each + sample's blast results. The first tab in the spreadsheet will contain concat + blast results across all samples. + @Input: + Blast texts file containing alignment results of metaspades contigs (gather) + @Output: + Blast XLSX file containing alignment results of metaspades contigs for all samples + """ + input: + blasts=expand(join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), name=pe_samples), + output: + tsv=join(workpath,"Project","metaspades_blast.tsv"), + excel=join(workpath,"Project","metaspades_blast.xlsx"), + params: + rname='blastmetaspadesxlsx', + outfmt_tabs='\\t'.join([ + 'sample', 'qaccver', 'saccver', 'staxid', + 'ssciname', 'scomname', 'sblastname', + 'sscinames', 'stitle', 'pident', 'qlen', + 'length', 'mismatch', 'gapopen', 'qstart', + 'qend', 'sstart', 'send', 'evalue', 'bitscore' + ]), + extension=".metaspades_blast.tsv", + script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"), + threads: int(allocated("threads", "blast_metaspades_xlsx", cluster)) + container: config['images']['blast'] + shell: """ + # Create aggregated single file + # results of BLAST-ed metaspades + # contigs against viral database + echo -e "{params.outfmt_tabs}" \\ + > {output.tsv} + # Adding a column to the output + # that contains each sample name + awk -F '\\t' -v OFS='\\t' \\ + 'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\ + {input.blasts} \\ + > {output.tsv} + + # Create an excel spreadsheet + # containing the blast results + # of aligning the metaspades + # contigs against NCBI viral db + {params.script} \\ + --rm-suffix "{params.extension}" \\ + --input {output.tsv} {input.blasts} \\ + --output {output.excel} + """ + + rule blast_megahit_contigs: """ Data-processing step to blast assembled contigs against nt virsuses database. @@ -772,7 +825,7 @@ rule blast_megahit_contigs: container: config['images']['blast'] shell: """ # BLAST megahit contigs against viral database - echo -e "#{params.outfmt_tabs}" \\ + echo -e "{params.outfmt_tabs}" \\ > {params.header} blastn -query {input.contigs} \\ -db {params.blast_db} \\ @@ -788,6 +841,59 @@ rule blast_megahit_contigs: """ +rule blast_megahit_xlsx: + """ + Data-processing step to aggregate the blast-ed megahit contigs results into + a single XLSX file. Within the output XLSX file, there will be a tab for each + sample's blast results. The first tab in the spreadsheet will contain concat + blast results across all samples. + @Input: + Blast texts file containing alignment results of megahit contigs (gather) + @Output: + Blast XLSX file containing alignment results of megahit contigs for all samples + """ + input: + blasts=expand(join(workpath,"{name}","output","{name}.megahit_blast.tsv"), name=samples), + output: + tsv=join(workpath,"Project","megahit_blast.tsv"), + excel=join(workpath,"Project","megahit_blast.xlsx"), + params: + rname='blastmegahitxlsx', + outfmt_tabs='\\t'.join([ + 'sample', 'qaccver', 'saccver', 'staxid', + 'ssciname', 'scomname', 'sblastname', + 'sscinames', 'stitle', 'pident', 'qlen', + 'length', 'mismatch', 'gapopen', 'qstart', + 'qend', 'sstart', 'send', 'evalue', 'bitscore' + ]), + extension=".megahit_blast.tsv", + script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"), + threads: int(allocated("threads", "blast_megahit_xlsx", cluster)) + container: config['images']['blast'] + shell: """ + # Create aggregated single file + # results of BLAST-ed megahit + # contigs against viral database + echo -e "{params.outfmt_tabs}" \\ + > {output.tsv} + # Adding a column to the output + # that contains each sample name + awk -F '\\t' -v OFS='\\t' \\ + 'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\ + {input.blasts} \\ + > {output.tsv} + + # Create an excel spreadsheet + # containing the blast results + # of aligning the megahit + # contigs against NCBI viral db + {params.script} \\ + --rm-suffix "{params.extension}" \\ + --input {output.tsv} {input.blasts} \\ + --output {output.excel} + """ + + rule krona: """ Reporting step to create a interactive Krona charts based on diff --git a/workflow/scripts/files2spreadsheet.py b/workflow/scripts/files2spreadsheet.py index f8bb204..a38618b 100755 --- a/workflow/scripts/files2spreadsheet.py +++ b/workflow/scripts/files2spreadsheet.py @@ -298,9 +298,9 @@ def excel_writer(files, spreadsheet, skip_comments=None, remove_suffix = ''): # additional suffix to remove # we must take into account the # extension has already been - # removed prior to + # removed, see above cleaned_suffix, ext = os.path.splitext(remove_suffix) - sheet = sheet.split(cleaned_suffix)[0] + sheet = sheet.rsplit(cleaned_suffix, 1)[0] # Sheet name cannot exceed # 31 characters in length