From 7684769e2bc4092b4b7be6827be9e5a68239fd7c Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Tue, 28 Nov 2023 17:56:32 -0500
Subject: [PATCH] Adding rules to aggregate blast results into a spreadsheet

---
 workflow/Snakefile                    |   5 ++
 workflow/rules/paired-end.smk         | 112 +++++++++++++++++++++++++-
 workflow/scripts/files2spreadsheet.py |   4 +-
 3 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index ae1716b..bb57c63 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -118,12 +118,17 @@ rule all:
             join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), 
             name=pe_samples
         ),
+        provided(
+            [join(workpath,"Project","metaspades_blast.xlsx")],
+            pe_samples
+        ),
         # BLAST Megahit contigs against nt virsuses database,
         # @imported from rules/paired-end.smk
         expand(
             join(workpath,"{name}","output","{name}.megahit_blast.tsv"), 
             name=samples
         ),
+        join(workpath,"Project","megahit_blast.xlsx"),
         # Krona, taxonomic classification report
         # @imported from rules/paired-end.smk
         expand(
diff --git a/workflow/rules/paired-end.smk b/workflow/rules/paired-end.smk
index 3d7e3bc..45d91e9 100644
--- a/workflow/rules/paired-end.smk
+++ b/workflow/rules/paired-end.smk
@@ -695,7 +695,7 @@ rule blast_metaspades_contigs:
     output:
         blast=join(workpath,"{name}","output","{name}.metaspades_blast.tsv"),
     params:
-        rname='blastmetaspadest',
+        rname='blastmetaspades',
         blast_db=config['references']['blast_viral_db'],
         header=join(workpath,"{name}","output","{name}.metaspades_blast.header.tsv"),
         tmp=join(workpath,"{name}","output","{name}.metaspades_blast.tmp.tsv"),
@@ -717,7 +717,7 @@ rule blast_metaspades_contigs:
     container: config['images']['blast']
     shell: """
     # BLAST metaspades contigs against viral database
-    echo -e "#{params.outfmt_tabs}" \\
+    echo -e "{params.outfmt_tabs}" \\
     > {params.header}
     blastn -query {input.contigs} \\
         -db {params.blast_db} \\
@@ -733,6 +733,59 @@ rule blast_metaspades_contigs:
     """
 
 
+rule blast_metaspades_xlsx:
+    """
+    Data-processing step to aggregate the blast-ed metaspades contigs results into
+    a single XLSX file. Within the output XLSX file, there will be a tab for each
+    sample's blast results. The first tab in the spreadsheet will contain concat
+    blast results across all samples.
+    @Input:
+        Blast texts file containing alignment results of metaspades contigs (gather)
+    @Output:
+        Blast XLSX file containing alignment results of metaspades contigs for all samples
+    """
+    input:
+        blasts=expand(join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), name=pe_samples),
+    output:
+        tsv=join(workpath,"Project","metaspades_blast.tsv"), 
+        excel=join(workpath,"Project","metaspades_blast.xlsx"),
+    params:
+        rname='blastmetaspadesxlsx',
+        outfmt_tabs='\\t'.join([
+            'sample', 'qaccver', 'saccver', 'staxid', 
+            'ssciname', 'scomname', 'sblastname', 
+            'sscinames', 'stitle', 'pident', 'qlen', 
+            'length', 'mismatch', 'gapopen', 'qstart',
+            'qend', 'sstart', 'send', 'evalue', 'bitscore'
+        ]),
+        extension=".metaspades_blast.tsv",
+        script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"),
+    threads: int(allocated("threads", "blast_metaspades_xlsx", cluster))
+    container: config['images']['blast']
+    shell: """
+    # Create aggregated single file 
+    # results of BLAST-ed metaspades 
+    # contigs against viral database
+    echo -e "{params.outfmt_tabs}" \\
+    > {output.tsv}
+    # Adding a column to the output
+    # that contains each sample name
+    awk -F '\\t' -v OFS='\\t' \\
+        'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\
+        {input.blasts} \\
+    > {output.tsv}
+
+    # Create an excel spreadsheet 
+    # containing the blast results 
+    # of aligning the metaspades
+    # contigs against NCBI viral db 
+    {params.script} \\
+        --rm-suffix "{params.extension}" \\
+        --input {output.tsv} {input.blasts} \\
+        --output {output.excel}
+    """
+
+
 rule blast_megahit_contigs:
     """
     Data-processing step to blast assembled contigs against nt virsuses database.
@@ -772,7 +825,7 @@ rule blast_megahit_contigs:
     container: config['images']['blast']
     shell: """
     # BLAST megahit contigs against viral database
-    echo -e "#{params.outfmt_tabs}" \\
+    echo -e "{params.outfmt_tabs}" \\
     > {params.header}
     blastn -query {input.contigs} \\
         -db {params.blast_db} \\
@@ -788,6 +841,59 @@ rule blast_megahit_contigs:
     """
 
 
+rule blast_megahit_xlsx:
+    """
+    Data-processing step to aggregate the blast-ed megahit contigs results into
+    a single XLSX file. Within the output XLSX file, there will be a tab for each
+    sample's blast results. The first tab in the spreadsheet will contain concat
+    blast results across all samples.
+    @Input:
+        Blast texts file containing alignment results of megahit contigs (gather)
+    @Output:
+        Blast XLSX file containing alignment results of megahit contigs for all samples
+    """
+    input:
+        blasts=expand(join(workpath,"{name}","output","{name}.megahit_blast.tsv"), name=samples),
+    output:
+        tsv=join(workpath,"Project","megahit_blast.tsv"), 
+        excel=join(workpath,"Project","megahit_blast.xlsx"),
+    params:
+        rname='blastmegahitxlsx',
+        outfmt_tabs='\\t'.join([
+            'sample', 'qaccver', 'saccver', 'staxid', 
+            'ssciname', 'scomname', 'sblastname', 
+            'sscinames', 'stitle', 'pident', 'qlen', 
+            'length', 'mismatch', 'gapopen', 'qstart',
+            'qend', 'sstart', 'send', 'evalue', 'bitscore'
+        ]),
+        extension=".megahit_blast.tsv",
+        script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"),
+    threads: int(allocated("threads", "blast_megahit_xlsx", cluster))
+    container: config['images']['blast']
+    shell: """
+    # Create aggregated single file 
+    # results of BLAST-ed megahit 
+    # contigs against viral database
+    echo -e "{params.outfmt_tabs}" \\
+    > {output.tsv}
+    # Adding a column to the output
+    # that contains each sample name
+    awk -F '\\t' -v OFS='\\t' \\
+        'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\
+        {input.blasts} \\
+    > {output.tsv}
+
+    # Create an excel spreadsheet 
+    # containing the blast results 
+    # of aligning the megahit
+    # contigs against NCBI viral db 
+    {params.script} \\
+        --rm-suffix "{params.extension}" \\
+        --input {output.tsv} {input.blasts} \\
+        --output {output.excel}
+    """
+
+
 rule krona:
     """
     Reporting step to create a interactive Krona charts based on
diff --git a/workflow/scripts/files2spreadsheet.py b/workflow/scripts/files2spreadsheet.py
index f8bb204..a38618b 100755
--- a/workflow/scripts/files2spreadsheet.py
+++ b/workflow/scripts/files2spreadsheet.py
@@ -298,9 +298,9 @@ def excel_writer(files, spreadsheet,  skip_comments=None, remove_suffix = ''):
                 # additional suffix to remove
                 # we must take into account the
                 # extension has already been
-                # removed prior to 
+                # removed, see above
                 cleaned_suffix, ext = os.path.splitext(remove_suffix)
-                sheet = sheet.split(cleaned_suffix)[0]
+                sheet = sheet.rsplit(cleaned_suffix, 1)[0]
             
             # Sheet name cannot exceed 
             # 31 characters in length