From 5eb3ede26a8dcd67053b0e30e05fd1104c514bef Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Fri, 15 Sep 2023 16:42:02 -0400
Subject: [PATCH] Updating docstring and setting up sqanti rules

---
 workflow/rules/quant.smk  |  2 +-
 workflow/rules/sqanti.smk | 46 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 workflow/rules/sqanti.smk

diff --git a/workflow/rules/quant.smk b/workflow/rules/quant.smk
index 5acbbb6..08b7a2e 100644
--- a/workflow/rules/quant.smk
+++ b/workflow/rules/quant.smk
@@ -124,7 +124,7 @@ rule flair_collapse:
     be concatenated prior to running flair-collapse. 
     Github: https://github.com/BrooksLabUCSC/flair
     @Input:
-        FLAIR Correct Genomic Alignments in BED12 (scatter)
+        FLAIR Correct Genomic Alignments in BED12 (gather)
     @Output:
         High-confidence Isoforms (BED),
         High-confidence Isoforms (GTF),
diff --git a/workflow/rules/sqanti.smk b/workflow/rules/sqanti.smk
new file mode 100644
index 0000000..cfa94fa
--- /dev/null
+++ b/workflow/rules/sqanti.smk
@@ -0,0 +1,46 @@
+# Sqanti-related quality-control and filtering rules.
+# Sqanti is being used to annotate/characterize novel
+# isoforms and to build an even higher-confidence,
+# filtered set of unique transcripts from flair.
+# The resulting annotation/transcriptome will be
+# used to quantify known/novel isoforms.
+rule sqanti_qc:
+    """
+    Data-processing step to characterize the input transcriptome 
+    by computing a series of attributes by transcript, which are
+    written to the classification file, and a series of attributes
+    by junction, which are written to the junctions file. Please 
+    note that although we are running SQANTI3, the actual version
+    of the tool we are using is 'v5.1.2'. For more information, 
+    please read through sqanti3's documentation:
+    https://github.com/ConesaLab/SQANTI3/wiki/
+    Github: https://github.com/ConesaLab/SQANTI3
+    @Input:
+        High-confidence Isoforms (FASTA) from flair collapse 
+    @Output:
+        Sqanti Classification file (TSV),
+        Corrected Annotation (GTF),
+        Corrected Transcriptome (FASTA)
+    """
+    pass
+
+
+rule sqanti_ml_filter:
+    """
+    Data-processing step to filter the sqanti qc output. The author
+    of sqanti highly recommends filtering its output before using
+    it in down-stream analysis. Sqanti has a new filtering method
+    that employs random forest to discriminate potential artifacts
+    from true isoforms without the need for user-defined rules or
+    manually-set thresholds (i.e. previous method). For more info, 
+    please read through sqanti3's documentation:
+    https://github.com/ConesaLab/SQANTI3/wiki/
+    Github: https://github.com/ConesaLab/SQANTI3
+    @Input:
+        Sqanti Classification file (TSV) 
+    @Output:
+        ML Filtered Sqanti Classification file (TSV),
+        ML Filtered Corrected Annotation (GTF),
+        ML Filtered Corrected Transcriptome (FASTA)
+    """
+    pass
\ No newline at end of file