Improve the Falco wrapper (galaxyproject#6116)

* Improve the Falco wrapper
  - Mention FastQC in the description to make the tool findable when searching for the original
  - Add information about advantages and current limitations compared to FastQC to the help section
  - Do not produce the results summary file by default, but only on demand
  - Simplify symlinking
* Fix wrongly negated output filter
pavanvidem · Jun 30, 2024 · e1af25e · e1af25e
1 parent dd35055
commit e1af25e
Showing 1 changed file with 62 additions and 60 deletions.
diff --git a/tools/falco/falco.xml b/tools/falco/falco.xml
@@ -1,5 +1,5 @@
-<tool id="falco" name="Falco" version="1.2.2+galaxy0" profile="21.05">
-    <description>A high throughput sequence QC analysis tool</description>
+<tool id="falco" name="Falco" version="1.2.2+galaxy1" profile="21.05">
+    <description>An alternative, more performant implementation of FastQC for high throughput sequence quality control</description>
     <xrefs>
         <xref type="bio.tools">falco</xref>
     </xrefs>
@@ -8,15 +8,7 @@
     </requirements>
     <command detect_errors="aggressive"><![CDATA[
         #import re
-        #set input_name = re.sub('[^\w\-\s]', '_', str($input_file.element_identifier))
-
-        #if $input_file.ext.endswith('.gz'):
-            #set input_file_sl = $input_name + '.gz'
-        #elif $input_file.ext.endswith('.bz2'):
-            #set input_file_sl = $input_name + '.bz2'
-        #else
-            #set input_file_sl = $input_name
-        #end if
+        #set input_name_sl = re.sub('[^\w\-\s]', '_', str($input_file.element_identifier))
 
         #if 'bam' in $input_file.ext:
             #set format = 'bam'
@@ -28,7 +20,7 @@
             #set format = 'fastq'
         #end if
 
-        ln -s '${input_file}' '${input_file_sl}' &&
+        ln -s '${input_file}' '${input_name_sl}' &&
         falco
             #if $contaminants:
                 --contaminants '${contaminants}'
@@ -43,140 +35,164 @@
             #end if
             --threads \${GALAXY_SLOTS:-2}
             --quiet
-            --extract
             ## #if $min_length:
             ##     --min_length $min_length
             ## #end if
             $nogroup
             ## --kmers $kmers
             -f '${format}'
-            '${input_file_sl}'
-            -subsample $subsample
+            '${input_name_sl}'
+            #if $subsample > 1:
+              -subsample $subsample
+            #end if
             $bisulfite
             $reverse_complement
-
+            $generate_summary
     ]]></command>
     <inputs>
-        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="input_file" type="data" label="Raw read data from your current history"/>
+        <param format="fastq,fastq.gz,bam,sam" name="input_file" type="data" label="Raw read data from your current history"/>
         <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" help="tab delimited file with 2 columns: name and sequence.  For example: Illumina Small RNA RT Primer&#x9;CAAGCAGAAGACGGCATACGA"/>
         <param argument="--adapters" type="data" format="tabular" optional="true" label="Adapter list" help="List of adapters adapter sequences which will be explicity searched against the library. It should be a tab-delimited file with 2 columns: name and sequence."/>
         <param name="limits" type="data" format="txt" optional="true" label="Submodule and Limit specifing file" help="a file that specifies which submodules are to be executed (default=all) and also specifies the thresholds for the each submodules warning parameter."/>
         <param argument="--nogroup" type="boolean" truevalue="--nogroup" falsevalue="" checked="False" label="Disable grouping of bases for reads &gt;50bp" help=" Using this option, your plots may end up a ridiculous size. You have been warned!"/>
         <!-- Not implemented in falco yet <param argument="-min_length" type="integer" value="" optional="true" label="Lower limit on the length of the sequence to be shown in the report" help=" [NOT YET IMPLEMENTED IN FALCO]. Sets an artificial lower limit on the length of the sequence to be shown in the report. As long as you set this to a value greater or equal to your longest read length then this will be the sequence length used to create your read groups. This can be useful for making  directly comaparable statistics from datasets with somewhat variable read length."/> -->
         <!-- Ignored by falco and always set to 7 <param argument="-kmers" type="integer" value="7" min="2" max="10" label="Length of Kmer to look for" help="IGNORED BY FALCO AND ALWAYS SET TO 7. Specifies the length of Kmer to look for in the Kmer content module. Specified Kmer length must be between 2 and 10. Default length is 7 if not specified." /> -->
-        <param argument="-subsample" type="integer" value="1" min="1" optional="true" label="Subsampling Factor" help="This makes falco faster (but possibly less accurate) by only processing reads that are multiple of this value (using 0-based indexing to number reads)"/>
+        <param argument="-subsample" type="integer" value="1" min="1" label="Subsampling Factor" help="This makes falco faster (but possibly less accurate) by only processing reads that are multiple of this value (using 0-based indexing to number reads)"/>
         <param argument="-bisulfite" type="boolean" truevalue="-bisulfite" falsevalue="" checked="False" label="Bisulfite Sequencing" help="This parameter indicates whether the reads are from whole genome bisulfite sequencing. When enabled, Falco will account for the expected increase in Ts and decrease in Cs in the base content."/>
         <param argument="reverse_complement" type="boolean" truevalue="-reverse-complement" falsevalue="" checked="False" label="Reverse Complement" help="This parameter specifies whether the input sequences are reverse-complemented. When enabled, all modules in Falco will be tested by swapping A/T and C/G."/>
+        <param name="generate_summary" type="boolean" truevalue="" falsevalue="-skip-summary" checked="False" label="Generate summary output of QC test results" />
     </inputs>
     <outputs>
         <data format="html" name="html_file" from_work_dir="fastqc_report.html" label="${tool.name} on ${on_string}: Webpage"/>
         <data format="txt" name="text_file" from_work_dir="fastqc_data.txt" label="${tool.name} on ${on_string}: RawData"/>
-        <data format="txt" name="summary_file" from_work_dir="summary.txt" label="${tool.name} on ${on_string}: SummaryData"/>
+        <data format="txt" name="summary_file" from_work_dir="summary.txt" label="${tool.name} on ${on_string}: SummaryData">
+            <filter>generate_summary</filter>
+        </data>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq"/>
             <output name="html_file" file="fastqc_report.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data.txt" ftype="txt"/>
-            <output name="summary_file" file="summary.txt" ftype="txt"/>
         </test>
-        <test>
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="contaminants" value="contaminant_list.txt" ftype="tabular"/>
             <output name="html_file" file="fastqc_report_contaminants.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_contaminants.txt" ftype="txt"/>
-            <output name="summary_file" file="fastqc_data_contaminant_summary.txt" ftype="txt"/>
         </test>
-        <test>
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="adapters" value="adapter_list.txt" ftype="tabular"/>
             <output name="html_file" file="fastqc_report_adapters.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_adapters.txt" ftype="txt"/>
-            <output name="summary_file" file="fastqc_data_adapters_summary.txt" ftype="txt"/>
         </test>
-        <test>
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="limits" value="limits.txt" ftype="txt"/>
             <output name="html_file" file="fastqc_report_customlimits.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_customlimits.txt" ftype="txt"/>
-            <output name="summary_file" file="fastqc_data_customlimits_summary.txt" ftype="txt"/>
         </test>
-        <!-- ## This feature has not yet been implemented in Falco, but if it is, it may go uncommented in the future.
-        <test>
+
+        <!-- ## The kmers param is ignored in Falco and always set to 7. If this ever gets reconsidered, this test could be uncommented.
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq" ftype="fastq"/>
             <param name="kmers" value="7"/>
             <param name="limits" value="limits.txt" ftype="txt"/>
             <output name="html_file" file="fastqc_report_kmer.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_kmer.txt" ftype="txt"/>
-            <output name="summary_file" file="fastqc_data_kmer_summary.txt" ftype="txt"/>
             <assert_command>
                 <has_text text="kmers 7"/> 
             </assert_command>
         </test>  
-        
-        <test> ##This feature is ignored in Falco and always set to 7. If this will be considered, may go uncommented in the future"
+            ## The min_length param is not yet implemented in Falco.
+               Once it is, this test may be uncommented.
+        <test expect_num_outputs="2">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="min_length" value="108"/>
             <output name="html_file" file="fastqc_report_min_length.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_min_length.txt" ftype="txt"/>
-            <output name="summary_file" file="fastqc_data_min_length_summary.txt" ftype="txt"/>
         </test> -->
 
-        <test>
+        <test expect_num_outputs="3">
             <param name="input_file" value="1000trimmed.fastq" ftype="fastq"/>
             <param name="nogroup" value="--nogroup"/>
+            <param name="generate_summary" value="true"/>
             <output name="html_file" file="fastqc_report_nogroup.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_data_nogroup.txt" ftype="txt"/>
             <output name="summary_file" file="fastqc_data_nogroup_summary.txt" ftype="txt"/>
             <assert_command>
                 <has_text text="--nogroup"/>
             </assert_command>
         </test>
-        <test>
+        <test expect_num_outputs="3">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="subsample" value="10"/>
+            <param name="generate_summary" value="true"/>
             <output name="html_file" file="fastqc_report_subsample.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_report_subsample.txt" ftype="txt"/>
             <output name="summary_file" file="fastqc_report_subsample_summary.txt" ftype="txt"/>
         </test>
-        <test>
+        <test expect_num_outputs="3">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="bisulfite" value="-bisulfite"/>
+            <param name="generate_summary" value="true"/>
             <output name="html_file" file="fastqc_report_bisulfite.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_report_bisulfite.txt" ftype="txt"/>
             <output name="summary_file" file="fastqc_report_bisulfite_summary.txt" ftype="txt"/>
         </test>
-        <test>
+        <test expect_num_outputs="3">
             <param name="input_file" value="1000trimmed.fastq"/>
             <param name="reverse_complement" value="-reverse-complement"/>
+            <param name="generate_summary" value="true"/>
             <output name="html_file" file="fastqc_report_reverse_complement.html" ftype="html" lines_diff="2"/>
             <output name="text_file" file="fastqc_report_reverse_complement.txt" ftype="txt"/>
             <output name="summary_file" file="fastqc_report_reverse_complement_summary.txt" ftype="txt"/>
         </test>
     </tests>
     <help><![CDATA[
-.. class:: infomark
+**What it does**
+
+Falco_ is a high-speed emulation of the popular FastQC software for quality control of sequencing data.
 
-**Purpose**
+💚️ With its superior performance Falco saves computational resources and gives you back results faster than FastQC.
 
-Falco is an emulation of the popular FastQC software to check large sequencing reads for common problems.
+We recommend it for most use cases (but see below for exceptions). 💚️
 
-The main functions of Falco are:
+The main functions of Falco are very similar to those of FastQC:
 
 - Import of data from BAM, SAM or FastQ/FastQ.gz files (any variant),
 - Providing a quick overview to tell you in which areas there may be problems
 - Summary graphs and tables to quickly assess your data
 - Export of results to an HTML based permanent report
 - Offline operation to allow automated generation of reports without running the interactive application
 
------
-
 .. class:: infomark
 
+The plain text report generated by Falco can be used as a "FastQC" report in MultiQC; its data is very similar, though not 100% identical, to that generated by FastQC on the same inputs.
+
+.. class:: warningmark
+
+    In the following situations, FastQC is still a better solution than this version of Falco:
+
+- your input is bz2-compressed fastq
+
+  Falco doesn't currently support fastq.bz2 as an input format, so Galaxy has to perform a relatively slow format conversion before running the tool, which overall makes the analysis slower than with FastQC.
+
+- you are interested in PolyA and PolyG statistics in the Adapter Content section of the quality report
+
+  Falco doesn't currently calculate statistics for these "Adapters" by default.
+
+- your input consists of *mapped* reads in SAM/BAM format
+
+  Due to a bug in the current version of Falco, reads mapped to the reverse strand of the reference genome are not handled correctly and reported metrics are wrong!
+
+-----
+
 **Inputs and outputs**
 
-Falco_ is the best place to look for documentation - it's very good.
-A summary follows below for those in a tearing hurry.
+The Falco_ development repo includes very good documentation.
+A summary of it follows below for those in a tearing hurry.
 
 This wrapper will accept a Galaxy fastq, fastq.gz, sam or bam as the input read file to check.
 It will also take an optional file containing a list of contaminants information, in the form of
@@ -200,22 +216,8 @@ The tool produces a basic text and a HTML output file that contain all of the re
 
 All except Basic Statistics and Overrepresented sequences are plots.
  .. _Falco: https://github.com/smithlabcode/falco/
- .. _Picard-tools: https://broadinstitute.github.io/picard/
     ]]></help>
     <citations>
-        <citation type="bibtex">
-            @article{deSenaBrandine2021,
-            author = {de Sena Brandine, Gabriel and Smith, Andrew D.},
-            title = {Falco: high-speed FastQC emulation for quality control of sequencing data},
-            journal = {F1000Research},
-            year = {2021},
-            volume = {8},
-            pages = {1874},
-            url = {https://doi.org/10.12688/f1000research.21142.2},
-            doi = {10.12688/f1000research.21142.2},
-            note = {Version 2; peer review: 2 approved},
-          }
-
-        </citation>
+        <citation type="doi">10.12688/f1000research.21142.2</citation>
     </citations>
 </tool>