diff --git a/CHANGELOG.md b/CHANGELOG.md index 13808caf..c4d641ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#500](https://github.com/nf-core/funcscan/pull/500) Updated pipeline template to nf-core/tools version 3.4.1 (by @jfy133) - [#508](https://github.com/nf-core/funcscan/pull/508) Added support for antiSMASH's --clusterhmmer, --fullhmmer, and --tigrfam options (❤️ to @yusukepockyby for requesting, @jfy133) +- [#506](https://github.com/nf-core/funcscan/pull/506) Added support for GECCO convert to generate additional files useful for downstream analysis (by @SkyLexS) ### `Fixed` diff --git a/README.md b/README.md index 77f8d6d1..013fd0b7 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ nf-core/funcscan was originally written by Jasmin Frangenberg, Anan Ibrahim, Lou We thank the following people for their extensive assistance in the development of this pipeline: -Adam Talbot, Alexandru Mizeranschi, Hugo Tavares, Júlia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion. +Adam Talbot, Alexandru Mizeranschi, Hugo Tavares, Júlia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion, Dediu Octavian-Codrin. ## Contributions and Support diff --git a/conf/modules.config b/conf/modules.config index 650c4ad0..26186279 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -532,6 +532,14 @@ process { ].join(' ').trim() } + withName: GECCO_CONVERT { + publishDir = [ + path: { "${params.outdir}/bgc/gecco/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: HAMRONIZATION_ABRICATE { publishDir = [ path: { "${params.outdir}/arg/hamronization/abricate" }, diff --git a/conf/test_bgc_bakta.config b/conf/test_bgc_bakta.config index a91483c9..27717806 100644 --- a/conf/test_bgc_bakta.config +++ b/conf/test_bgc_bakta.config @@ -23,7 +23,7 @@ params { config_profile_description = 'Minimal test dataset to check BGC workflow function' // Input data - input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_hits.csv' bgc_antismash_db = params.pipelines_testdata_base_path + 'funcscan/databases/antismash_trimmed_8_0_1.tar.gz' annotation_tool = 'bakta' @@ -33,6 +33,10 @@ params { run_amp_screening = false run_bgc_screening = true + bgc_gecco_runconvert = true + bgc_gecco_convertmode = 'gbk' + bgc_gecco_convertformat = 'bigslice' + bgc_run_hmmsearch = true bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' } diff --git a/conf/test_bgc_prokka.config b/conf/test_bgc_prokka.config index 54d5d0db..865e3163 100644 --- a/conf/test_bgc_prokka.config +++ b/conf/test_bgc_prokka.config @@ -23,7 +23,7 @@ params { config_profile_description = 'Minimal test dataset to check BGC workflow function' // Input data - input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_hits.csv' bgc_antismash_db = params.pipelines_testdata_base_path + 'funcscan/databases/antismash_trimmed_8_0_1.tar.gz' annotation_tool = 'prokka' @@ -32,6 +32,10 @@ params { run_amp_screening = false run_bgc_screening = true + bgc_gecco_runconvert = true + bgc_gecco_convertmode = 'gbk' + bgc_gecco_convertformat = 'fna' + bgc_run_hmmsearch = true bgc_hmmsearch_models = 
'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' } diff --git a/conf/test_bgc_pyrodigal.config b/conf/test_bgc_pyrodigal.config index 4b986dd6..cbd19fd6 100644 --- a/conf/test_bgc_pyrodigal.config +++ b/conf/test_bgc_pyrodigal.config @@ -23,7 +23,7 @@ params { config_profile_description = 'Minimal test dataset to check BGC workflow function' // Input data - input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_hits.csv' bgc_antismash_db = params.pipelines_testdata_base_path + 'funcscan/databases/antismash_trimmed_8_0_1.tar.gz' annotation_tool = 'pyrodigal' @@ -32,6 +32,10 @@ params { run_amp_screening = false run_bgc_screening = true + bgc_gecco_runconvert = true + bgc_gecco_convertmode = 'clusters' + bgc_gecco_convertformat = 'gff' + bgc_run_hmmsearch = true bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' } diff --git a/conf/test_preannotated_bgc.config b/conf/test_preannotated_bgc.config index 8bc11a51..15ca6d71 100644 --- a/conf/test_preannotated_bgc.config +++ b/conf/test_preannotated_bgc.config @@ -32,6 +32,8 @@ params { run_amp_screening = false run_bgc_screening = true + bgc_gecco_runconvert = true + bgc_run_hmmsearch = true bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' } diff --git a/docs/output.md b/docs/output.md index 289d9086..bc36e94e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -457,15 +457,21 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation Output files - `gecco/` + - **GECCO** - `*.genes.tsv/`: TSV file containing detected/predicted genes with BGC probability scores - 
`*.features.tsv`: TSV file containing identified domains - `*.clusters.tsv`: TSV file containing coordinates of predicted clusters and BGC types - `*_cluster_*.gbk`: GenBank file (if clusters were found) containing sequence with annotations; one file per GECCO hit - - + - `*.gff`: GFF3 converted cluster tables containing the position and metadata for all the predicted clusters (only if `--bgc_gecco_runconvert --bgc_gecco_convertmode clusters --bgc_gecco_convertformat gff`) + - `*.region*.gbk`: Converted and aliased GenBank files so that they can be loaded by BiG-SLiCE (only if `--bgc_gecco_runconvert --bgc_gecco_convertmode gbk --bgc_gecco_convertformat bigslice`) + - `*.faa`: Amino-acid FASTA converted GenBank files of all the proteins in a cluster (only if `--bgc_gecco_runconvert --bgc_gecco_convertmode gbk --bgc_gecco_convertformat faa`) + - `*.fna`: Nucleotide sequence FASTA converted GenBank files from the cluster (only if `--bgc_gecco_runconvert --bgc_gecco_convertmode gbk --bgc_gecco_convertformat fna`) + [GECCO](https://gecco.embl.de) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs). +The additional GFF3, GenBank, or FASTA files produced with `--bgc_gecco_runconvert` can be useful for further downstream analysis of the BGC hits. + ### Summary tools [AMPcombi](#ampcombi), [hAMRonization](#hamronization), [comBGC](#combgc), [MultiQC](#multiqc), [pipeline information](#pipeline-information), [argNorm](#argnorm). 
diff --git a/modules/nf-core/gecco/convert/environment.yml b/modules/nf-core/gecco/convert/environment.yml new file mode 100644 index 00000000..7eefcd9e --- /dev/null +++ b/modules/nf-core/gecco/convert/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gecco=0.10.0 diff --git a/modules/nf-core/gecco/convert/main.nf b/modules/nf-core/gecco/convert/main.nf new file mode 100644 index 00000000..f1022d5f --- /dev/null +++ b/modules/nf-core/gecco/convert/main.nf @@ -0,0 +1,56 @@ +process GECCO_CONVERT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gecco:0.10.0--pyhdfd78af_0': + 'biocontainers/gecco:0.10.0--pyhdfd78af_0' }" + + input: + tuple val(meta), path(clusters), path(gbk) + val(mode) + val(format) + + output: + tuple val(meta), path("*.gff") , emit: gff , optional: true + tuple val(meta), path("*.region*.gbk"), emit: bigslice, optional: true + tuple val(meta), path("*.faa") , emit: faa , optional: true + tuple val(meta), path("*.fna") , emit: fna , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" // IMPORTANT: -o ${prefix} does not work in 0.10.0 + """ + gecco \\ + convert \\ + $args \\ + --jobs $task.cpus \\ + $mode \\ + --input-dir ./ \\ + --format ${format} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gecco: \$(echo \$(gecco --version) | cut -f 2 -d ' ' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo $args + + touch ${prefix}.gff + + cat <<-END_VERSIONS > 
versions.yml + "${task.process}": + gecco: \$(echo \$(gecco --version) | cut -f 2 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gecco/convert/meta.yml b/modules/nf-core/gecco/convert/meta.yml new file mode 100644 index 00000000..bf58cb52 --- /dev/null +++ b/modules/nf-core/gecco/convert/meta.yml @@ -0,0 +1,118 @@ +name: "gecco_convert" +description: | + This command helps transforming the output files created by + GECCO into helpful format, should you want to use the results in + combination with other tools. +keywords: + - bgc + - reformatting + - clusters + - gbk + - gff + - bigslice + - faa + - fna +tools: + - "gecco": + description: "Biosynthetic Gene Cluster prediction with Conditional Random Fields." + homepage: "https://gecco.embl.de" + documentation: "https://gecco.embl.de" + tool_dev_url: "https://github.com/zellerlab/GECCO" + doi: "10.1101/2021.05.03.442509" + licence: ["GPL v3"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - clusters: + type: file + description: | + TSV file containing coordinates of gecco predicted clusters and BGC types. + pattern: "*.clusters.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + - gbk: + type: file + description: | + Per cluster GenBank file containing sequence with annotations + pattern: "*.gbk" + ontologies: + - edam: "http://edamontology.org/format_1936" # GenBank + - mode: + type: string + description: Either clusters or gbk folder output, depending on what is reformatted + enum: ["clusters", "gbk"] + - format: + type: string + description: Format for the output file + enum: ["gff", "bigslice", "faa", "fna"] +output: + gff: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.gff": + type: file + description: | + GFF3 converted cluster tables containing the position + and metadata for all the predicted clusters + pattern: "*.gff" + ontologies: + - edam: "http://edamontology.org/format_1975" # GFF3 + bigslice: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.region*.gbk": + type: file + description: | + Converted and aliased GenBank files so that they can be loaded by BiG-SLiCE + pattern: "*.region*.gbk" + ontologies: + - edam: "http://edamontology.org/format_1936" # GenBank + faa: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.faa": + type: file + description: | + Amino-acid FASTA converted GenBank files of all the proteins in a cluster + pattern: "*.faa" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + fna: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.fna": + type: file + description: | + Nucleotide sequence FASTA converted GenBank files from the cluster + pattern: "*.fna" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML + +authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" diff --git a/modules/nf-core/gecco/convert/tests/main.nf.test b/modules/nf-core/gecco/convert/tests/main.nf.test new file mode 100644 index 00000000..769a1a87 --- /dev/null +++ b/modules/nf-core/gecco/convert/tests/main.nf.test @@ -0,0 +1,118 @@ +nextflow_process { + + name "Test Process GECCO_CONVERT" + script "../main.nf" + process "GECCO_CONVERT" + + tag "modules" + tag "modules_nfcore" + tag "gecco" + tag "gecco/convert" + tag "gecco/run" + + setup { + run("GECCO_RUN") { + script "../../run/main.nf" + process { + """ + input[0] = [ + [ id:'test_gecco', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true), + [] + ] + input[1] = [] + """ + } + } + } + + test("candidatus_portiera_aleyrodidarum - clusters - gff") { + + when { + process { + """ + input[0] = GECCO_RUN.out.clusters + .mix(GECCO_RUN.out.gbk) + .groupTuple(by:0) + .map { meta, paths -> + [meta, paths[0], paths[1]] + } + input[1] = "clusters" + input[2] = "gff" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.gff, + process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + + } + + test("candidatus_portiera_aleyrodidarum - gbk - faa") { + + when { + process { + """ + input[0] = GECCO_RUN.out.clusters + .mix(GECCO_RUN.out.gbk) + .groupTuple(by:0) + .map { meta, paths -> + [meta, paths[0], paths[1]] + } + input[1] = "gbk" + input[2] = "faa" + """ + } + } + + then 
{ + assertAll( + { assert process.success }, + { assert snapshot( + process.out.faa, + process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + + } + + test("candidatus_portiera_aleyrodidarum - clusters - gff - stub") { + + options "-stub" + + when { + process { + """ + input[0] = GECCO_RUN.out.clusters + .mix(GECCO_RUN.out.gbk) + .groupTuple(by:0) + .map { meta, paths -> + [meta, paths[0], paths[1]] + } + input[1] = "clusters" + input[2] = "gff" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gecco/convert/tests/main.nf.test.snap b/modules/nf-core/gecco/convert/tests/main.nf.test.snap new file mode 100644 index 00000000..96ce6a8c --- /dev/null +++ b/modules/nf-core/gecco/convert/tests/main.nf.test.snap @@ -0,0 +1,112 @@ +{ + "candidatus_portiera_aleyrodidarum - gbk - faa": { + "content": [ + [ + [ + { + "id": "test_gecco", + "single_end": true + }, + "NC_018507.1_cluster_1.faa:md5,82c70d6273c21eadf2d16f5fcdcd5e7f" + ] + ], + [ + { + "GECCO_CONVERT": { + "gecco": "0.10.0" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-09T09:00:23.11547703" + }, + "candidatus_portiera_aleyrodidarum - clusters - gff": { + "content": [ + [ + [ + { + "id": "test_gecco", + "single_end": true + }, + "test_gecco.clusters.gff:md5,21437cce86b3880f2c4d41798563d0df" + ] + ], + [ + { + "GECCO_CONVERT": { + "gecco": "0.10.0" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-09T09:00:08.211056173" + }, + "candidatus_portiera_aleyrodidarum - clusters - gff - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_gecco", + "single_end": true + }, + "test_gecco.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + 
"versions.yml:md5,86c663d3d9423b28361d384e9146c0a7" + ], + "bigslice": [ + + ], + "faa": [ + + ], + "fna": [ + + ], + "gff": [ + [ + { + "id": "test_gecco", + "single_end": true + }, + "test_gecco.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,86c663d3d9423b28361d384e9146c0a7" + ] + }, + [ + { + "GECCO_CONVERT": { + "gecco": "0.10.0" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.04.8" + }, + "timestamp": "2025-11-09T08:28:53.159566209" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 455759b6..b87d4938 100644 --- a/nextflow.config +++ b/nextflow.config @@ -253,6 +253,9 @@ params { bgc_gecco_pfilter = 1E-9 bgc_gecco_edgedistance = 0 bgc_gecco_mask = false + bgc_gecco_runconvert = false + bgc_gecco_convertmode = 'clusters' + bgc_gecco_convertformat = 'gff' bgc_run_hmmsearch = false bgc_hmmsearch_models = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 814f4b93..3b8d8fd8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -383,7 +383,7 @@ "default": "Bacteria", "fa_icon": "fas fa-crown", "description": "Specify the kingdom that the input represents.", - "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> \u26a0\ufe0f Prokka cannot annotate Eukaryotes.\n\nFor more information please check the Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", + "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> ⚠️ Prokka cannot annotate Eukaryotes.\n\nFor more information please check the Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", "enum": ["Archaea", "Bacteria", "Mitochondria", "Viruses"] }, "annotation_prokka_gcode": { @@ -399,12 +399,12 @@ "type": "integer", "default": 1, "description": "Minimum 
contig size required for annotation (bp).", - "help_text": "Specify the minimum contig lengths to carry out annotations on. The Prokka developers recommend that this should be \u2265 200 bp, if you plan to submit such annotations to NCBI.\n\nFor more information please check the Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--mincontiglen`", + "help_text": "Specify the minimum contig lengths to carry out annotations on. The Prokka developers recommend that this should be ≥ 200 bp, if you plan to submit such annotations to NCBI.\n\nFor more information please check the Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--mincontiglen`", "fa_icon": "fas fa-ruler-horizontal" }, "annotation_prokka_evalue": { "type": "number", - "default": 1e-6, + "default": 0.000001, "description": "E-value cut-off.", "help_text": "Specifiy the maximum E-value used for filtering the alignment hits.\n\nFor more information please check the Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--evalue`", "fa_icon": "fas fa-sort-amount-down" @@ -705,7 +705,7 @@ "amp_ampcombi_db": { "type": "string", "description": "The path to the folder containing the reference database files.", - "help_text": "The path to the folder containing the reference database files (`*.fasta` and `*.tsv`); a fasta file and the corresponding table with structural, functional and if reported taxonomic classifications. AMPcombi will then generate the corresponding `mmseqs2` directory, in which all binary files are prepared for the downstream alignment of the recovered AMPs with [MMseqs2](https://github.com/soedinglab/MMseqs2). 
These can also be provided by the user by setting up an mmseqs2 compatible database using `mmseqs createdb *.fasta` in a directory called `mmseqs2`.\n\nExample file structure for the reference database supplied by the user:\n\n```bash\namp_DRAMP_database/\n\u251c\u2500\u2500 general_amps_2024_11_13.fasta\n\u251c\u2500\u2500 general_amps_2024_11_13.txt\n\u2514\u2500\u2500 mmseqs2\n \u251c\u2500\u2500 ref_DB\n \u251c\u2500\u2500 ref_DB.dbtype\n \u251c\u2500\u2500 ref_DB_h\n \u251c\u2500\u2500 ref_DB_h.dbtype\n \u251c\u2500\u2500 ref_DB_h.index\n \u251c\u2500\u2500 ref_DB.index\n \u251c\u2500\u2500 ref_DB.lookup\n \u2514\u2500\u2500 ref_DB.source```\n\nFor more information check the AMPcombi [documentation](https://ampcombi.readthedocs.io/en/main/usage.html#parse-tables)." + "help_text": "The path to the folder containing the reference database files (`*.fasta` and `*.tsv`); a fasta file and the corresponding table with structural, functional and if reported taxonomic classifications. AMPcombi will then generate the corresponding `mmseqs2` directory, in which all binary files are prepared for the downstream alignment of the recovered AMPs with [MMseqs2](https://github.com/soedinglab/MMseqs2). These can also be provided by the user by setting up an mmseqs2 compatible database using `mmseqs createdb *.fasta` in a directory called `mmseqs2`.\n\nExample file structure for the reference database supplied by the user:\n\n```bash\namp_DRAMP_database/\n├── general_amps_2024_11_13.fasta\n├── general_amps_2024_11_13.txt\n└── mmseqs2\n ├── ref_DB\n ├── ref_DB.dbtype\n ├── ref_DB_h\n ├── ref_DB_h.dbtype\n ├── ref_DB_h.index\n ├── ref_DB.index\n ├── ref_DB.lookup\n └── ref_DB.source```\n\nFor more information check the AMPcombi [documentation](https://ampcombi.readthedocs.io/en/main/usage.html#parse-tables)." 
}, "amp_ampcombi_parsetables_cutoff": { "type": "number", @@ -1065,14 +1065,14 @@ }, "arg_rgi_includeloose": { "type": "boolean", - "description": "Include all of loose, strict and perfect hits (i.e. \u2265 95% identity) found by RGI.", + "description": "Include all of loose, strict and perfect hits (i.e. ≥ 95% identity) found by RGI.", "help_text": "When activated RGI output will include 'Loose' hits in addition to 'Strict' and 'Perfect' hits. The 'Loose' algorithm works outside of the detection model cut-offs to provide detection of new, emergent threats and more distant homologs of AMR genes, but will also catalog homologous sequences and spurious partial matches that may not have a role in AMR.\n\nFor more information check the RGI [documentation](https://github.com/arpcard/rgi).\n\n> Modifies tool parameter(s):\n> - RGI_MAIN: `--include_loose`", "fa_icon": "far fa-hand-scissors" }, "arg_rgi_includenudge": { "type": "boolean", "description": "Suppresses the default behaviour of RGI with `--arg_rgi_includeloose`.", - "help_text": "This flag suppresses the default behaviour of RGI, by listing all 'Loose' matches of \u2265 95% identity as 'Strict' or 'Perfect', regardless of alignment length.\n\nFor more information check the RGI [documentation](https://github.com/arpcard/rgi).\n\n> Modifies tool parameter(s):\n> - RGI_MAIN: `--include_nudge`", + "help_text": "This flag suppresses the default behaviour of RGI, by listing all 'Loose' matches of ≥ 95% identity as 'Strict' or 'Perfect', regardless of alignment length.\n\nFor more information check the RGI [documentation](https://github.com/arpcard/rgi).\n\n> Modifies tool parameter(s):\n> - RGI_MAIN: `--include_nudge`", "fa_icon": "fas fa-hand-scissors" }, "arg_rgi_lowquality": { @@ -1432,6 +1432,25 @@ "description": "The minimum number of annotated genes that must separate a cluster from the edge.", "help_text": "The minimum number of annotated genes that must separate a possible BGC cluster from the edge. 
Edge clusters will still be included if they are longer. A lower number will increase the number of false positives on small contigs. Used during BGC extraction.\n\nFor more information see the GECCO [documentation](https://github.com/zellerlab/GECCO).\n\n> Modifies tool parameter(s):\n> - GECCO: `--edge-distance`", "fa_icon": "fas fa-ruler-horizontal" + }, + "bgc_gecco_runconvert": { + "type": "boolean", + "description": "Enable GECCO file conversion to formats for downstream analysis.", + "help_text": "Converts GECCO output into formats like GFF3, GenBank, or FASTA for further analysis." + }, + "bgc_gecco_convertmode": { + "type": "string", + "default": "clusters", + "enum": ["clusters", "gbk"], + "description": "Specify conversion mode for GECCO convert.", + "help_text": "Either clusters or gbk folder output, depending on what is reformatted.\n\n> Modifies tool parameter(s):\n> - GECCO convert: `gecco convert <mode>`" + }, + "bgc_gecco_convertformat": { + "type": "string", + "default": "gff", + "enum": ["gff", "bigslice", "fna", "faa"], + "description": "Specify output format for GECCO convert.", + "help_text": "Choose output format for clusters mode: 'gff', or gbk mode: 'bigslice', 'fna', or 'faa'.\n\n> Modifies tool parameter(s):\n> - GECCO convert: `--format`" } }, "fa_icon": "fas fa-angle-double-right" diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 6bdc2881..75c34854 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -12,13 +12,14 @@ include { DEEPBGC_PIPELINE } from '../../modules/nf-core/d include { COMBGC } from '../../modules/local/combgc' include { TABIX_BGZIP as BGC_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/merge_taxonomy_combgc' +include { GECCO_CONVERT } from '../../modules/nf-core/gecco/convert/main' workflow BGC { take: fastas // tuple val(meta), path(PREPPED_INPUT.out.fna) - faas // tuple val(meta), path(.out.faa) - gbks // tuple val(meta), 
path(.out.gbk) - tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + faas // tuple val(meta), path(.out.faa) + gbks // tuple val(meta), path(.out.gbk) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -71,7 +72,8 @@ workflow BGC { if (!params.bgc_skip_deepbgc) { if (params.bgc_deepbgc_db) { - ch_deepbgc_database = Channel.fromPath(params.bgc_deepbgc_db, checkIfExists: true) + ch_deepbgc_database = Channel + .fromPath(params.bgc_deepbgc_db, checkIfExists: true) .first() } else { @@ -104,6 +106,17 @@ workflow BGC { ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix(ch_geccoresults_for_combgc) } + // GECCO CONVERT: consumes GECCO_RUN outputs, so it must only run when GECCO itself was not skipped + if (!params.bgc_skip_gecco && params.bgc_gecco_runconvert) { + ch_gecco_clusters_and_gbk = GECCO_RUN.out.clusters + .join(GECCO_RUN.out.gbk) + .map { meta, clusters_file, gbk_file -> + [meta, clusters_file, gbk_file] + } + + GECCO_CONVERT(ch_gecco_clusters_and_gbk, params.bgc_gecco_convertmode, params.bgc_gecco_convertformat) + ch_versions = ch_versions.mix(GECCO_CONVERT.out.versions) + } // HMMSEARCH if (params.bgc_run_hmmsearch) { if (params.bgc_hmmsearch_models) { @@ -162,7 +175,8 @@ workflow BGC { MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) ch_versions = ch_versions.mix(MERGE_TAXONOMY_COMBGC.out.versions) - ch_tabix_input = Channel.of(['id': 'combgc_complete_summary_taxonomy']) + ch_tabix_input = Channel + .of(['id': 'combgc_complete_summary_taxonomy']) .combine(MERGE_TAXONOMY_COMBGC.out.tsv) BGC_TABIX_BGZIP(ch_tabix_input) diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 6b7dac62..e77d6166 100644 --- a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -8,15 +8,15 @@ 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' 
-include { paramsSummaryMap } from 'plugin/nf-schema' -include { samplesheetToList } from 'plugin/nf-schema' -include { paramsHelp } from 'plugin/nf-schema' -include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' -include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' -include { imNotification } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' +include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' +include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' +include { imNotification } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -26,15 +26,15 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin workflow PIPELINE_INITIALISATION { take: - version // boolean: Display version and exit - validate_params // boolean: Boolean whether to validate parameters against the schema at runtime - monochrome_logs // boolean: Do not use coloured log outputs + version // boolean: Display version and exit + validate_params // boolean: Boolean whether to validate parameters against the schema at runtime + monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args - outdir // string: The output directory where the results will be saved - input // string: Path to input samplesheet - help // boolean: 
Display help message and exit - help_full // boolean: Show the full help message - show_hidden // boolean: Show hidden parameters in the help message + outdir // string: The output directory where the results will be saved + input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: @@ -72,7 +72,7 @@ workflow PIPELINE_INITIALISATION { """ command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " - UTILS_NFSCHEMA_PLUGIN ( + UTILS_NFSCHEMA_PLUGIN( workflow, validate_params, null, @@ -81,7 +81,7 @@ workflow PIPELINE_INITIALISATION { show_hidden, before_text, after_text, - command + command, ) // @@ -99,7 +99,8 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // - Channel.fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) + Channel + .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) .set { ch_samplesheet } emit: @@ -115,13 +116,13 @@ workflow PIPELINE_INITIALISATION { workflow PIPELINE_COMPLETION { take: - email // string: email address - email_on_fail // string: email address sent on pipeline failure + email // string: email address + email_on_fail // string: email address sent on pipeline failure plaintext_email // boolean: Send plain-text email instead of HTML - outdir // path: Path to output directory where results will be published + outdir // path: Path to output directory where results will be published monochrome_logs // boolean: Disable ANSI colour codes in log output - hook_url // string: hook URL for notifications - multiqc_report // string: Path to MultiQC report + hook_url // string: hook URL for notifications + multiqc_report // string: Path to MultiQC report main: summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") @@ 
-164,6 +165,14 @@ workflow PIPELINE_COMPLETION { // Check and validate pipeline parameters // def validateInputParameters() { + if (params.run_bgc_screening && !params.bgc_skip_gecco && params.bgc_gecco_runconvert) { // only relevant when GECCO convert will actually run + if (params.bgc_gecco_convertmode == 'gbk' && !(params.bgc_gecco_convertformat in ['bigslice', 'fna', 'faa'])) { // gbk mode: reject anything outside the formats named in the message + error("[nf-core/funcscan] ERROR: when specifying --bgc_gecco_convertmode 'gbk', --bgc_gecco_convertformat can only be set to 'bigslice', 'fna' or 'faa'. You specified --bgc_gecco_convertformat '${params.bgc_gecco_convertformat}'. Check input!") + } + if (params.bgc_gecco_convertmode == 'clusters' && params.bgc_gecco_convertformat != 'gff') { // clusters mode: only 'gff' is accepted + error("[nf-core/funcscan] ERROR: when specifying --bgc_gecco_convertmode 'clusters', --bgc_gecco_convertformat can only be set to 'gff'. You specified --bgc_gecco_convertformat '${params.bgc_gecco_convertformat}'. Check input!") + } + } } // diff --git a/tests/test_bgc_bakta.nf.test b/tests/test_bgc_bakta.nf.test index 10d8d2ba..a688070d 100644 --- a/tests/test_bgc_bakta.nf.test +++ b/tests/test_bgc_bakta.nf.test @@ -38,7 +38,9 @@ nextflow_pipeline { // GECCO { assert snapshot( path("$outputDir/bgc/gecco/sample_2/sample_2.genes.tsv"), // channel: genes - path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv") // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv"), // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.clusters.tsv"), // channel: clusters + file("$outputDir/bgc/gecco/sample_2/NODE_18_length_18230_cov_4.622228.region001.gbk").name, // from gecco convert ).match("gecco") } ) } diff --git a/tests/test_bgc_bakta.nf.test.snap b/tests/test_bgc_bakta.nf.test.snap index 5a0932ad..b847a229 100644 --- a/tests/test_bgc_bakta.nf.test.snap +++ b/tests/test_bgc_bakta.nf.test.snap @@ -13,23 +13,25 @@ }, "deepbgc_bgc_gbk": { "content": [ - "sample_2.bgc.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_2.bgc.gbk:md5,d7e7a8421ee13457487108f9d41aff54" ], "meta": { - "nf-test": 
"0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": "2024-07-24T10:32:18.378687548" + "timestamp": "2025-12-17T11:42:38.648657935" }, "gecco": { "content": [ - "sample_2.genes.tsv:md5,66e3724c7e7da102bf58acd564211e8b", - "sample_2.features.tsv:md5,2ef146213836ca80d3079776f17c7cb2" + "sample_2.genes.tsv:md5,4e45a9882d7b9d5510fefe9f34329a96", + "sample_2.features.tsv:md5,b64ad7a5cb4af9971b1bd4b379ff2486", + "sample_2.clusters.tsv:md5,61a89e5684004b6c4e7b943c373e8d1e", + "NODE_18_length_18230_cov_4.622228.region001.gbk" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": "2024-07-24T10:32:18.404694725" + "timestamp": "2025-12-17T13:30:31.319332788" } } \ No newline at end of file diff --git a/tests/test_bgc_prokka.nf.test b/tests/test_bgc_prokka.nf.test index afb9c8f6..cf6392aa 100644 --- a/tests/test_bgc_prokka.nf.test +++ b/tests/test_bgc_prokka.nf.test @@ -38,7 +38,9 @@ nextflow_pipeline { // GECCO { assert snapshot( path("$outputDir/bgc/gecco/sample_2/sample_2.genes.tsv"), // channel: genes - path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv") // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv"), // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.clusters.tsv"), // channel: clusters + path("$outputDir/bgc/gecco/sample_2/PROKKA_2_cluster_1.fna"), // from gecco convert ).match("gecco") } ) } diff --git a/tests/test_bgc_prokka.nf.test.snap b/tests/test_bgc_prokka.nf.test.snap index 69670287..3a5e36f5 100644 --- a/tests/test_bgc_prokka.nf.test.snap +++ b/tests/test_bgc_prokka.nf.test.snap @@ -13,23 +13,25 @@ }, "deepbgc_bgc_gbk": { "content": [ - "sample_2.bgc.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_2.bgc.gbk:md5,03712704561ca22c5e29f45a50f4a18d" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": 
"2024-07-24T10:39:33.920624113" + "timestamp": "2025-12-17T11:25:30.345420425" }, "gecco": { "content": [ - "sample_2.genes.tsv:md5,050b82ca462430ecc0635acb2e297531", - "sample_2.features.tsv:md5,79354868ee3de6fdc419195b8fa8edb6" + "sample_2.genes.tsv:md5,b9a64f054cea791ebbe0738e33431b2c", + "sample_2.features.tsv:md5,9900311acf9e6396fe4106c03ab628ba", + "sample_2.clusters.tsv:md5,78c908f8db4194ce989d4dddf16eea18", + "PROKKA_2_cluster_1.fna:md5,7647cfa207914f33f2abd32f2f7639d1" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": "2024-07-24T10:39:33.944935473" + "timestamp": "2025-12-17T11:33:10.649330826" } } \ No newline at end of file diff --git a/tests/test_bgc_pyrodigal.nf.test b/tests/test_bgc_pyrodigal.nf.test index 9ccf8024..7ad939d9 100644 --- a/tests/test_bgc_pyrodigal.nf.test +++ b/tests/test_bgc_pyrodigal.nf.test @@ -38,7 +38,9 @@ nextflow_pipeline { // GECCO { assert snapshot( path("$outputDir/bgc/gecco/sample_2/sample_2.genes.tsv"), // channel: genes - path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv") // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.features.tsv"), // channel: features + path("$outputDir/bgc/gecco/sample_2/sample_2.clusters.tsv"), // channel: clusters + path("$outputDir/bgc/gecco/sample_2/sample_2.clusters.gff"), // from gecco convert ).match("gecco") } ) } diff --git a/tests/test_bgc_pyrodigal.nf.test.snap b/tests/test_bgc_pyrodigal.nf.test.snap index 80348839..60d02b2d 100644 --- a/tests/test_bgc_pyrodigal.nf.test.snap +++ b/tests/test_bgc_pyrodigal.nf.test.snap @@ -13,23 +13,25 @@ }, "deepbgc_bgc_gbk": { "content": [ - "sample_2.bgc.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_2.bgc.gbk:md5,a22271277ced910adede93fe202a7008" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": "2024-07-24T10:45:44.435766452" + "timestamp": "2025-12-17T11:16:13.022611618" }, 
"gecco": { "content": [ - "sample_2.genes.tsv:md5,66e3724c7e7da102bf58acd564211e8b", - "sample_2.features.tsv:md5,2ef146213836ca80d3079776f17c7cb2" + "sample_2.genes.tsv:md5,4e45a9882d7b9d5510fefe9f34329a96", + "sample_2.features.tsv:md5,b64ad7a5cb4af9971b1bd4b379ff2486", + "sample_2.clusters.tsv:md5,61a89e5684004b6c4e7b943c373e8d1e", + "sample_2.clusters.gff:md5,3caa3574e1be1ac5a0e1d80f01bacddd" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "25.10.0" }, - "timestamp": "2024-07-24T10:45:25.732866237" + "timestamp": "2025-12-17T11:20:47.603380605" } } \ No newline at end of file