diff --git a/bio/mmseqs2/db/environment.linux-64.pin.txt b/bio/mmseqs2/db/environment.linux-64.pin.txt new file mode 100644 index 00000000000..0e325911d2a --- /dev/null +++ b/bio/mmseqs2/db/environment.linux-64.pin.txt @@ -0,0 +1,32 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +# created-by: conda 25.3.0 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2025.1.31-hbcca054_0.conda#19f3a56f68d2fd06c516076bff482c52 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h767d61c_2.conda#06d02030237f4d5b3d9a7e7d348fe3c6 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h767d61c_2.conda#ef504d1acbd74b7cc6849ef8af47dd03 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.4-hb9d3cd8_0.conda#e2775acf57efd5af15b8e3d1d74d72d3 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_2.conda#a2222a6ada71fb478682efe483ce0f92 +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.23.1-h5888daf_0.conda#a09ce5decdef385bcce78c32809fa794 +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.6.4-hb9d3cd8_0.conda#42d5b6a0f30d3c10cd88cb8584fda1cb +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-14.2.0-h8f9b012_2.conda#a78c856b6dc6bf4ea8daeb9beaaa3fb0 +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.4.1-h7b32b05_0.conda#41adf927e746dc75ecf0ef841c454e48 
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.23.1-h8e693c7_0.conda#988f4937281a66ca19d1adb3b5e3f859 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.1-hee588c1_2.conda#962d6ac93c30b1dfc54c9cccafd1003e +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hf672d98_0.conda#be2de152d8073ef1c01b7728475f2fe7 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-14.2.0-h4852527_2.conda#c75da67f045c2627f59e6fcb5f4e3a9b +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.7-h0d44e9d_0.conda#3ac6daa5c1210293a6deaec0c345b230 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 +https://conda.anaconda.org/conda-forge/linux-64/aria2-1.37.0-hbc8128a_2.conda#03b8874fa70df577f3eee53085d025cf +https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1 +https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81 +https://conda.anaconda.org/conda-forge/linux-64/gawk-5.3.1-hcd3d067_0.conda#91d4414ab699180b2b0b10b8112c5a2f +https://conda.anaconda.org/bioconda/linux-64/mmseqs2-17.b804f-hd6d6fdc_1.tar.bz2#561fb589d37cff61ec6b887fc2976498 diff --git a/bio/mmseqs2/db/environment.yaml b/bio/mmseqs2/db/environment.yaml new file mode 100644 index 00000000000..69bc613e23d --- /dev/null +++ b/bio/mmseqs2/db/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - mmseqs2 =17.b804f diff --git a/bio/mmseqs2/db/meta.yaml b/bio/mmseqs2/db/meta.yaml new file mode 100644 index 00000000000..b78f3485aef --- /dev/null +++ b/bio/mmseqs2/db/meta.yaml @@ -0,0 +1,13 @@ +name: MMseqs2 db +url: 
# Test rules for the bio/mmseqs2/db wrapper.
# Both rules declare every file of the MMseqs2 database as an output via
# multiext(); all of them share one path prefix ("out/<module>/{sample}").


# Download a pre-built reference database (no input files).
rule mmseqs2_databases:
    output:
        # "" is the bare DB file itself; the remaining entries are its
        # companion files (index, headers "_h*", taxonomy mapping, ...).
        db=multiext(
            "out/databases/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            ".version",
            "_h",
            "_h.dbtype",
            "_h.index",
            "_mapping",
            "_taxonomy",
        ),
    log:
        "logs/databases/{sample}.log",
    params:
        # Sub-command followed by the name of the database to fetch.
        module="databases SILVA",
        extra="-v 3",
    threads: 1
    wrapper:
        "master/bio/mmseqs2/db"


# Build a sequence database from a FASTA file.
rule mmseqs2_createdb:
    input:
        fas="seqs/{sample}.fasta",
    output:
        # Same layout as above, minus the files only produced for downloaded
        # databases (".version", "_mapping", "_taxonomy").
        db=multiext(
            "out/createdb/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            "_h",
            "_h.dbtype",
            "_h.index",
        ),
    log:
        "logs/createdb/{sample}.log",
    params:
        module="createdb",
        extra="-v 3",
    threads: 1
    wrapper:
        "master/bio/mmseqs2/db"
b/bio/mmseqs2/db/test/expected/createdb/a.lookup new file mode 100644 index 00000000000..13797744afd --- /dev/null +++ b/bio/mmseqs2/db/test/expected/createdb/a.lookup @@ -0,0 +1,4 @@ +0 1 0 +1 2 0 +2 1 0 +3 3 0 diff --git a/bio/mmseqs2/db/test/expected/createdb/a.source b/bio/mmseqs2/db/test/expected/createdb/a.source new file mode 100644 index 00000000000..b60b20168f3 --- /dev/null +++ b/bio/mmseqs2/db/test/expected/createdb/a.source @@ -0,0 +1 @@ +0 a.fasta diff --git a/bio/mmseqs2/db/test/expected/createdb/a_h b/bio/mmseqs2/db/test/expected/createdb/a_h new file mode 100644 index 00000000000..c7228ed84cf Binary files /dev/null and b/bio/mmseqs2/db/test/expected/createdb/a_h differ diff --git a/bio/mmseqs2/db/test/expected/createdb/a_h.dbtype b/bio/mmseqs2/db/test/expected/createdb/a_h.dbtype new file mode 100644 index 00000000000..d540dfdb4f4 Binary files /dev/null and b/bio/mmseqs2/db/test/expected/createdb/a_h.dbtype differ diff --git a/bio/mmseqs2/db/test/expected/createdb/a_h.index b/bio/mmseqs2/db/test/expected/createdb/a_h.index new file mode 100644 index 00000000000..d87f06c27fc --- /dev/null +++ b/bio/mmseqs2/db/test/expected/createdb/a_h.index @@ -0,0 +1,4 @@ +0 0 3 +1 3 3 +2 6 3 +3 9 3 diff --git a/bio/mmseqs2/db/test/expected/databases/a.dbtype b/bio/mmseqs2/db/test/expected/databases/a.dbtype new file mode 100644 index 00000000000..f66c9cf4c96 Binary files /dev/null and b/bio/mmseqs2/db/test/expected/databases/a.dbtype differ diff --git a/bio/mmseqs2/db/test/expected/databases/a.source b/bio/mmseqs2/db/test/expected/databases/a.source new file mode 100644 index 00000000000..eeb0885cb0d --- /dev/null +++ b/bio/mmseqs2/db/test/expected/databases/a.source @@ -0,0 +1 @@ +0 silva.fasta.gz diff --git a/bio/mmseqs2/db/test/expected/databases/a.version b/bio/mmseqs2/db/test/expected/databases/a.version new file mode 100644 index 00000000000..d8af945b707 --- /dev/null +++ b/bio/mmseqs2/db/test/expected/databases/a.version @@ -0,0 +1,164 @@ +README for SILVA 
138 export files + +RAST FILES: +=========== + +Specific export files for the MG-RAST server (Argonne National Lab) + +TAXONOMY FILES: +=============== + +tax_slv_[ls]su_VERSION.txt +------------------------- +These files contain taxonomic rank designations for all taxonomic paths +used in the SILVA taxonomies. Additionally, a unique numeric identifier is +assigned to each taxon (path). These identifiers will be mostly stable in +upcoming SILVA releases. + +IDs used in the SSU and LSU files do not match. + +Field description: +path: + The full taxonomic path including the name of the group itself. + Segments are separated with ";" +taxid: + numerical identifier +rank: + The rank designation. +remark: + Can be empty ('') or a or w. + a: Marks taxa of environmental origin. That is, taxa containing no + sequence coming from a cultivated organism. + w: Marks taxa scheduled for revision in the next release. +release: + The SILVA release version + +tax_slv_[ls]su_VERSION.diff +--------------------------- +Difference between the current version of the SILVA taxonomy and the previous +release. + +Field description: +status: + the status of the taxono (+ added, - removed) +taxid: + numerical identifier +path: + the full path of the added/removed taxon + +taxmap_TAXNAME_[ls]su_VERSION.txt +---------------------------- +mapping of each entry in the SILVA database to a taxonomic path. Different +rRNA regions of the same INSDC entry (genome) may be assigned to multiple +paths (contaminations or micro diversity among the rRNA sequences). + +The taxmap_embl* files contain the taxonomic path assigned by the original +submitter of the sequence. The last column in this file contains the numerical +ID of the NCBI taxonomy project assigned to this entry. This ID is extracted +from the source feature of an EMBL entry during the import of the sequence. 
+ +Field description: +pacc: + INSDC primary accession +start + start position of the rRNA region within the INSDC sequence entry +stop: + stop position of the rRNA region within the INSDC sequence entry +path: + taxonomic path assigned to the region +name + the organism name assigned to the sequence +taxid: + optional field containing the numerical ID of the taxonomic path + + +tax_TAXNAME_[ls]su_VERSION.{map,tre} +---------------------------- +SILVA taxonomy in the Newick tree format and the corespoding numerical id to +taxonomic path mapping file (MEGAN compatible). + +tax_TAXNAME_[ls]su_VERSION.acc_taxid +---------------------------- +Mapping of 'SILVA' sequence IDs (..) +used in FASTA files to the numeric SILVA taxid (MEGAN compatible). + + + + +SEQUENCE FILES: +=============== + +*_tax_silva.fasta.gz +----------------- +Multi FASTA files of the SSU/LSU databases including the SILVA taxonomy for +Bacteria, Archaea and Eukaryotes in the header. + +REMARK: The sequences in the files are NOT truncated to the effective LSU or +SSU genes. They contain the full entries as they have been deposited in the +public repositories (ENA/GenBank/DDBJ). + +Fasta header: +>accession_number.start_position.stop_position taxonomic path organism name + +*_tax_silva_full_align_trunc.fasta.gz +----------------------- +Multi FASTA files of the SSU/LSU databases including the SILVA taxonomy for +Bacteria, Archaea and Eukaryotes in the header (including the FULL alignment). + +REMARK: Sequences in these files haven been truncated. This means that all +nucleotides that have not been aligned were removed from the sequence. + +*_tax_silva_trunc.fasta.gz +----------------------- +Multi FASTA files of the SSU/LSU database including the SILVA taxonomy for +Bacteria, Archaea and Eukaryotes in the header. + +REMARK: Sequences in these files haven been truncated. This means that all +nucleotides that have not been aligned were removed from the sequence. 
"""Snakemake wrapper for MMseqs2 database modules (``databases``, ``createdb``).

Reads from the injected ``snakemake`` object:

* ``params.module`` -- the mmseqs sub-command, optionally followed by
  positional arguments (e.g. ``"databases SILVA"``).
* ``params.extra``  -- optional additional command-line arguments.
* ``input`` / ``output`` -- file lists; the common path prefix of each list
  is what gets passed to ``mmseqs``.
* ``threads`` / ``log`` -- forwarded to the command line / shell redirection.
"""

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2024, Filipe G. Vieira"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell

# Sub-commands that accept ``--threads``.
THREADED_MODULES = ("databases",)
# Sub-commands that do not take a temporary-folder argument.
NO_TMPDIR_MODULES = ("createdb",)

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# ``module`` may carry positional arguments after the sub-command itself
# ("databases SILVA"), so dispatch on the first token only.  Comparing the
# whole string would silently skip the per-module handling below.
module_tokens = snakemake.params.module.split()
subcommand = module_tokens[0] if module_tokens else ""

# Collapse the input file list to its common prefix (empty string when the
# rule has no input, as for ``databases``).
in_prefix = snakemake.input
if isinstance(in_prefix, list):
    in_prefix = os.path.commonprefix(in_prefix)

# TODO: arbitrary output file names
# Collapse the output file list to the DB prefix.  The trailing "_" is
# stripped for the case where only "<prefix>_h*"-style files are listed.
out = snakemake.output
if isinstance(out, list):
    out = os.path.commonprefix(out).rstrip("_")

if subcommand in THREADED_MODULES:
    extra = f"--threads {snakemake.threads} {extra}"

with tempfile.TemporaryDirectory() as tmpdir:
    # Keep ``tmpdir`` bound to the real directory and use a separate name for
    # the command-line argument, which is left empty for modules without one.
    tmp_arg = "" if subcommand in NO_TMPDIR_MODULES else tmpdir

    shell(
        "mmseqs {snakemake.params.module} {in_prefix} {out} {tmp_arg} {extra} {log}"
    )
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_5.conda#264fbfba7fb20acf3b29cde153e345ce +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda#51a19bba1b8ebfb60df25cde030b7ebc +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda#4211416ecba1866fab0c6470986c22d6 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_5.conda#069afdf8ea72504e48d23ae1171d951c +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda#915f5995e94f60e9a4826e0b0920ee88 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_5.conda#4e02a49aaa9d5190cb630fa43528fbe6 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.2-he9a06e4_0.conda#80c07c68d2f6870250959dcc95b209d1 +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.3-h26f9b46_1.conda#4fc6c4c88da64c0219c0c6c0408cedd4 +https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.25.1-h3f43e3d_1.conda#3b0d184bc9404516d418d4509e418bdc +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.25.1-h3f43e3d_1.conda#2f4de899028319b27eb7a4023be5dfd2 
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda#0b367fad34931cb79e0d6b7e5c06bb1c +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_5.conda#8bba50c7f4679f08c861b597ad2bda6b +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h2cb61b6_1.conda#42a8e4b54e322b4cd1dbfb30a8a7ce9e +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 +https://conda.anaconda.org/conda-forge/linux-64/aria2-1.37.0-hbc8128a_2.conda#03b8874fa70df577f3eee53085d025cf +https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1 +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.7-h2b335a9_100_cp313.conda#724dcf9960e933838247971da07fe5cf +https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81 +https://conda.anaconda.org/conda-forge/noarch/pip-25.2-pyh145f28c_0.conda#e7ab34d5a93e0819b62563c78635d937 +https://conda.anaconda.org/bioconda/noarch/snakemake-wrapper-utils-0.8.0-pyhdfd78af_0.conda#1650e521333852f45468d97b1b2fdcce +https://conda.anaconda.org/conda-forge/linux-64/gawk-5.3.1-hcd3d067_0.conda#91d4414ab699180b2b0b10b8112c5a2f +https://conda.anaconda.org/bioconda/linux-64/mmseqs2-17.b804f-hd6d6fdc_1.tar.bz2#561fb589d37cff61ec6b887fc2976498 diff --git a/bio/mmseqs2/workflows/environment.yaml b/bio/mmseqs2/workflows/environment.yaml new file mode 100644 index 00000000000..9e5c0511020 --- /dev/null +++ b/bio/mmseqs2/workflows/environment.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults 
# Test rules for the bio/mmseqs2/workflows wrapper.
# Each rule selects one MMseqs2 "easy-*" workflow through params.module.


# Search two query FASTA files against a single target FASTA file.
rule mmseqs2_workflow_search:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
        target="seqs/nucl.{sample}.fas.gz",
    output:
        aln="out/search/{sample}.tab",
    log:
        "logs/search/{sample}.log",
    params:
        module="easy-search",
        extra="--search-type 3",
    threads: 2
    wrapper:
        "master/bio/mmseqs2/workflows"


# Cluster the sequences of both query files; no target is given.
rule mmseqs2_workflow_cluster:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
    output:
        # Three result files sharing the prefix "out/cluster/a_b".
        cluster=multiext(
            "out/cluster/a_b", "_all_seqs.fasta", "_cluster.tsv", "_rep_seq.fasta"
        ),
    log:
        "logs/cluster/a_b.log",
    params:
        module="easy-cluster",
        extra="--max-seqs 5",
    threads: 2
    wrapper:
        "master/bio/mmseqs2/workflows"


# Same inputs and output layout as the cluster rule, using easy-linclust.
rule mmseqs2_workflow_linclust:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
    output:
        cluster=multiext(
            "out/linclust/a_b", "_all_seqs.fasta", "_cluster.tsv", "_rep_seq.fasta"
        ),
    log:
        "logs/linclust/a_b.log",
    params:
        module="easy-linclust",
        extra="--max-seq-len 1000",
    threads: 2
    wrapper:
        "master/bio/mmseqs2/workflows"


# Taxonomic assignment of both query files against a pre-built DB under db/.
rule mmseqs2_workflow_taxonomy:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
        # The target is an existing MMseqs2 database, declared file by file.
        # NOTE(review): the test fixtures also ship db/a_mapping and
        # db/a_taxonomy, which are not declared here -- confirm whether they
        # should be listed as inputs too.
        target=multiext(
            "db/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            "_h",
            "_h.dbtype",
            "_h.index",
        ),
    output:
        report=multiext(
            "out/taxonomy/{sample}",
            "_lca.tsv",
            "_report",
            "_tophit_aln",
            "_tophit_report",
        ),
    log:
        "logs/taxonomy/{sample}.log",
    params:
        module="easy-taxonomy",
        extra="--max-seq-len 1000 --search-type 3",
    threads: 2
    wrapper:
        "master/bio/mmseqs2/workflows"


# Reciprocal best hit between one query file and one target file.
rule mmseqs2_workflow_rbh:
    input:
        query="seqs/nucl.b.fas.gz",
        target="seqs/nucl.{sample}.fas.gz",
    output:
        aln="out/rbh/{sample}.tab",
    log:
        "logs/rbh/{sample}.log",
    params:
        module="easy-rbh",
        extra="--search-type 3",
    threads: 2
    wrapper:
        "master/bio/mmseqs2/workflows"
b/bio/mmseqs2/workflows/test/db/a_h new file mode 100644 index 00000000000..c7228ed84cf Binary files /dev/null and b/bio/mmseqs2/workflows/test/db/a_h differ diff --git a/bio/mmseqs2/workflows/test/db/a_h.dbtype b/bio/mmseqs2/workflows/test/db/a_h.dbtype new file mode 100644 index 00000000000..d540dfdb4f4 Binary files /dev/null and b/bio/mmseqs2/workflows/test/db/a_h.dbtype differ diff --git a/bio/mmseqs2/workflows/test/db/a_h.index b/bio/mmseqs2/workflows/test/db/a_h.index new file mode 100644 index 00000000000..d87f06c27fc --- /dev/null +++ b/bio/mmseqs2/workflows/test/db/a_h.index @@ -0,0 +1,4 @@ +0 0 3 +1 3 3 +2 6 3 +3 9 3 diff --git a/bio/mmseqs2/workflows/test/db/a_mapping b/bio/mmseqs2/workflows/test/db/a_mapping new file mode 100644 index 00000000000..f0ac9407d76 --- /dev/null +++ b/bio/mmseqs2/workflows/test/db/a_mapping @@ -0,0 +1,5 @@ +PV111755.1 9606 +PQ877082.1 9606 +PQ877081.1 9606 +PQ877080.1 9606 +PQ809255.1 9606 diff --git a/bio/mmseqs2/workflows/test/db/a_taxonomy b/bio/mmseqs2/workflows/test/db/a_taxonomy new file mode 100644 index 00000000000..bf6376e877d Binary files /dev/null and b/bio/mmseqs2/workflows/test/db/a_taxonomy differ diff --git a/bio/mmseqs2/workflows/test/expected/cluster/a_b_all_seqs.fasta b/bio/mmseqs2/workflows/test/expected/cluster/a_b_all_seqs.fasta new file mode 100644 index 00000000000..5c65727cb16 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/cluster/a_b_all_seqs.fasta @@ -0,0 +1,22 @@ +>PV111755.1 +>PV111755.1 Homo sapiens isolate T3.6 cytochrome b (cytb) gene, partial cds; mitochondrial +AACATTTCAGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGACGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGA +>PQ809250.1 +>PQ809250.1 Homo sapiens isolate 06A_E-RJP-293 control region, 
partial sequence; mitochondrial +TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCCCCCTCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCCCCCATCAACTGCAACTCCAAAGCCCCCCCTCCCCCACTAGGATACCAACAAACCTACCTACCCTTAACAGTACATAGCACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCTTTCTCGTCCCCATGGATGACCCCCCTCAAAAAGGGGTCCCTTGACCACCATCCTCCGGGAAATCAATATCCCGCACAAAAGGGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGGGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAATAAAACATCACGATGGATCACAGGTCTATCACCTATTAACCACTCACGGAGCTCTCCATGCATTGTATTTCGTCTGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGAGCGGAGCACCTATGTCGCAGATCTGTCTTGATTCCTACTCATCCTATTATTCATCGCACTACGTCAATATTACAGCGAGCATATCTACTAAGCGATAATTAATTAATGCTTGTAGACTATATACAATTGAATGCTGCCAGCGCTTCACAGACTCTACAAATTCACAACCCCCTCCCCGTCGTCAGACTACCA +>PQ877081.1 Homo sapiens voucher InBOL_LP D-loop, partial sequence; mitochondrial +TTCTTTCATGGGGAAGCAGATTTGGGTACGACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATCTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACCAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCG +>PQ877082.1 Homo sapiens voucher InBOL_BM D-loop, partial sequence; 
mitochondrial +TGAATATTGCACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAACCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGCGTATTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAA +>PQ877080.1 Homo sapiens voucher InBOL_AP D-loop, partial sequence; mitochondrial +TTCTTTCATGGGGAAGCGGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGCACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGGGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTGAAGCGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCA +>PQ809255.1 Homo sapiens isolate 06E_E-RJP-298 control region, partial sequence; mitochondrial 
+ATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCACGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACATACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATC +>PQ809253.1 Homo sapiens isolate 06C_E-RJP-296 control region, partial sequence; mitochondrial +AACTATTCTCTGTTCTTTCTGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAATGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATATTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATCGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACAT +>PQ809252.1 Homo sapiens isolate 
06B_E-RJP-295 control region, partial sequence; mitochondrial +ATTCTCTGTTCTTTCATGGGGAAGCAGATTTGAGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCCATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTC +>PQ809251.1 Homo sapiens isolate 09H_E-RJP-294 control region, partial sequence; mitochondrial +CTGTTCTTTCTGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGCACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGCACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGCGTGTTAATTAATCAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAAT +>PQ809249.1 Homo sapiens isolate 05H_E-RJP-292 control region, partial sequence; mitochondrial 
+TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATATCAACAAACCTACCTACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATTCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATATCTACCAAAGCGTATTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCT diff --git a/bio/mmseqs2/workflows/test/expected/cluster/a_b_cluster.tsv b/bio/mmseqs2/workflows/test/expected/cluster/a_b_cluster.tsv new file mode 100644 index 00000000000..4cd8d6f366b --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/cluster/a_b_cluster.tsv @@ -0,0 +1,10 @@ +PV111755.1 PV111755.1 +PQ809250.1 PQ809250.1 +PQ809250.1 PQ877081.1 +PQ809250.1 PQ877082.1 +PQ809250.1 PQ877080.1 +PQ809250.1 PQ809255.1 +PQ809250.1 PQ809253.1 +PQ809250.1 PQ809252.1 +PQ809250.1 PQ809251.1 +PQ809250.1 PQ809249.1 diff --git a/bio/mmseqs2/workflows/test/expected/cluster/a_b_rep_seq.fasta b/bio/mmseqs2/workflows/test/expected/cluster/a_b_rep_seq.fasta new file mode 100644 index 00000000000..0b49a3870d3 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/cluster/a_b_rep_seq.fasta @@ -0,0 +1,4 @@ +>PV111755.1 Homo sapiens isolate T3.6 cytochrome b (cytb) gene, partial cds; mitochondrial 
+AACATTTCAGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGACGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGA +>PQ809250.1 Homo sapiens isolate 06A_E-RJP-293 control region, partial sequence; mitochondrial +TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCCCCCTCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCCCCCATCAACTGCAACTCCAAAGCCCCCCCTCCCCCACTAGGATACCAACAAACCTACCTACCCTTAACAGTACATAGCACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCTTTCTCGTCCCCATGGATGACCCCCCTCAAAAAGGGGTCCCTTGACCACCATCCTCCGGGAAATCAATATCCCGCACAAAAGGGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGGGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAATAAAACATCACGATGGATCACAGGTCTATCACCTATTAACCACTCACGGAGCTCTCCATGCATTGTATTTCGTCTGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGAGCGGAGCACCTATGTCGCAGATCTGTCTTGATTCCTACTCATCCTATTATTCATCGCACTACGTCAATATTACAGCGAGCATATCTACTAAGCGATAATTAATTAATGCTTGTAGACTATATACAATTGAATGCTGCCAGCGCTTCACAGACTCTACAAATTCACAACCCCCTCCCCGTCGTCAGACTACCA diff --git a/bio/mmseqs2/workflows/test/expected/linclust/a_b_all_seqs.fasta b/bio/mmseqs2/workflows/test/expected/linclust/a_b_all_seqs.fasta new file mode 100644 index 00000000000..66f16641a77 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/linclust/a_b_all_seqs.fasta @@ -0,0 +1,22 @@ +>PV111755.1 +>PV111755.1 Homo sapiens isolate T3.6 cytochrome b (cytb) gene, partial cds; mitochondrial 
+AACATTTCAGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGACGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGA +>PQ809250.1 +>PQ809250.1 Homo sapiens isolate 06A_E-RJP-293 control region, partial sequence; mitochondrial +TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCCCCCTCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCCCCCATCAACTGCAACTCCAAAGCCCCCCCTCCCCCACTAGGATACCAACAAACCTACCTACCCTTAACAGTACATAGCACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCTTTCTCGTCCCCATGGATGACCCCCCTCAAAAAGGGGTCCCTTGACCACCATCCTCCGGGAAATCAATATCCCGCACAAAAGGGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGGGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAATAAAACATCACGATGGATCACAGGTCTATCACCTATTAACCACTCACGGAGCTCTCCATGCATTGTATTTCGTCTGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGAGCGGAGCACCTATGTCGCAGATCTGTCTTGATTCCTACTCATCCTATTATTCATCGCACTACGTCAATATTACAGCGAGCATATCTACTAAGCGATAATTAATTAATGCTTGTAGACTATATACAATTGAATGCTGCCAGCGCTTCACAGACTCTACAAATTCACAACCCCCTCCCCGTCGTCAGACTACCA +>PQ809255.1 Homo sapiens isolate 06E_E-RJP-298 control region, partial sequence; mitochondrial 
+ATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCACGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACATACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATC +>PQ877082.1 Homo sapiens voucher InBOL_BM D-loop, partial sequence; mitochondrial +TGAATATTGCACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAACCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGCGTATTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAA +>PQ877081.1 Homo sapiens voucher InBOL_LP D-loop, partial sequence; mitochondrial 
+TTCTTTCATGGGGAAGCAGATTTGGGTACGACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATCTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACCAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCG +>PQ877080.1 Homo sapiens voucher InBOL_AP D-loop, partial sequence; mitochondrial +TTCTTTCATGGGGAAGCGGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGCACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGGGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTGAAGCGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCA +>PQ809253.1 Homo 
sapiens isolate 06C_E-RJP-296 control region, partial sequence; mitochondrial +AACTATTCTCTGTTCTTTCTGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAATGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATATTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATCGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACAT +>PQ809252.1 Homo sapiens isolate 06B_E-RJP-295 control region, partial sequence; mitochondrial +ATTCTCTGTTCTTTCATGGGGAAGCAGATTTGAGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCCATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTC +>PQ809251.1 Homo 
sapiens isolate 09H_E-RJP-294 control region, partial sequence; mitochondrial +CTGTTCTTTCTGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGCACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAGCACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGCGTGTTAATTAATCAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAAT +>PQ809249.1 Homo sapiens isolate 05H_E-RJP-292 control region, partial sequence; mitochondrial +TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACAGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATATCAACAAACCTACCTACCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATTCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATATCTACCAAAGCGTATTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCT diff --git 
a/bio/mmseqs2/workflows/test/expected/linclust/a_b_cluster.tsv b/bio/mmseqs2/workflows/test/expected/linclust/a_b_cluster.tsv new file mode 100644 index 00000000000..83fe5601e0c --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/linclust/a_b_cluster.tsv @@ -0,0 +1,10 @@ +PV111755.1 PV111755.1 +PQ809250.1 PQ809250.1 +PQ809250.1 PQ809255.1 +PQ809250.1 PQ877082.1 +PQ809250.1 PQ877081.1 +PQ809250.1 PQ877080.1 +PQ809250.1 PQ809253.1 +PQ809250.1 PQ809252.1 +PQ809250.1 PQ809251.1 +PQ809250.1 PQ809249.1 diff --git a/bio/mmseqs2/workflows/test/expected/linclust/a_b_rep_seq.fasta b/bio/mmseqs2/workflows/test/expected/linclust/a_b_rep_seq.fasta new file mode 100644 index 00000000000..0b49a3870d3 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/linclust/a_b_rep_seq.fasta @@ -0,0 +1,4 @@ +>PV111755.1 Homo sapiens isolate T3.6 cytochrome b (cytb) gene, partial cds; mitochondrial +AACATTTCAGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGACGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGA +>PQ809250.1 Homo sapiens isolate 06A_E-RJP-293 control region, partial sequence; mitochondrial 
+TTCTCTGTTCTTTCATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCAATCCACATCAAAACCCCCCCCCTCATGCTTACAAGCAAGTACAGCAATCAACCTTCAACTATCCCCCATCAACTGCAACTCCAAAGCCCCCCCTCCCCCACTAGGATACCAACAAACCTACCTACCCTTAACAGTACATAGCACATAAAGCCATTCACCGTACATAGCACATTACAGTCAAATCCTTTCTCGTCCCCATGGATGACCCCCCTCAAAAAGGGGTCCCTTGACCACCATCCTCCGGGAAATCAATATCCCGCACAAAAGGGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGGGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCCCTTAATAAAACATCACGATGGATCACAGGTCTATCACCTATTAACCACTCACGGAGCTCTCCATGCATTGTATTTCGTCTGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGAGCGGAGCACCTATGTCGCAGATCTGTCTTGATTCCTACTCATCCTATTATTCATCGCACTACGTCAATATTACAGCGAGCATATCTACTAAGCGATAATTAATTAATGCTTGTAGACTATATACAATTGAATGCTGCCAGCGCTTCACAGACTCTACAAATTCACAACCCCCTCCCCGTCGTCAGACTACCA diff --git a/bio/mmseqs2/workflows/test/expected/rbh/a.tab b/bio/mmseqs2/workflows/test/expected/rbh/a.tab new file mode 100644 index 00000000000..22b0461e7c1 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/rbh/a.tab @@ -0,0 +1 @@ +PQ809249.1 PQ809255.1 0.989 959 11 0 1 959 2 960 0.000E+00 1675 diff --git a/bio/mmseqs2/workflows/test/expected/search/a.tab b/bio/mmseqs2/workflows/test/expected/search/a.tab new file mode 100644 index 00000000000..ca9173b8d96 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/search/a.tab @@ -0,0 +1,37 @@ +PV111755.1 PV111755.1 1.000 348 0 0 1 348 1 348 7.626E-195 625 +PQ877082.1 PQ877082.1 1.000 851 0 0 1 851 1 851 0.000E+00 1526 +PQ877082.1 PQ877081.1 0.991 851 8 0 1 851 94 944 0.000E+00 1495 +PQ877082.1 PQ809255.1 0.990 852 8 0 1 851 102 953 0.000E+00 1489 +PQ877082.1 PQ877080.1 0.990 842 8 0 1 842 94 935 0.000E+00 1474 +PQ877081.1 PQ877081.1 1.000 959 0 0 1 959 1 959 0.000E+00 1720 +PQ877081.1 PQ809255.1 0.992 952 8 0 1 951 9 960 0.000E+00 1672 +PQ877081.1 PQ877080.1 0.989 935 10 0 1 935 1 935 0.000E+00 1632 +PQ877081.1 PQ877082.1 0.991 851 8 0 94 944 1 851 0.000E+00 1495 +PQ877080.1 
PQ877080.1 1.000 935 0 0 1 935 1 935 0.000E+00 1677 +PQ877080.1 PQ809255.1 0.990 936 9 0 1 935 9 944 0.000E+00 1635 +PQ877080.1 PQ877081.1 0.989 935 10 0 1 935 1 935 0.000E+00 1632 +PQ877080.1 PQ877082.1 0.990 842 8 0 94 935 1 842 0.000E+00 1474 +PQ809255.1 PQ809255.1 1.000 960 0 0 1 960 1 960 0.000E+00 1721 +PQ809255.1 PQ877081.1 0.992 952 8 0 9 960 1 951 0.000E+00 1672 +PQ809255.1 PQ877080.1 0.990 936 9 0 9 944 1 935 0.000E+00 1635 +PQ809255.1 PQ877082.1 0.990 852 8 0 102 953 1 851 0.000E+00 1489 +PQ809253.1 PQ809255.1 0.992 898 7 0 5 900 1 898 0.000E+00 1572 +PQ809253.1 PQ877081.1 0.991 889 8 0 13 900 1 889 0.000E+00 1555 +PQ809253.1 PQ877080.1 0.988 889 11 0 13 900 1 889 0.000E+00 1546 +PQ809253.1 PQ877082.1 0.989 796 9 0 105 900 1 796 0.000E+00 1392 +PQ809252.1 PQ809255.1 0.993 901 6 0 1 900 1 901 0.000E+00 1585 +PQ809252.1 PQ877081.1 0.992 892 7 0 9 900 1 892 0.000E+00 1568 +PQ809252.1 PQ877080.1 0.989 892 10 0 9 900 1 892 0.000E+00 1559 +PQ809252.1 PQ877082.1 0.991 799 7 0 102 900 1 799 0.000E+00 1402 +PQ809251.1 PQ809255.1 0.991 841 8 0 1 840 6 846 0.000E+00 1473 +PQ809251.1 PQ877080.1 0.989 838 9 0 4 840 1 838 0.000E+00 1459 +PQ809251.1 PQ877081.1 0.989 838 9 0 4 840 1 838 0.000E+00 1459 +PQ809251.1 PQ877082.1 0.993 745 5 0 96 840 1 745 0.000E+00 1314 +PQ809250.1 PQ809255.1 0.927 830 58 0 1 800 2 831 0.000E+00 1152 +PQ809250.1 PQ877080.1 0.924 823 60 0 8 800 1 823 0.000E+00 1126 +PQ809250.1 PQ877081.1 0.923 823 61 0 8 800 1 823 0.000E+00 1126 +PQ809250.1 PQ877082.1 0.919 730 57 0 101 800 1 730 1.985E-288 973 +PQ809249.1 PQ809255.1 0.989 959 11 0 1 959 2 960 0.000E+00 1675 +PQ809249.1 PQ877081.1 0.987 953 12 0 8 960 1 952 0.000E+00 1652 +PQ809249.1 PQ877080.1 0.985 936 14 0 8 943 1 935 0.000E+00 1612 +PQ809249.1 PQ877082.1 0.987 852 11 0 101 952 1 851 0.000E+00 1475 diff --git a/bio/mmseqs2/workflows/test/expected/taxonomy/a_lca.tsv b/bio/mmseqs2/workflows/test/expected/taxonomy/a_lca.tsv new file mode 100644 index 00000000000..55d5fb490af --- /dev/null +++ 
b/bio/mmseqs2/workflows/test/expected/taxonomy/a_lca.tsv @@ -0,0 +1,10 @@ +PV111755.1 0 no rank unclassified +PQ877082.1 0 no rank unclassified +PQ877081.1 0 no rank unclassified +PQ877080.1 0 no rank unclassified +PQ809255.1 0 no rank unclassified +PQ809253.1 0 no rank unclassified +PQ809252.1 0 no rank unclassified +PQ809251.1 0 no rank unclassified +PQ809250.1 0 no rank unclassified +PQ809249.1 0 no rank unclassified diff --git a/bio/mmseqs2/workflows/test/expected/taxonomy/a_report b/bio/mmseqs2/workflows/test/expected/taxonomy/a_report new file mode 100644 index 00000000000..5b006536c06 --- /dev/null +++ b/bio/mmseqs2/workflows/test/expected/taxonomy/a_report @@ -0,0 +1 @@ +100.0000 10 10 no rank 0 unclassified diff --git a/bio/mmseqs2/workflows/test/expected/taxonomy/a_tophit_aln b/bio/mmseqs2/workflows/test/expected/taxonomy/a_tophit_aln new file mode 100644 index 00000000000..e69de29bb2d diff --git a/bio/mmseqs2/workflows/test/expected/taxonomy/a_tophit_report b/bio/mmseqs2/workflows/test/expected/taxonomy/a_tophit_report new file mode 100644 index 00000000000..e69de29bb2d diff --git a/bio/mmseqs2/workflows/test/seqs/nucl.a.fas.gz b/bio/mmseqs2/workflows/test/seqs/nucl.a.fas.gz new file mode 100644 index 00000000000..2b4259bb51d Binary files /dev/null and b/bio/mmseqs2/workflows/test/seqs/nucl.a.fas.gz differ diff --git a/bio/mmseqs2/workflows/test/seqs/nucl.b.fas.gz b/bio/mmseqs2/workflows/test/seqs/nucl.b.fas.gz new file mode 100644 index 00000000000..ce1b0ace542 Binary files /dev/null and b/bio/mmseqs2/workflows/test/seqs/nucl.b.fas.gz differ diff --git a/bio/mmseqs2/workflows/wrapper.py b/bio/mmseqs2/workflows/wrapper.py new file mode 100644 index 00000000000..4d819ac07a9 --- /dev/null +++ b/bio/mmseqs2/workflows/wrapper.py @@ -0,0 +1,37 @@ +__author__ = "Filipe G. Vieira" +__copyright__ = "Copyright 2024, Filipe G. 
Vieira" +__license__ = "MIT" + +import os +import tempfile +from snakemake.shell import shell +from snakemake_wrapper_utils.snakemake import get_format + +extra = snakemake.params.get("extra", "") +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + + +target = snakemake.input.get("target", "") +if isinstance(target, list): + target = os.path.commonprefix(snakemake.input.target) + +# TODO: arbitrary output file names +out = snakemake.output +if isinstance(out, list): + out = os.path.commonprefix(out).rstrip("_") +else: + format_mode = get_format(out) + # (0) BLAST-TAB + # (1) SAM + # (2) BLAST-TAB + query/db length + # (3) HTML + # (4) BLAST-TAB + column headers + if format_mode == "sam": + extra += " --format-mode 1" + elif format_mode == "html": + extra += " --format-mode 3" + +with tempfile.TemporaryDirectory() as tmpdir: + shell( + "mmseqs {snakemake.params.module} {snakemake.input.query} {target} {out} {tmpdir} --threads {snakemake.threads} {extra} {log}" + ) diff --git a/test_wrappers.py b/test_wrappers.py index 6b4a32f16f7..6818db9b1e0 100644 --- a/test_wrappers.py +++ b/test_wrappers.py @@ -145,6 +145,65 @@ def _run(wrapper, cmd, check_log=None, compare_results_with_expected=None): return _run +def test_mmseqs2(run): + run( + "bio/mmseqs2/workflows", + [ + "snakemake", + "--cores", + "2", + "--use-conda", + "-F", + "out/search/a.tab", + "out/cluster/a_b_cluster.tsv", + "out/linclust/a_b_cluster.tsv", + "out/taxonomy/a_lca.tsv", + "out/rbh/a.tab", + ], + compare_results_with_expected={ + "out/search/a.tab": "expected/search/a.tab", + "out/cluster/a_b_cluster.tsv": "expected/cluster/a_b_cluster.tsv", + "out/cluster/a_b_rep_seq.fasta": "expected/cluster/a_b_rep_seq.fasta", + "out/cluster/a_b_all_seqs.fasta": "expected/cluster/a_b_all_seqs.fasta", + "out/linclust/a_b_rep_seq.fasta": "expected/linclust/a_b_rep_seq.fasta", + "out/linclust/a_b_all_seqs.fasta": "expected/linclust/a_b_all_seqs.fasta", + "out/linclust/a_b_cluster.tsv": 
"expected/linclust/a_b_cluster.tsv", + "out/taxonomy/a_tophit_report": "expected/taxonomy/a_tophit_report", + "out/taxonomy/a_tophit_aln": "expected/taxonomy/a_tophit_aln", + "out/taxonomy/a_report": "expected/taxonomy/a_report", + "out/taxonomy/a_lca.tsv": "expected/taxonomy/a_lca.tsv", + "out/rbh/a.tab": "expected/rbh/a.tab", + }, + ) + + run( + "bio/mmseqs2/db", + [ + "snakemake", + "--cores", + "2", + "--use-conda", + "-F", + "out/databases/a", + "out/createdb/a", + ], + compare_results_with_expected={ + "out/databases/a.dbtype": "expected/databases/a.dbtype", + "out/databases/a_h.dbtype": "expected/databases/a_h.dbtype", + "out/databases/a.source": "expected/databases/a.source", + "out/databases/a.version": "expected/databases/a.version", + "out/createdb/a": "expected/createdb/a", + "out/createdb/a.dbtype": "expected/createdb/a.dbtype", + "out/createdb/a_h": "expected/createdb/a_h", + "out/createdb/a_h.dbtype": "expected/createdb/a_h.dbtype", + "out/createdb/a_h.index": "expected/createdb/a_h.index", + "out/createdb/a.index": "expected/createdb/a.index", + "out/createdb/a.lookup": "expected/createdb/a.lookup", + "out/createdb/a.source": "expected/createdb/a.source", + }, + ) + + def test_aria2c(run): run( "utils/aria2c",