Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions bio/mmseqs2/db/environment.linux-64.pin.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
# created-by: conda 25.3.0
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2025.1.31-hbcca054_0.conda#19f3a56f68d2fd06c516076bff482c52
https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h767d61c_2.conda#06d02030237f4d5b3d9a7e7d348fe3c6
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h767d61c_2.conda#ef504d1acbd74b7cc6849ef8af47dd03
https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.4-hb9d3cd8_0.conda#e2775acf57efd5af15b8e3d1d74d72d3
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_2.conda#a2222a6ada71fb478682efe483ce0f92
https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.23.1-h5888daf_0.conda#a09ce5decdef385bcce78c32809fa794
https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087
https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.6.4-hb9d3cd8_0.conda#42d5b6a0f30d3c10cd88cb8584fda1cb
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-14.2.0-h8f9b012_2.conda#a78c856b6dc6bf4ea8daeb9beaaa3fb0
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7
https://conda.anaconda.org/conda-forge/linux-64/openssl-3.4.1-h7b32b05_0.conda#41adf927e746dc75ecf0ef841c454e48
https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553
https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.23.1-h8e693c7_0.conda#988f4937281a66ca19d1adb3b5e3f859
https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.49.1-hee588c1_2.conda#962d6ac93c30b1dfc54c9cccafd1003e
https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hf672d98_0.conda#be2de152d8073ef1c01b7728475f2fe7
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-14.2.0-h4852527_2.conda#c75da67f045c2627f59e6fcb5f4e3a9b
https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.7-h0d44e9d_0.conda#3ac6daa5c1210293a6deaec0c345b230
https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446
https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8
https://conda.anaconda.org/conda-forge/linux-64/aria2-1.37.0-hbc8128a_2.conda#03b8874fa70df577f3eee53085d025cf
https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1
https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81
https://conda.anaconda.org/conda-forge/linux-64/gawk-5.3.1-hcd3d067_0.conda#91d4414ab699180b2b0b10b8112c5a2f
https://conda.anaconda.org/bioconda/linux-64/mmseqs2-17.b804f-hd6d6fdc_1.tar.bz2#561fb589d37cff61ec6b887fc2976498
6 changes: 6 additions & 0 deletions bio/mmseqs2/db/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- mmseqs2 =17.b804f
13 changes: 13 additions & 0 deletions bio/mmseqs2/db/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: MMseqs2 db
url: https://github.com/soedinglab/mmseqs2
description: |
ultra fast and sensitive sequence search and clustering suite
authors:
- Filipe G. Vieira
input:
- input FASTA file (not required by modules that take no input file, e.g. `databases`)
output:
- output: DB files
params:
- module: MMseqs2 module to run, optionally followed by its positional arguments (e.g. `createdb` or `databases SILVA`)
- extra: additional program arguments
49 changes: 49 additions & 0 deletions bio/mmseqs2/db/test/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Test rule exercising the wrapper with the `databases` module (SILVA).
# No input files are declared: everything the wrapper needs comes from params.
rule mmseqs2_databases:
    output:
        # multiext() appends each suffix to the prefix "out/databases/{sample}";
        # the empty-string suffix yields the bare prefix path itself.
        # The wrapper reduces this list back to the common prefix before
        # handing it to mmseqs.
        db=multiext(
            "out/databases/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            ".version",
            "_h",
            "_h.dbtype",
            "_h.index",
            "_mapping",
            "_taxonomy",
        ),
    log:
        "logs/databases/{sample}.log",
    params:
        # Module name followed by its positional argument; the whole string is
        # inserted verbatim after `mmseqs` on the command line.
        module="databases SILVA",
        # Additional command-line options appended to the mmseqs call.
        extra="-v 3",
    threads: 1
    wrapper:
        "master/bio/mmseqs2/db"


# Test rule exercising the wrapper with the `createdb` module on a FASTA file.
rule mmseqs2_createdb:
    input:
        fas="seqs/{sample}.fasta",
    output:
        # multiext() appends each suffix to the prefix "out/createdb/{sample}";
        # the empty-string suffix yields the bare prefix path itself.
        # The wrapper reduces this list back to the common prefix before
        # handing it to mmseqs.
        db=multiext(
            "out/createdb/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            "_h",
            "_h.dbtype",
            "_h.index",
        ),
    log:
        "logs/createdb/{sample}.log",
    params:
        # Module name; inserted verbatim after `mmseqs` on the command line.
        # The wrapper passes no temp folder for this module.
        module="createdb",
        # Additional command-line options appended to the mmseqs call.
        extra="-v 3",
    threads: 1
    wrapper:
        "master/bio/mmseqs2/db"
Binary file added bio/mmseqs2/db/test/expected/createdb/a
Binary file not shown.
Binary file added bio/mmseqs2/db/test/expected/createdb/a.dbtype
Binary file not shown.
4 changes: 4 additions & 0 deletions bio/mmseqs2/db/test/expected/createdb/a.index
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 0 9
1 9 9
2 18 8
3 26 8
4 changes: 4 additions & 0 deletions bio/mmseqs2/db/test/expected/createdb/a.lookup
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 1 0
1 2 0
2 1 0
3 3 0
1 change: 1 addition & 0 deletions bio/mmseqs2/db/test/expected/createdb/a.source
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0 a.fasta
Binary file added bio/mmseqs2/db/test/expected/createdb/a_h
Binary file not shown.
Binary file added bio/mmseqs2/db/test/expected/createdb/a_h.dbtype
Binary file not shown.
4 changes: 4 additions & 0 deletions bio/mmseqs2/db/test/expected/createdb/a_h.index
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 0 3
1 3 3
2 6 3
3 9 3
Binary file added bio/mmseqs2/db/test/expected/databases/a.dbtype
Binary file not shown.
1 change: 1 addition & 0 deletions bio/mmseqs2/db/test/expected/databases/a.source
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0 silva.fasta.gz
164 changes: 164 additions & 0 deletions bio/mmseqs2/db/test/expected/databases/a.version
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
README for SILVA 138 export files

RAST FILES:
===========

Specific export files for the MG-RAST server (Argonne National Lab)

TAXONOMY FILES:
===============

tax_slv_[ls]su_VERSION.txt
-------------------------
These files contain taxonomic rank designations for all taxonomic paths
used in the SILVA taxonomies. Additionally, a unique numeric identifier is
assigned to each taxon (path). These identifiers will be mostly stable in
upcoming SILVA releases.

IDs used in the SSU and LSU files do not match.

Field description:
path:
The full taxonomic path including the name of the group itself.
Segments are separated with ";"
taxid:
numerical identifier
rank:
The rank designation.
remark:
Can be empty ('') or a or w.
a: Marks taxa of environmental origin. That is, taxa containing no
sequence coming from a cultivated organism.
w: Marks taxa scheduled for revision in the next release.
release:
The SILVA release version

tax_slv_[ls]su_VERSION.diff
---------------------------
Difference between the current version of the SILVA taxonomy and the previous
release.

Field description:
status:
the status of the taxono (+ added, - removed)
taxid:
numerical identifier
path:
the full path of the added/removed taxon

taxmap_TAXNAME_[ls]su_VERSION.txt
----------------------------
mapping of each entry in the SILVA database to a taxonomic path. Different
rRNA regions of the same INSDC entry (genome) may be assigned to multiple
paths (contaminations or micro diversity among the rRNA sequences).

The taxmap_embl* files contain the taxonomic path assigned by the original
submitter of the sequence. The last column in this file contains the numerical
ID of the NCBI taxonomy project assigned to this entry. This ID is extracted
from the source feature of an EMBL entry during the import of the sequence.

Field description:
pacc:
INSDC primary accession
start
start position of the rRNA region within the INSDC sequence entry
stop:
stop position of the rRNA region within the INSDC sequence entry
path:
taxonomic path assigned to the region
name
the organism name assigned to the sequence
taxid:
optional field containing the numerical ID of the taxonomic path


tax_TAXNAME_[ls]su_VERSION.{map,tre}
----------------------------
SILVA taxonomy in the Newick tree format and the corespoding numerical id to
taxonomic path mapping file (MEGAN compatible).

tax_TAXNAME_[ls]su_VERSION.acc_taxid
----------------------------
Mapping of 'SILVA' sequence IDs (<INSDC primary accession>.<start>.<stop>)
used in FASTA files to the numeric SILVA taxid (MEGAN compatible).




SEQUENCE FILES:
===============

*_tax_silva.fasta.gz
-----------------
Multi FASTA files of the SSU/LSU databases including the SILVA taxonomy for
Bacteria, Archaea and Eukaryotes in the header.

REMARK: The sequences in the files are NOT truncated to the effective LSU or
SSU genes. They contain the full entries as they have been deposited in the
public repositories (ENA/GenBank/DDBJ).

Fasta header:
>accession_number.start_position.stop_position taxonomic path organism name

*_tax_silva_full_align_trunc.fasta.gz
-----------------------
Multi FASTA files of the SSU/LSU databases including the SILVA taxonomy for
Bacteria, Archaea and Eukaryotes in the header (including the FULL alignment).

REMARK: Sequences in these files haven been truncated. This means that all
nucleotides that have not been aligned were removed from the sequence.

*_tax_silva_trunc.fasta.gz
-----------------------
Multi FASTA files of the SSU/LSU database including the SILVA taxonomy for
Bacteria, Archaea and Eukaryotes in the header.

REMARK: Sequences in these files haven been truncated. This means that all
nucleotides that have not been aligned were removed from the sequence.



CUSTOMISED FILES:
=================

*.acs
-----
Lists with all accession numbers in LSUParc and SSUParc

*.clstr
-------
Mapping of 'ref' sequences to 'nr' sequences. The file uses the CD-Hit file
format.

*quality*.csv
-------------
complete quality values for all SILVA Parc sequences Datasets.
Header: Primary Accession,Start,Stop,Region Length,Annotation Source,Sequence Quality,
% Ambiguities,% Homopolymers,% Vector Contamination,Alignment Quality, Base Pair Score,
# Aligned Bases,Pintail Quality



Directory 'User'
User specific exports done on request

Abbreviations:

LSU: Large subunit (23S/28S ribosomal RNAs)
SSU: Small subunit (16S/18S ribosomal RNAs)




Questions: [email protected]

December 2019

If you are using SILVA please cite:
Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J,
Glockner FO (2013) The SILVA ribosomal RNA gene database project: improved
data processing and web-based tools. Nucleic Acids Research 41:D590-D596

Yilmaz P, Parfrey LW, Yarza P, Gerken J, Pruesse E, Quast C, Schweer T,
Peplies J, Ludwig W, Glockner FO (2014) The SILVA and "All-species Living
Tree Project (LTP)" taxonomic frameworks. Nucleic Acid Res. 42:D643-D648
Binary file added bio/mmseqs2/db/test/expected/databases/a_h.dbtype
Binary file not shown.
8 changes: 8 additions & 0 deletions bio/mmseqs2/db/test/seqs/a.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>1
ACGGCAT
>2
ATGGCAT
>1
CGGCAT
>3
ATGGCA
31 changes: 31 additions & 0 deletions bio/mmseqs2/db/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2024, Filipe G. Vieira"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell

# Snakemake wrapper body: builds and runs a single `mmseqs <module> ...` call.
# `snakemake` is the object injected by Snakemake when the wrapper is executed.
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# `params.module` may carry positional arguments after the module name
# (e.g. "databases SILVA"), so module-specific decisions must be keyed on the
# first word only. Comparing the full string would never match "databases"
# in that case and `--threads` would silently be dropped.
module = snakemake.params.module
module_name = next(iter(module.split()), "")

# The declared files are collapsed to their common path prefix, which is what
# gets passed to mmseqs. With no declared input this is the empty string.
input = snakemake.input
if isinstance(input, list):
    input = os.path.commonprefix(input)

# TODO: arbitrary output file names
out = snakemake.output
if isinstance(out, list):
    # A shared prefix ending in "_" (e.g. "db_" from "db_h" and "db_mapping")
    # is trimmed back to the DB name itself ("db").
    out = os.path.commonprefix(out).rstrip("_")


with tempfile.TemporaryDirectory() as tmpdir:
    # Modules with threads
    if module_name in ["databases"]:
        extra = f"--threads {snakemake.threads} {extra}"
    # Modules with no temp folder
    if module_name in ["createdb"]:
        tmpdir = ""

    # shell() interpolates the names in braces from this scope, so the local
    # variable names above must stay in sync with the command template.
    shell("mmseqs {snakemake.params.module} {input} {out} {tmpdir} {extra} {log}")
43 changes: 43 additions & 0 deletions bio/mmseqs2/workflows/environment.linux-64.pin.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
# created-by: conda 25.7.0
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda#74784ee3d225fc3dca89edb635b4e5cc
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda#0be7c6e070c19105f966d3758448d018
https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_5.conda#dcd5ff1940cd38f6df777cac86819d60
https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda#94305520c52a4aa3f6c2b1ff6008d9f8
https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_5.conda#264fbfba7fb20acf3b29cde153e345ce
https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda#51a19bba1b8ebfb60df25cde030b7ebc
https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be
https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda#4211416ecba1866fab0c6470986c22d6
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_5.conda#069afdf8ea72504e48d23ae1171d951c
https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda#915f5995e94f60e9a4826e0b0920ee88
https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc
https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_5.conda#4e02a49aaa9d5190cb630fa43528fbe6
https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.2-he9a06e4_0.conda#80c07c68d2f6870250959dcc95b209d1
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7
https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.3-h26f9b46_1.conda#4fc6c4c88da64c0219c0c6c0408cedd4
https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.25.1-h3f43e3d_1.conda#3b0d184bc9404516d418d4509e418bdc
https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.25.1-h3f43e3d_1.conda#2f4de899028319b27eb7a4023be5dfd2
https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda#0b367fad34931cb79e0d6b7e5c06bb1c
https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_5.conda#8bba50c7f4679f08c861b597ad2bda6b
https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h2cb61b6_1.conda#42a8e4b54e322b4cd1dbfb30a8a7ce9e
https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446
https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8
https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8
https://conda.anaconda.org/conda-forge/linux-64/aria2-1.37.0-hbc8128a_2.conda#03b8874fa70df577f3eee53085d025cf
https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1
https://conda.anaconda.org/conda-forge/linux-64/python-3.13.7-h2b335a9_100_cp313.conda#724dcf9960e933838247971da07fe5cf
https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81
https://conda.anaconda.org/conda-forge/noarch/pip-25.2-pyh145f28c_0.conda#e7ab34d5a93e0819b62563c78635d937
https://conda.anaconda.org/bioconda/noarch/snakemake-wrapper-utils-0.8.0-pyhdfd78af_0.conda#1650e521333852f45468d97b1b2fdcce
https://conda.anaconda.org/conda-forge/linux-64/gawk-5.3.1-hcd3d067_0.conda#91d4414ab699180b2b0b10b8112c5a2f
https://conda.anaconda.org/bioconda/linux-64/mmseqs2-17.b804f-hd6d6fdc_1.tar.bz2#561fb589d37cff61ec6b887fc2976498
7 changes: 7 additions & 0 deletions bio/mmseqs2/workflows/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- mmseqs2 =17.b804f
- snakemake-wrapper-utils =0.8.0
14 changes: 14 additions & 0 deletions bio/mmseqs2/workflows/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: MMseqs2 workflows
url: https://github.com/soedinglab/mmseqs2
description: |
ultra fast and sensitive sequence search and clustering suite
authors:
- Filipe G. Vieira
input:
- query: input query FASTA file(s)
- target: input target FASTA file(s) or MMseqs2 DB
output:
- output: FAS, cluster or DB file(s)
params:
- module: workflow to use
- extra: additional program arguments
Loading
Loading