Skip to content

Commit

Permalink
Merge pull request #8 from sanger-tol/outdir
Browse files Browse the repository at this point in the history
Rework of the output directories
  • Loading branch information
muffato authored Oct 10, 2023
2 parents 63d5bdc + 27b5e8a commit 04b7a5e
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 102 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
**sanger-tol/ensemblgenedownload** is a pipeline that downloads gene annotations from Ensembl into a Tree of Life directory structure.

The pipeline takes a CSV file that contains assembly accession number, Ensembl species names (as they may differ from Tree of Life ones !), output directories, geneset methods and geneset versions.
Assembly accession numbers are optional. If missing, the pipeline assumes it can be retrieved from files named `ACCESSION` in the standard location on disk.
The pipeline downloads the Fasta files of the genes (cdna, cds, and protein sequences) as well as the GFF3 file.
All files are compressed with `bgzip`, and indexed with `samtools faidx` or `tabix`.

Expand Down
10 changes: 5 additions & 5 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method,geneset_version
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
9 changes: 2 additions & 7 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,11 @@
"items": {
"type": "object",
"properties": {
"species_dir": {
"outdir": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Species directory must be provided and exist"
},
"assembly_name": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Assembly name must be provided and cannot contain spaces"
},
"assembly_accession": {
"type": "string",
"pattern": "^GCA_[0-9]{9}\\.[0-9]+$",
Expand All @@ -38,6 +33,6 @@
"errorMessage": "The version of the geneset must be in the form `YYYY_MM`."
}
},
"required": ["species_dir", "assembly_name", "ensembl_species_name", "annotation_method", "geneset_version"]
"required": ["outdir", "assembly_accession", "ensembl_species_name", "annotation_method", "geneset_version"]
}
}
34 changes: 10 additions & 24 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ class RowChecker:

def __init__(
self,
dir_col="species_dir",
name_col="assembly_name",
dir_col="outdir",
accession_col="assembly_accession",
ensembl_name_col="ensembl_species_name",
method_col="annotation_method",
Expand All @@ -41,9 +40,7 @@ def __init__(
Args:
dir_col (str): The name of the column that contains the species directory
(default "species_dir").
name_col (str): The name of the column that contains the assembly name
(default "assembly_name").
(default "outdir").
accession_col (str): The name of the column that contains the accession
number (default "assembly_accession").
ensembl_name_col(str): The name of the column that contains the Ensembl species name
Expand All @@ -56,7 +53,6 @@ def __init__(
"""
super().__init__(**kwargs)
self._dir_col = dir_col
self._name_col = name_col
self._accession_col = accession_col
self._ensembl_name_col = ensembl_name_col
self._method_col = method_col
Expand All @@ -76,12 +72,11 @@ def validate_and_transform(self, row):
"""
self._validate_dir(row)
self._validate_name(row)
self._validate_accession(row)
self._validate_ensembl_name(row)
self._validate_method(row)
self._validate_geneset(row)
self._seen.add((row[self._name_col], row[self._method_col], row[self._geneset_col]))
self._seen.add((row[self._accession_col], row[self._method_col], row[self._geneset_col]))
self.modified.append(row)

def _validate_dir(self, row):
Expand All @@ -91,20 +86,11 @@ def _validate_dir(self, row):

def _validate_accession(self, row):
"""Assert that the accession number exists and matches the expected nomenclature."""
if (
self._accession_col in row
and row[self._accession_col]
and not self._regex_accession.match(row[self._accession_col])
):
if not row[self._accession_col]:
raise AssertionError("Assembly accession is required.")
if not self._regex_accession.match(row[self._accession_col]):
raise AssertionError("Accession numbers must match %s." % self._regex_accession)

def _validate_name(self, row):
"""Assert that the assembly name is non-empty and has no space."""
if not row[self._name_col]:
raise AssertionError("Assembly name is required.")
if " " in row[self._name_col]:
raise AssertionError("Accession names must not contain whitespace.")

def _validate_ensembl_name(self, row):
"""Assert that the Ensembl name is non-empty and has no space."""
if not row[self._ensembl_name_col]:
Expand Down Expand Up @@ -179,13 +165,13 @@ def check_samplesheet(file_in, file_out):
Example:
This function checks that the samplesheet follows the following structure::
species_dir,assembly_name,ensembl_species_name,annotation_method,geneset_version
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,Asterias_rubens,ensembl,2020_11
outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
"""
required_columns = {
"species_dir",
"assembly_name",
"outdir",
"assembly_accession",
"ensembl_species_name",
"annotation_method",
"geneset_version",
Expand Down
59 changes: 30 additions & 29 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

This document describes the output produced by the pipeline.

The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished.
The directories listed below will be created in a directory based on the `--outdir` command-line parameter and the `outdir` column of the samplesheet,
after the pipeline has finished.
All paths are relative to the top-level results directory.

The directories comply with Tree of Life's canonical directory structure.
Expand All @@ -27,37 +28,37 @@ All BED files are indexed with tabix in both TBI and CSI modes, unless the seque
Here are the files you can expect in the `gene/` sub-directory.

```text
/lustre/scratch124/tol/projects/darwin/data/insects/Noctua_fimbriata/
└── analysis
└── ilNocFimb1.1
└── gene
└── braker
├── GCA_905163415.1.braker.2022_03.cdna.fa.gz
├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.dict
├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.fai
├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.gzi
├── GCA_905163415.1.braker.2022_03.cdna.seq_length.tsv
├── GCA_905163415.1.braker.2022_03.cds.fa.gz
├── GCA_905163415.1.braker.2022_03.cds.fa.gz.dict
├── GCA_905163415.1.braker.2022_03.cds.fa.gz.fai
├── GCA_905163415.1.braker.2022_03.cds.fa.gz.gzi
├── GCA_905163415.1.braker.2022_03.cds.seq_length.tsv
├── GCA_905163415.1.braker.2022_03.gff3.gz
├── GCA_905163415.1.braker.2022_03.gff3.gz.csi
├── GCA_905163415.1.braker.2022_03.gff3.gz.gzi
├── GCA_905163415.1.braker.2022_03.pep.fa.gz
├── GCA_905163415.1.braker.2022_03.pep.fa.gz.dict
├── GCA_905163415.1.braker.2022_03.pep.fa.gz.fai
├── GCA_905163415.1.braker.2022_03.pep.fa.gz.gzi
└── GCA_905163415.1.braker.2022_03.pep.seq_length.tsv
gene
└── ensembl
└── 2022_02
├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz
├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.dict
├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.fai
├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.gzi
├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.sizes
├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz
├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.dict
├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.fai
├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.gzi
├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.sizes
├── GCA_907164925.1.ensembl.2022_02.gff3.gz
├── GCA_907164925.1.ensembl.2022_02.gff3.gz.csi
├── GCA_907164925.1.ensembl.2022_02.gff3.gz.gzi
├── GCA_907164925.1.ensembl.2022_02.gff3.gz.tbi
├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz
├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.dict
├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.fai
├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.gzi
└── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.sizes
```

The directory structure includes the assembly name, e.g. `fParRan2.2`, and all files are named after the assembly accession, e.g. `GCA_900634625.2`.
The file name (and the directory name) includes the annotation method and date. Current methods include:
All files are named after:

- `ensembl` for Ensembl's own annotation pipeline
- `braker` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535)
- `refseq` for [RefSeq](https://academic.oup.com/nar/article/49/D1/D1020/6018440)
- the assembly accession, e.g. `GCA_907164925.1`;
- the annotation method, e.g. `ensembl`;
- the annotation date, e.g. `2022_02`.

This information is also in the directory names to allow multiple annotations to be loaded.

The `.seq_length.tsv` files are tabular files analogous to the common `chrom.sizes`. They contain the sequence names and their lengths.

Expand Down
42 changes: 22 additions & 20 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do
- `--assembly_accession`: The accession number of the assembly,
- `--annotation_method`: The annotation method of the geneset,
- `--geneset_version`: The geneset version to download,
- `--outdir`: Where to download the data.
- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet).

```console
nextflow run sanger-tol/ensemblgenedownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --geneset_version 2022_03 --outdir Noctua_fimbriata_braker_2022_03
Expand All @@ -37,28 +37,32 @@ Those parameters can be retrieved by browsing the [Ensembl Rapid Release](https:
> [!WARNING]
> Only the _Rapid Release_ site is currently supported, not the other Ensembl sites.
Current annotation methods include:

- `ensembl` for Ensembl's own annotation pipeline
- `braker` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535)
- `refseq` for [RefSeq](https://academic.oup.com/nar/article/49/D1/D1020/6018440)

## Bulk download

The pipeline can download multiple genesets at once, by providing them in a `.csv` file through the `--input` parameter.
It has to be a comma-separated file with five or six columns, and a header row as shown in the examples below.
It has to be a comma-separated file with five columns, and a header row as shown in the examples below.

```console

species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method,geneset_version
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
```

| Column | Description |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `species_dir` | Output directory for this species (evaluated from `--outdir` if a relative path). Analysis results are deposited in `analysis/$assembly_name/`. |
| `assembly_name` | Name of the assembly. Used to build the actual output directory. |
| `assembly_accession` | (Optional). Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. |
| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's |
| `annotation_method` | Name of the method of the geneset. |
| `geneset_version` | Version of the geneset, usually in the form `YYYY_MM`. |
| Column | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `outdir` | Output directory for this annotation (evaluated from `--outdir` if a relative path). Analysis results are in a sub-directory `gene/$annotation_method/$geneset_version`. |
| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. |
| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's. |
| `annotation_method` | Name of the method of the geneset. |
| `geneset_version` | Version of the geneset, usually in the form `YYYY_MM`. |

A samplesheet may contain:

Expand All @@ -68,9 +72,7 @@ A samplesheet may contain:
- only one row per geneset

All samplesheet columns correspond exactly to their corresponding command-line parameter,
except `species_dir` which overrides or complements `--oudir`.
`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines
from Sanger Tree of Life.
except `outdir` which, if a relative path, is interpreted under `--outdir`.

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand Down Expand Up @@ -104,8 +106,8 @@ nextflow run sanger-tol/ensemblgenedownload -profile docker -params-file params.
with `params.yaml` containing:

```yaml
ensembl_species_name: "Noctua_fimbriata"
assembly_accession: "GCA_905163415.1"
ensembl_species_name: "Noctua_fimbriata"
annotation_method: "braker"
geneset_version: "2022_03"
outdir: "./results/"
Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"pattern": "^\\S+\\.csv$",
"schema": "assets/schema_input.json",
"description": "Path to comma-separated file containing information about the genesets to download. Used for bulk download of many genesets.",
"help_text": "The file has to be a comma-separated file with six columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, `annotation_method`, and `geneset_version`. An additional `assembly_accession` column can be provided too.",
"help_text": "The file has to be a comma-separated file with five columns, and a header row. The columns names must be `outdir`, `assembly_accession`, `ensembl_species_name`, `annotation_method`, and `geneset_version`.",
"fa_icon": "fas fa-file-csv"
},
"ftp_root": {
Expand Down
8 changes: 4 additions & 4 deletions subworkflows/local/download.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include { ENSEMBL_GENESET_DOWNLOAD } from '../../modules/local/ensembl_gene
workflow DOWNLOAD {

take:
annotation_params // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method, geneset_version)
annotation_params // tuple(outdir, assembly_accession, ensembl_species_name, annotation_method, geneset_version)


main:
Expand All @@ -17,9 +17,9 @@ workflow DOWNLOAD {
ENSEMBL_GENESET_DOWNLOAD (
annotation_params.map {

species_dir,
ensembl_species_name,
outdir,
assembly_accession,
ensembl_species_name,
annotation_method,
geneset_version

Expand All @@ -29,7 +29,7 @@ workflow DOWNLOAD {
assembly_accession: assembly_accession,
geneset_version: geneset_version,
method: annotation_method,
outdir: species_dir,
outdir: outdir,
],

// e.g. https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/geneset/2021_12/Agriopis_aurantiaria-GCA_914767915.1-2021_12-cdna.fa.gz
Expand Down
Loading

0 comments on commit 04b7a5e

Please sign in to comment.