Merge pull request #8 from sanger-tol/outdir

Rework of the output directories
sanger-tol · Oct 10, 2023 · 04b7a5e · 04b7a5e
2 parents 63d5bdc + 27b5e8a
commit 04b7a5e
Show file tree

Hide file tree

Showing 10 changed files with 80 additions and 102 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,6 @@
 **sanger-tol/ensemblgenedownload** is a pipeline that downloads gene annotations from Ensembl into a Tree of Life directory structure.
 
 The pipeline takes a CSV file that contains assembly accession number, Ensembl species names (as they may differ from Tree of Life ones !), output directories, geneset methods and geneset versions.
-Assembly accession numbers are optional. If missing, the pipeline assumes it can be retrieved from files named `ACCESSION` in the standard location on disk.
 The pipeline downloads the Fasta files of the genes (cdna, cds, and protein sequences) as well as the GFF3 file.
 All files are compressed with `bgzip`, and indexed with `samtools faidx` or `tabix`.
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,5 +1,5 @@
-species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method,geneset_version
-25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
-25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
-25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
-darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
+outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
+Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
+Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
+Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
+Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -7,16 +7,11 @@
     "items": {
         "type": "object",
         "properties": {
-            "species_dir": {
+            "outdir": {
                 "type": "string",
                 "pattern": "^\\S+$",
                 "errorMessage": "Species directory must be provided and exist"
             },
-            "assembly_name": {
-                "type": "string",
-                "pattern": "^\\S+$",
-                "errorMessage": "Assembly name must be provided and cannot contain spaces"
-            },
             "assembly_accession": {
                 "type": "string",
                 "pattern": "^GCA_[0-9]{9}\\.[0-9]+$",
@@ -38,6 +33,6 @@
                 "errorMessage": "The version of the geneset must be in the form `YYYY_MM`."
             }
         },
-        "required": ["species_dir", "assembly_name", "ensembl_species_name", "annotation_method", "geneset_version"]
+        "required": ["outdir", "assembly_accession", "ensembl_species_name", "annotation_method", "geneset_version"]
     }
 }
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -28,8 +28,7 @@ class RowChecker:
 
     def __init__(
         self,
-        dir_col="species_dir",
-        name_col="assembly_name",
+        dir_col="outdir",
         accession_col="assembly_accession",
         ensembl_name_col="ensembl_species_name",
         method_col="annotation_method",
@@ -41,9 +40,7 @@ def __init__(
 
         Args:
             dir_col (str): The name of the column that contains the output directory
-                (default "species_dir").
-            name_col (str): The name of the column that contains the assembly name
-                (default "assembly_name").
+                (default "outdir").
             accession_col (str): The name of the column that contains the accession
                 number (default "assembly_accession").
             ensembl_name_col(str): The name of the column that contains the Ensembl species name
@@ -56,7 +53,6 @@ def __init__(
         """
         super().__init__(**kwargs)
         self._dir_col = dir_col
-        self._name_col = name_col
         self._accession_col = accession_col
         self._ensembl_name_col = ensembl_name_col
         self._method_col = method_col
@@ -76,12 +72,11 @@ def validate_and_transform(self, row):
 
         """
         self._validate_dir(row)
-        self._validate_name(row)
         self._validate_accession(row)
         self._validate_ensembl_name(row)
         self._validate_method(row)
         self._validate_geneset(row)
-        self._seen.add((row[self._name_col], row[self._method_col], row[self._geneset_col]))
+        self._seen.add((row[self._accession_col], row[self._method_col], row[self._geneset_col]))
         self.modified.append(row)
 
     def _validate_dir(self, row):
@@ -91,20 +86,11 @@ def _validate_dir(self, row):
 
     def _validate_accession(self, row):
         """Assert that the accession number exists and matches the expected nomenclature."""
-        if (
-            self._accession_col in row
-            and row[self._accession_col]
-            and not self._regex_accession.match(row[self._accession_col])
-        ):
+        if not row[self._accession_col]:
+            raise AssertionError("Assembly accession is required.")
+        if not self._regex_accession.match(row[self._accession_col]):
             raise AssertionError("Accession numbers must match %s." % self._regex_accession)
 
-    def _validate_name(self, row):
-        """Assert that the assembly name is non-empty and has no space."""
-        if not row[self._name_col]:
-            raise AssertionError("Assembly name is required.")
-        if " " in row[self._name_col]:
-            raise AssertionError("Accession names must not contain whitespace.")
-
     def _validate_ensembl_name(self, row):
         """Assert that the Ensembl name is non-empty and has no space."""
         if not row[self._ensembl_name_col]:
@@ -179,13 +165,13 @@ def check_samplesheet(file_in, file_out):
     Example:
         This function checks that the samplesheet follows the following structure::
 
-            species_dir,assembly_name,ensembl_species_name,annotation_method,geneset_version
-            25g/data/echinoderms/Asterias_rubens,eAstRub1.3,Asterias_rubens,ensembl,2020_11
+            outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
+            Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
 
     """
     required_columns = {
-        "species_dir",
-        "assembly_name",
+        "outdir",
+        "assembly_accession",
         "ensembl_species_name",
         "annotation_method",
         "geneset_version",

diff --git a/docs/output.md b/docs/output.md
@@ -4,7 +4,8 @@
 
 This document describes the output produced by the pipeline.
 
-The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished.
+The directories listed below will be created in a directory based on the `--outdir` command-line parameter (and the `outdir` column when using a samplesheet)
+after the pipeline has finished.
 All paths are relative to the top-level results directory.
 
 The directories comply with Tree of Life's canonical directory structure.
@@ -27,37 +28,37 @@ All BED files are indexed with tabix in both TBI and CSI modes, unless the seque
 Here are the files you can expect in the `gene/` sub-directory.
 
 ```text
-/lustre/scratch124/tol/projects/darwin/data/insects/Noctua_fimbriata/
-└── analysis
-    └── ilNocFimb1.1
-        └── gene
-            └── braker
-                ├── GCA_905163415.1.braker.2022_03.cdna.fa.gz
-                ├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.dict
-                ├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.fai
-                ├── GCA_905163415.1.braker.2022_03.cdna.fa.gz.gzi
-                ├── GCA_905163415.1.braker.2022_03.cdna.seq_length.tsv
-                ├── GCA_905163415.1.braker.2022_03.cds.fa.gz
-                ├── GCA_905163415.1.braker.2022_03.cds.fa.gz.dict
-                ├── GCA_905163415.1.braker.2022_03.cds.fa.gz.fai
-                ├── GCA_905163415.1.braker.2022_03.cds.fa.gz.gzi
-                ├── GCA_905163415.1.braker.2022_03.cds.seq_length.tsv
-                ├── GCA_905163415.1.braker.2022_03.gff3.gz
-                ├── GCA_905163415.1.braker.2022_03.gff3.gz.csi
-                ├── GCA_905163415.1.braker.2022_03.gff3.gz.gzi
-                ├── GCA_905163415.1.braker.2022_03.pep.fa.gz
-                ├── GCA_905163415.1.braker.2022_03.pep.fa.gz.dict
-                ├── GCA_905163415.1.braker.2022_03.pep.fa.gz.fai
-                ├── GCA_905163415.1.braker.2022_03.pep.fa.gz.gzi
-                └── GCA_905163415.1.braker.2022_03.pep.seq_length.tsv
+gene
+└── ensembl
+    └── 2022_02
+        ├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz
+        ├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.dict
+        ├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.fai
+        ├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.gzi
+        ├── GCA_907164925.1.ensembl.2022_02.cdna.fa.gz.sizes
+        ├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz
+        ├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.dict
+        ├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.fai
+        ├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.gzi
+        ├── GCA_907164925.1.ensembl.2022_02.cds.fa.gz.sizes
+        ├── GCA_907164925.1.ensembl.2022_02.gff3.gz
+        ├── GCA_907164925.1.ensembl.2022_02.gff3.gz.csi
+        ├── GCA_907164925.1.ensembl.2022_02.gff3.gz.gzi
+        ├── GCA_907164925.1.ensembl.2022_02.gff3.gz.tbi
+        ├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz
+        ├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.dict
+        ├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.fai
+        ├── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.gzi
+        └── GCA_907164925.1.ensembl.2022_02.pep.fa.gz.sizes
 ```
 
-The directory structure includes the assembly name, e.g. `fParRan2.2`, and all files are named after the assembly accession, e.g. `GCA_900634625.2`.
-The file name (and the directory name) includes the annotation method and date. Current methods include:
+All files are named after:
 
-- `ensembl` for Ensembl's own annotation pipeline
-- `braker` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535)
-- `refseq` for [RefSeq](https://academic.oup.com/nar/article/49/D1/D1020/6018440)
+- the assembly accession, e.g. `GCA_907164925.1`;
+- the annotation method, e.g. `ensembl`;
+- the annotation date, e.g. `2022_02`.
+
+This information is also included in the directory names, to allow multiple annotations to be loaded.
 
 The `.seq_length.tsv` files are tabular files analogous to the common `chrom.sizes`. They contain the sequence names and their lengths.
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -17,7 +17,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do
 - `--assembly_accession`: The accession number of the assembly,
 - `--annotation_method`: The annotation method of the geneset,
 - `--geneset_version`: The geneset version to download,
-- `--outdir`: Where to download the data.
+- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet).
 
 ```console
 nextflow run sanger-tol/ensemblgenedownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --geneset_version 2022_03 --outdir Noctua_fimbriata_braker_2022_03
@@ -37,28 +37,32 @@ Those parameters can be retrieved by browsing the [Ensembl Rapid Release](https:
 > [!WARNING]
 > Only the _Rapid Release_ site is currently supported, not the other Ensembl sites.
 
+Current annotation methods include:
+
+- `ensembl` for Ensembl's own annotation pipeline
+- `braker` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535)
+- `refseq` for [RefSeq](https://academic.oup.com/nar/article/49/D1/D1020/6018440)
+
 ## Bulk download
 
 The pipeline can download multiple genesets at once, by providing them in a `.csv` file through the `--input` parameter.
-It has to be a comma-separated file with five or six columns, and a header row as shown in the examples below.
+It has to be a comma-separated file with five columns, and a header row as shown in the examples below.
 
 ```console
-
-species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method,geneset_version
-25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
-25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
-25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
-darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
+outdir,assembly_accession,ensembl_species_name,annotation_method,geneset_version
+Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2020_11
+Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq,2022_03
+Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl,2022_02
+Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker,2022_03
 ```
 
-| Column                 | Description                                                                                                                                                |
-| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `species_dir`          | Output directory for this species (evaluated from `--outdir` if a relative path). Analysis results are deposited in `analysis/$assembly_name/`.            |
-| `assembly_name`        | Name of the assembly. Used to build the actual output directory.                                                                                           |
-| `assembly_accession`   | (Optional). Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. |
-| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's                                                                         |
-| `annotation_method`    | Name of the method of the geneset.                                                                                                                         |
-| `geneset_version`      | Version of the geneset, usually in the form `YYYY_MM`.                                                                                                     |
+| Column                 | Description                                                                                                                                                              |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `outdir`               | Output directory for this annotation (evaluated from `--outdir` if a relative path). Analysis results are in a sub-directory `gene/$annotation_method/$geneset_version`. |
+| `assembly_accession`   | Accession number of the assembly to download. Typically of the form `GCA_*.*`.                                                                                           |
+| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's.                                                                                      |
+| `annotation_method`    | Name of the method of the geneset.                                                                                                                                       |
+| `geneset_version`      | Version of the geneset, usually in the form `YYYY_MM`.                                                                                                                   |
 
 A samplesheet may contain:
 
@@ -68,9 +72,7 @@ A samplesheet may contain:
 - only one row per geneset
 
 All samplesheet columns correspond exactly to their corresponding command-line parameter,
-except `species_dir` which overrides or complements `--oudir`.
-`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines
-from Sanger Tree of Life.
+except `outdir` which, if a relative path, is interpreted under `--outdir`.
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -104,8 +106,8 @@ nextflow run sanger-tol/ensemblgenedownload -profile docker -params-file params.
 with `params.yaml` containing:
 
 ```yaml
-ensembl_species_name: "Noctua_fimbriata"
 assembly_accession: "GCA_905163415.1"
+ensembl_species_name: "Noctua_fimbriata"
 annotation_method: "braker"
 geneset_version: "2022_03"
 outdir: "./results/"

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -50,7 +50,7 @@
                     "pattern": "^\\S+\\.csv$",
                     "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the genesets to download. Used for bulk download of many genesets.",
-                    "help_text": "The file has to be a comma-separated file with six columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, `annotation_method`, and `geneset_version`. An additional `assembly_accession` column can be provided too.",
+                    "help_text": "The file has to be a comma-separated file with five columns, and a header row. The column names must be `outdir`, `assembly_accession`, `ensembl_species_name`, `annotation_method`, and `geneset_version`.",
                     "fa_icon": "fas fa-file-csv"
                 },
                 "ftp_root": {

diff --git a/subworkflows/local/download.nf b/subworkflows/local/download.nf
@@ -8,7 +8,7 @@ include { ENSEMBL_GENESET_DOWNLOAD      } from '../../modules/local/ensembl_gene
 workflow DOWNLOAD {
 
     take:
-    annotation_params         // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method, geneset_version)
+    annotation_params         // tuple(outdir, assembly_accession, ensembl_species_name, annotation_method, geneset_version)
 
 
     main:
@@ -17,9 +17,9 @@ workflow DOWNLOAD {
     ENSEMBL_GENESET_DOWNLOAD (
         annotation_params.map {
 
-            species_dir,
-            ensembl_species_name,
+            outdir,
             assembly_accession,
+            ensembl_species_name,
             annotation_method,
             geneset_version
 
@@ -29,7 +29,7 @@ workflow DOWNLOAD {
                     assembly_accession: assembly_accession,
                     geneset_version: geneset_version,
                     method: annotation_method,
-                    outdir: species_dir,
+                    outdir: outdir,
                 ],
 
                 // e.g. https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/geneset/2021_12/Agriopis_aurantiaria-GCA_914767915.1-2021_12-cdna.fa.gz