add automatic fixes provided by ruff
JAlvarezJarreta committed Nov 6, 2024
1 parent 902b1c7 commit 16da59b
Showing 68 changed files with 1,016 additions and 635 deletions.
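
Nearly all of the edits below are mechanical fixes applied by ruff (presumably via its auto-fix mode, e.g. `ruff check --fix`, plus formatting): deprecated `typing` aliases are rewritten to the builtin generics of PEP 585 and the `X | None` union syntax of PEP 604, over-long calls and signatures are exploded to one argument per line with a trailing comma, and docstrings gain a blank line before their closing quotes. As an illustrative sketch of the typing rewrite (not code from this commit):

# Before: aliases from typing, deprecated since Python 3.9 (PEP 585)
from typing import Dict, List, Optional, Tuple

def lookup(ids: List[str]) -> Optional[Dict[str, Tuple[str, str]]]:
    ...

# After: builtin generics (PEP 585) and union syntax (PEP 604)
def lookup(ids: list[str]) -> dict[str, tuple[str, str]] | None:
    ...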
36 changes: 24 additions & 12 deletions src/python/ensembl/io/genomio/annotation/update_description.py
@@ -21,7 +21,7 @@

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any

from sqlalchemy.orm import Session
from sqlalchemy import and_, select
@@ -39,18 +39,18 @@
"transcript": "transcript",
}

FeatStruct = Tuple[str, str, str]
FeatStruct = tuple[str, str, str]


def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> dict[str, FeatStruct]:
"""Returns the table descriptions from a core database.
Args:
session: Session open on a core database.
table: "gene" or "transcript" table from the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.
"""
"""
if table == "gene":
stmt = (
select(Gene.gene_id, Gene.stable_id, Gene.description, Xref.dbprimary_acc)
@@ -103,6 +103,7 @@ def load_descriptions(
report: Print the mapping of changes to perform in the standard output.
do_update: Actually update the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.
"""
func = get_json(func_file)
logging.info(f"{len(func)} annotations from {func_file}")
@@ -125,7 +126,13 @@
}
# Compare, only keep the descriptions that have changed
features_to_update = _get_features_to_update(
table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs
table,
feat_func,
feat_data,
stats,
report=report,
do_update=do_update,
match_xrefs=match_xrefs,
)

# Show stats for this feature type
@@ -141,8 +148,10 @@


def _get_cur_feat(
feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False
) -> Optional[FeatStruct]:
feat_data: dict[str, FeatStruct],
new_feat: dict[str, Any],
match_xrefs: bool = False,
) -> FeatStruct | None:
"""Match a feature ID, synonyms or xrefs to a core stable ID and return the matching core feature.
Returns None if no match.
@@ -169,14 +178,14 @@

def _get_features_to_update(
table: str,
feat_func: List[Dict[str, Any]],
feat_data: Dict[str, FeatStruct],
stats: Dict[str, int],
feat_func: list[dict[str, Any]],
feat_data: dict[str, FeatStruct],
stats: dict[str, int],
*,
report: bool = False,
do_update: bool = False,
match_xrefs: bool = True,
) -> List[Dict[str, Any]]:
) -> list[dict[str, Any]]:
"""Checks a list of features and returns those whose description we want to update.
Args:
@@ -190,6 +199,7 @@
Returns:
The list of features with their operation changed to update or insert.
"""
to_update = []
for new_feat in feat_func:
@@ -247,7 +257,9 @@ def main() -> None:
parser.add_argument("--report", action="store_true", help="Show what change would be made")
parser.add_argument("--update", action="store_true", help="Make the changes to the database")
parser.add_argument(
"--match_xrefs", action="store_true", help="Use xref IDs to match features if IDs do not work"
"--match_xrefs",
action="store_true",
help="Use xref IDs to match features if IDs do not work",
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()
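Two other auto-fix patterns recur in this file and throughout the commit: a blank line is inserted between a docstring's last section and its closing quotes (a pydocstyle-family rule), and calls carrying a trailing comma are kept exploded to one argument per line by the formatter. A hypothetical sketch of both patterns (the names are illustrative, not from this repository):

def run_query(record_id: str, description: str | None, *, dry_run: bool) -> None:
    """Stand-in for a database call (hypothetical)."""
    print(record_id, description, dry_run)

def update_record(record_id: str, description: str | None = None) -> None:
    """Update the description of one record.

    Args:
        record_id: Stable ID of the record to update.
        description: New description, or None to leave it unchanged.

    """
    # The blank line before the closing quotes above is the docstring fix;
    # the trailing comma below pins the call to one argument per line.
    run_query(
        record_id,
        description,
        dry_run=False,
    )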
42 changes: 25 additions & 17 deletions src/python/ensembl/io/genomio/assembly/download.py
@@ -34,7 +34,6 @@
from pathlib import Path
import re
import time
from typing import Dict, Optional

from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging_with_args
@@ -70,8 +69,8 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
Raises:
UnsupportedFormatError: If `accession` does not follow INSDC's accession format.
"""
"""
match = re.match(r"^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})(\.[0-9]+)?$", accession)
if not match:
raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}")
@@ -89,15 +88,15 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
return ftp_conn


def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str = "md5checksums.txt") -> bool:
"""
Check all files checksums with the sums listed in a checksum file, if available.
def md5_files(dl_dir: Path, md5_path: Path | None = None, md5_filename: str = "md5checksums.txt") -> bool:
"""Check all files checksums with the sums listed in a checksum file, if available.
Return False if there is no checksum file, or a file is missing, or has a wrong checksum.
Args:
dl_dir: Path location to containing downloaded FTP files.
md5_path: Full path to an MD5 checksum file.
md5_filename: Name of a checksum file in the `dl_dir` (used if no `md5_path` is given).
"""
# Get or set md5 file to user or default setting
if md5_path is None:
@@ -127,14 +126,14 @@ def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str =
return True


def get_checksums(checksum_path: Path) -> Dict[str, str]:
"""
Get a dict of checksums from a file, with file names as keys and sums as values
def get_checksums(checksum_path: Path) -> dict[str, str]:
"""Get a dict of checksums from a file, with file names as keys and sums as values
Args:
checksum_path: Path location to MD5 checksum file.
"""
sums: Dict[str, str] = {}
sums: dict[str, str] = {}
if not checksum_path.is_file():
return sums
with checksum_path.open(mode="r") as fh:
@@ -147,16 +146,15 @@ def get_checksums(checksum_path: Path) -> Dict[str, str]:


def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo: int) -> None:
"""
Given an INSDC accession, download all available files from the ftp to the download dir
"""Given an INSDC accession, download all available files from the ftp to the download dir
Args:
ftp_connection: An open FTP connection object
accession: Genome assembly accession.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum FTP connection retry attempts.
"""
"""
# Get the list of assemblies for this accession
for ftp_dir, _ in ftp_connection.mlsd():
if re.search(accession, ftp_dir):
@@ -176,12 +174,16 @@ def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo:
_download_file(ftp_connection, ftp_file, md5_sums, dl_dir, max_redo)
else:
logging.warning(
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection"
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection",
)


def _download_file(
ftp_connection: FTP, ftp_file: str, md5_sums: Dict[str, str], dl_dir: Path, max_redo: int = 0
ftp_connection: FTP,
ftp_file: str,
md5_sums: dict[str, str],
dl_dir: Path,
max_redo: int = 0,
) -> None:
"""Downloads individual files from FTP server.
@@ -191,6 +193,7 @@
md5_sums: Dictionary of key value pairs filename - md5_checksums.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum number of connection retry attempts.
"""
has_md5 = True
expected_sum = ""
@@ -238,7 +241,7 @@ def _download_file(
raise FileDownloadError(f"Could not download file {ftp_file} after {redo} tries")


def get_files_selection(dl_dir: Path) -> Dict[str, str]:
def get_files_selection(dl_dir: Path) -> dict[str, str]:
"""Returns a dictionary with the relevant downloaded files classified.
Args:
@@ -249,6 +252,7 @@
Raises:
FileDownloadError: If `dl_dir` tree does not include a file named `*_assembly_report.txt`.
"""
files = {}
root_name = get_root_name(dl_dir)
@@ -267,6 +271,7 @@ def get_root_name(dl_dir: Path) -> str:
Args:
dl_dir: Path location of downloaded FTP files.
"""
root_name = ""
for dl_file in dl_dir.iterdir():
@@ -294,6 +299,7 @@ def retrieve_assembly_data(
Raises:
FileDownloadError: If no files are downloaded or if any does not match its MD5 checksum.
"""
download_dir = Path(download_dir)

@@ -304,7 +310,7 @@
if not md5_files(download_dir, None):
logging.info(" Download the files")

for increment in range(0, max_increment + 1):
for increment in range(max_increment + 1):
if increment > 0:
logging.info(f" Increment accession version once from {accession}")
version = int(accession[-1])
@@ -331,7 +337,9 @@ def main() -> None:
parser = ArgumentParser(description="Download an assembly data files from INSDC or RefSeq.")
parser.add_argument("--accession", required=True, help="Genome assembly accession")
parser.add_argument_dst_path(
"--download_dir", default=Path.cwd(), help="Folder where the data will be downloaded"
"--download_dir",
default=Path.cwd(),
help="Folder where the data will be downloaded",
)
parser.add_log_arguments()
args = parser.parse_args()
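`md5_files` and `get_checksums` above validate every downloaded file against an MD5 checksum file before trusting the download. The one fix in this file that is not purely cosmetic-looking, `range(0, max_increment + 1)` → `range(max_increment + 1)`, is behaviour-preserving, since `range` starts at 0 by default. A minimal standalone sketch of the checksum verification, assuming the usual `<hex-digest>  <filename>` line format of NCBI's md5checksums.txt (this is not the module's actual implementation):

import hashlib
from pathlib import Path

def verify_md5(dl_dir: Path, checksum_file: str = "md5checksums.txt") -> bool:
    """Return True only if every file listed in the checksum file matches its MD5 digest."""
    sums_path = dl_dir / checksum_file
    if not sums_path.is_file():
        return False  # no checksum file, nothing can be verified
    for line in sums_path.read_text().splitlines():
        if not line.strip():
            continue  # skip blank lines
        expected, name = line.split()  # assumes no spaces in file names
        target = dl_dir / Path(name).name
        if not target.is_file():
            return False  # listed file was not downloaded
        if hashlib.md5(target.read_bytes()).hexdigest() != expected:
            return False  # corrupted or truncated download
    return True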
42 changes: 30 additions & 12 deletions src/python/ensembl/io/genomio/assembly/status.py
@@ -100,21 +100,21 @@ def singularity_image_setter(sif_cache_dir: Path | None, datasets_version: str |
Returns:
`spython.main.client` instance of singularity container image housing `datasets`.
"""
"""
# Set singularity cache dir from user defined path or use environment
if sif_cache_dir and sif_cache_dir.is_dir():
image_dl_path = sif_cache_dir
logging.info(f"Using user-defined cache_dir: '{image_dl_path}'")
elif os.environ.get("NXF_SINGULARITY_CACHEDIR"):
image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"])
logging.info(
f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}"
f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}",
)
elif os.environ.get("SINGULARITY_CACHEDIR"):
image_dl_path = Path(os.environ["SINGULARITY_CACHEDIR"])
logging.info(
f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}"
f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}",
)
else:
image_dl_path = Path()
@@ -142,6 +142,7 @@ def get_assembly_accessions(src_file: StrPath) -> list[str]:
Raises:
UnsupportedFormatError: If an accession does not match the INSDC assembly accession format.
"""
query_accessions: list[str] = []
with Path(src_file).open(mode="r") as fin:
@@ -165,8 +166,8 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
Returns:
Dict of core database names (key) and their corresponding INSDC assembly accession (value).
"""
"""
core_accn_meta = {}
database_count = 0
count_accn_found = 0
@@ -179,7 +180,7 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
db_connection = DBConnection(db_connection_url)
with db_connection.begin() as conn:
query_result = conn.execute(
text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";')
text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";'),
).fetchall()

if not query_result:
@@ -193,14 +194,17 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
logging.warning(f"Core {core_db} has {len(query_result)} assembly.accessions")

logging.info(
f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions"
f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions",
)

return core_accn_meta


def fetch_datasets_reports(
sif_image: Client, assembly_accessions: dict[str, str], download_directory: StrPath, batch_size: int
sif_image: Client,
assembly_accessions: dict[str, str],
download_directory: StrPath,
batch_size: int,
) -> dict[str, dict]:
"""Obtain assembly reports in JSON format for each assembly accession via `datasets` CLI.
@@ -229,7 +233,10 @@
for accessions in accn_subsample:
# Make call to singularity datasets providing a multi-accession query
client_return = Client.execute(
image=sif_image, command=datasets_command + accessions, return_result=True, quiet=True
image=sif_image,
command=datasets_command + accessions,
return_result=True,
quiet=True,
)
raw_result = client_return["message"]

@@ -272,6 +279,7 @@ def extract_assembly_metadata(assembly_reports: dict[str, dict]) -> dict[str, Re
Returns:
Parsed assembly report meta (source, meta).
"""
parsed_meta = {}

@@ -329,6 +337,7 @@ def generate_report_tsv(
query_type: Type of query (either core databases or accessions).
output_directory: Directory to store report TSV file.
outfile_name: Name to give to the output TSV file.
"""
tsv_outfile = Path(output_directory, f"{outfile_name}.tsv")

@@ -384,7 +393,9 @@ def main() -> None:
subparsers = parser.add_subparsers(title="report assembly status from", required=True, dest="src")
# Specific arguments required when using Ensembl core database names as source
core_db_parser = subparsers.add_parser(
"core_db", parents=[base_parser], help="list of Ensembl core databases"
"core_db",
parents=[base_parser],
help="list of Ensembl core databases",
)
core_db_parser.add_argument_src_path(
"--input",
@@ -394,10 +405,14 @@
core_db_parser.add_server_arguments()
# Specific arguments required when using assembly accessions as source
accessions_parser = subparsers.add_parser(
"accession", parents=[base_parser], help="list of INSDC accessions"
"accession",
parents=[base_parser],
help="list of INSDC accessions",
)
accessions_parser.add_argument_src_path(
"--input", required=True, help="file path with list of assembly INSDC query accessions"
"--input",
required=True,
help="file path with list of assembly INSDC query accessions",
)

args = parser.parse_args()
@@ -414,7 +429,10 @@

# Datasets query implementation for one or more batched accessions
assembly_reports = fetch_datasets_reports(
datasets_image, query_accessions, args.reports_dir, args.datasets_batch_size
datasets_image,
query_accessions,
args.reports_dir,
args.datasets_batch_size,
)

# Extract the key assembly report meta information for reporting status
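`fetch_datasets_reports` above queries the `datasets` CLI with batches of accessions rather than one call per accession. A common way to build such fixed-size batches, sketched here for illustration (the repository may slice its accession list differently):

def batch(items: list[str], batch_size: int) -> list[list[str]]:
    """Split items into consecutive chunks of at most batch_size elements."""
    return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]

# Example: batch(["GCA_1", "GCA_2", "GCA_3"], 2) -> [["GCA_1", "GCA_2"], ["GCA_3"]]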
(Diffs for the remaining 65 changed files are not shown.)