add automatic fixes provided by ruff
JAlvarezJarreta committed Nov 6, 2024
1 parent 902b1c7 commit 16da59b
Showing 68 changed files with 1,016 additions and 635 deletions.
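
Nearly all of the edits below are mechanical fixes applied by ruff (presumably via its auto-fix mode, e.g. `ruff check --fix`, plus formatting): deprecated `typing` aliases are rewritten to the builtin generics of PEP 585 and the `X | None` union syntax of PEP 604, over-long calls and signatures are exploded to one argument per line with a trailing comma, and docstrings gain a blank line before their closing quotes. As an illustrative sketch of the typing rewrite (not code from this commit):

# Before: aliases from typing, deprecated since Python 3.9 (PEP 585)
from typing import Dict, List, Optional, Tuple

def lookup(ids: List[str]) -> Optional[Dict[str, Tuple[str, str]]]:
    ...

# After: builtin generics (PEP 585) and union syntax (PEP 604)
def lookup(ids: list[str]) -> dict[str, tuple[str, str]] | None:
    ...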
36 changes: 24 additions & 12 deletions src/python/ensembl/io/genomio/annotation/update_description.py
@@ -21,7 +21,7 @@

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any

from sqlalchemy.orm import Session
from sqlalchemy import and_, select
@@ -39,18 +39,18 @@
"transcript": "transcript",
}

FeatStruct = Tuple[str, str, str]
FeatStruct = tuple[str, str, str]


def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> dict[str, FeatStruct]:
"""Returns the table descriptions from a core database.
Args:
session: Session open on a core database.
table: "gene" or "transcript" table from the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.
"""
"""
if table == "gene":
stmt = (
select(Gene.gene_id, Gene.stable_id, Gene.description, Xref.dbprimary_acc)
@@ -103,6 +103,7 @@ def load_descriptions(
report: Print the mapping of changes to perform in the standard output.
do_update: Actually update the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.
"""
func = get_json(func_file)
logging.info(f"{len(func)} annotations from {func_file}")
@@ -125,7 +126,13 @@
}
# Compare, only keep the descriptions that have changed
features_to_update = _get_features_to_update(
table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs
table,
feat_func,
feat_data,
stats,
report=report,
do_update=do_update,
match_xrefs=match_xrefs,
)

# Show stats for this feature type
@@ -141,8 +148,10 @@


def _get_cur_feat(
feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False
) -> Optional[FeatStruct]:
feat_data: dict[str, FeatStruct],
new_feat: dict[str, Any],
match_xrefs: bool = False,
) -> FeatStruct | None:
"""Match a feature ID, synonyms or xrefs to a core stable ID and return the matching core feature.
Returns None if no match.
@@ -169,14 +178,14 @@

def _get_features_to_update(
table: str,
feat_func: List[Dict[str, Any]],
feat_data: Dict[str, FeatStruct],
stats: Dict[str, int],
feat_func: list[dict[str, Any]],
feat_data: dict[str, FeatStruct],
stats: dict[str, int],
*,
report: bool = False,
do_update: bool = False,
match_xrefs: bool = True,
) -> List[Dict[str, Any]]:
) -> list[dict[str, Any]]:
"""Checks a list of features and returns those whose description we want to update.
Args:
@@ -190,6 +199,7 @@
Returns:
The list of features with their operation changed to update or insert.
"""
to_update = []
for new_feat in feat_func:
@@ -247,7 +257,9 @@ def main() -> None:
parser.add_argument("--report", action="store_true", help="Show what change would be made")
parser.add_argument("--update", action="store_true", help="Make the changes to the database")
parser.add_argument(
"--match_xrefs", action="store_true", help="Use xref IDs to match features if IDs do not work"
"--match_xrefs",
action="store_true",
help="Use xref IDs to match features if IDs do not work",
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()
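Two other auto-fix patterns recur in this file and throughout the commit: a blank line is inserted between a docstring's last section and its closing quotes (a pydocstyle-family rule), and calls carrying a trailing comma are kept exploded to one argument per line by the formatter. A hypothetical sketch of both patterns (the names are illustrative, not from this repository):

def run_query(record_id: str, description: str | None, *, dry_run: bool) -> None:
    """Stand-in for a database call (hypothetical)."""
    print(record_id, description, dry_run)

def update_record(record_id: str, description: str | None = None) -> None:
    """Update the description of one record.

    Args:
        record_id: Stable ID of the record to update.
        description: New description, or None to leave it unchanged.

    """
    # The blank line before the closing quotes above is the docstring fix;
    # the trailing comma below pins the call to one argument per line.
    run_query(
        record_id,
        description,
        dry_run=False,
    )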
42 changes: 25 additions & 17 deletions src/python/ensembl/io/genomio/assembly/download.py
@@ -34,7 +34,6 @@
from pathlib import Path
import re
import time
from typing import Dict, Optional

from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging_with_args
@@ -70,8 +69,8 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
Raises:
UnsupportedFormatError: If `accession` does not follow INSDC's accession format.
"""
"""
match = re.match(r"^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})(\.[0-9]+)?$", accession)
if not match:
raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}")
@@ -89,15 +88,15 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
return ftp_conn


def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str = "md5checksums.txt") -> bool:
"""
Check all files checksums with the sums listed in a checksum file, if available.
def md5_files(dl_dir: Path, md5_path: Path | None = None, md5_filename: str = "md5checksums.txt") -> bool:
"""Check all files checksums with the sums listed in a checksum file, if available.
Return False if there is no checksum file, or a file is missing, or has a wrong checksum.
Args:
dl_dir: Path location to containing downloaded FTP files.
md5_path: Full path to an MD5 checksum file.
md5_filename: Name of a checksum file in the `dl_dir` (used if no `md5_path` is given).
"""
# Get or set md5 file to user or default setting
if md5_path is None:
@@ -127,14 +126,14 @@ def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str =
return True


def get_checksums(checksum_path: Path) -> Dict[str, str]:
"""
Get a dict of checksums from a file, with file names as keys and sums as values
def get_checksums(checksum_path: Path) -> dict[str, str]:
"""Get a dict of checksums from a file, with file names as keys and sums as values
Args:
checksum_path: Path location to MD5 checksum file.
"""
sums: Dict[str, str] = {}
sums: dict[str, str] = {}
if not checksum_path.is_file():
return sums
with checksum_path.open(mode="r") as fh:
@@ -147,16 +146,15 @@ def get_checksums(checksum_path: Path) -> Dict[str, str]:


def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo: int) -> None:
"""
Given an INSDC accession, download all available files from the ftp to the download dir
"""Given an INSDC accession, download all available files from the ftp to the download dir
Args:
ftp_connection: An open FTP connection object
accession: Genome assembly accession.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum FTP connection retry attempts.
"""
"""
# Get the list of assemblies for this accession
for ftp_dir, _ in ftp_connection.mlsd():
if re.search(accession, ftp_dir):
@@ -176,12 +174,16 @@ def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo:
_download_file(ftp_connection, ftp_file, md5_sums, dl_dir, max_redo)
else:
logging.warning(
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection"
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection",
)


def _download_file(
ftp_connection: FTP, ftp_file: str, md5_sums: Dict[str, str], dl_dir: Path, max_redo: int = 0
ftp_connection: FTP,
ftp_file: str,
md5_sums: dict[str, str],
dl_dir: Path,
max_redo: int = 0,
) -> None:
"""Downloads individual files from FTP server.
@@ -191,6 +193,7 @@
md5_sums: Dictionary of key value pairs filename - md5_checksums.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum number of connection retry attempts.
"""
has_md5 = True
expected_sum = ""
@@ -238,7 +241,7 @@ def _download_file(
raise FileDownloadError(f"Could not download file {ftp_file} after {redo} tries")


def get_files_selection(dl_dir: Path) -> Dict[str, str]:
def get_files_selection(dl_dir: Path) -> dict[str, str]:
"""Returns a dictionary with the relevant downloaded files classified.
Args:
@@ -249,6 +252,7 @@
Raises:
FileDownloadError: If `dl_dir` tree does not include a file named `*_assembly_report.txt`.
"""
files = {}
root_name = get_root_name(dl_dir)
@@ -267,6 +271,7 @@ def get_root_name(dl_dir: Path) -> str:
Args:
dl_dir: Path location of downloaded FTP files.
"""
root_name = ""
for dl_file in dl_dir.iterdir():
@@ -294,6 +299,7 @@ def retrieve_assembly_data(
Raises:
FileDownloadError: If no files are downloaded or if any does not match its MD5 checksum.
"""
download_dir = Path(download_dir)

@@ -304,7 +310,7 @@
if not md5_files(download_dir, None):
logging.info(" Download the files")

for increment in range(0, max_increment + 1):
for increment in range(max_increment + 1):
if increment > 0:
logging.info(f" Increment accession version once from {accession}")
version = int(accession[-1])
@@ -331,7 +337,9 @@ def main() -> None:
parser = ArgumentParser(description="Download an assembly data files from INSDC or RefSeq.")
parser.add_argument("--accession", required=True, help="Genome assembly accession")
parser.add_argument_dst_path(
"--download_dir", default=Path.cwd(), help="Folder where the data will be downloaded"
"--download_dir",
default=Path.cwd(),
help="Folder where the data will be downloaded",
)
parser.add_log_arguments()
args = parser.parse_args()
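`md5_files` and `get_checksums` above validate every downloaded file against an MD5 checksum file before trusting the download. The one fix in this file that is not purely cosmetic-looking, `range(0, max_increment + 1)` → `range(max_increment + 1)`, is behaviour-preserving, since `range` starts at 0 by default. A minimal standalone sketch of the checksum verification, assuming the usual `<hex-digest>  <filename>` line format of NCBI's md5checksums.txt (this is not the module's actual implementation):

import hashlib
from pathlib import Path

def verify_md5(dl_dir: Path, checksum_file: str = "md5checksums.txt") -> bool:
    """Return True only if every file listed in the checksum file matches its MD5 digest."""
    sums_path = dl_dir / checksum_file
    if not sums_path.is_file():
        return False  # no checksum file, nothing can be verified
    for line in sums_path.read_text().splitlines():
        if not line.strip():
            continue  # skip blank lines
        expected, name = line.split()  # assumes no spaces in file names
        target = dl_dir / Path(name).name
        if not target.is_file():
            return False  # listed file was not downloaded
        if hashlib.md5(target.read_bytes()).hexdigest() != expected:
            return False  # corrupted or truncated download
    return True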
42 changes: 30 additions & 12 deletions src/python/ensembl/io/genomio/assembly/status.py
@@ -100,21 +100,21 @@ def singularity_image_setter(sif_cache_dir: Path | None, datasets_version: str |
Returns:
`spython.main.client` instance of singularity container image housing `datasets`.
"""
"""
# Set singularity cache dir from user defined path or use environment
if sif_cache_dir and sif_cache_dir.is_dir():
image_dl_path = sif_cache_dir
logging.info(f"Using user-defined cache_dir: '{image_dl_path}'")
elif os.environ.get("NXF_SINGULARITY_CACHEDIR"):
image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"])
logging.info(
f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}"
f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}",
)
elif os.environ.get("SINGULARITY_CACHEDIR"):
image_dl_path = Path(os.environ["SINGULARITY_CACHEDIR"])
logging.info(
f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}"
f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}",
)
else:
image_dl_path = Path()
@@ -142,6 +142,7 @@ def get_assembly_accessions(src_file: StrPath) -> list[str]:
Raises:
UnsupportedFormatError: If an accession does not match the INSDC assembly accession format.
"""
query_accessions: list[str] = []
with Path(src_file).open(mode="r") as fin:
@@ -165,8 +166,8 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
Returns:
Dict of core database names (key) and their corresponding INSDC assembly accession (value).
"""
"""
core_accn_meta = {}
database_count = 0
count_accn_found = 0
@@ -179,7 +180,7 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
db_connection = DBConnection(db_connection_url)
with db_connection.begin() as conn:
query_result = conn.execute(
text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";')
text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";'),
).fetchall()

if not query_result:
@@ -193,14 +194,17 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s
logging.warning(f"Core {core_db} has {len(query_result)} assembly.accessions")

logging.info(
f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions"
f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions",
)

return core_accn_meta


def fetch_datasets_reports(
sif_image: Client, assembly_accessions: dict[str, str], download_directory: StrPath, batch_size: int
sif_image: Client,
assembly_accessions: dict[str, str],
download_directory: StrPath,
batch_size: int,
) -> dict[str, dict]:
"""Obtain assembly reports in JSON format for each assembly accession via `datasets` CLI.
@@ -229,7 +233,10 @@
for accessions in accn_subsample:
# Make call to singularity datasets providing a multi-accession query
client_return = Client.execute(
image=sif_image, command=datasets_command + accessions, return_result=True, quiet=True
image=sif_image,
command=datasets_command + accessions,
return_result=True,
quiet=True,
)
raw_result = client_return["message"]

@@ -272,6 +279,7 @@ def extract_assembly_metadata(assembly_reports: dict[str, dict]) -> dict[str, Re
Returns:
Parsed assembly report meta (source, meta).
"""
parsed_meta = {}

@@ -329,6 +337,7 @@ def generate_report_tsv(
query_type: Type of query (either core databases or accessions).
output_directory: Directory to store report TSV file.
outfile_name: Name to give to the output TSV file.
"""
tsv_outfile = Path(output_directory, f"{outfile_name}.tsv")

@@ -384,7 +393,9 @@ def main() -> None:
subparsers = parser.add_subparsers(title="report assembly status from", required=True, dest="src")
# Specific arguments required when using Ensembl core database names as source
core_db_parser = subparsers.add_parser(
"core_db", parents=[base_parser], help="list of Ensembl core databases"
"core_db",
parents=[base_parser],
help="list of Ensembl core databases",
)
core_db_parser.add_argument_src_path(
"--input",
@@ -394,10 +405,14 @@
core_db_parser.add_server_arguments()
# Specific arguments required when using assembly accessions as source
accessions_parser = subparsers.add_parser(
"accession", parents=[base_parser], help="list of INSDC accessions"
"accession",
parents=[base_parser],
help="list of INSDC accessions",
)
accessions_parser.add_argument_src_path(
"--input", required=True, help="file path with list of assembly INSDC query accessions"
"--input",
required=True,
help="file path with list of assembly INSDC query accessions",
)

args = parser.parse_args()
@@ -414,7 +429,10 @@

# Datasets query implementation for one or more batched accessions
assembly_reports = fetch_datasets_reports(
datasets_image, query_accessions, args.reports_dir, args.datasets_batch_size
datasets_image,
query_accessions,
args.reports_dir,
args.datasets_batch_size,
)

# Extract the key assembly report meta information for reporting status
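`fetch_datasets_reports` above queries the `datasets` CLI with batches of accessions rather than one call per accession. A common way to build such fixed-size batches, sketched here for illustration (the repository may slice its accession list differently):

def batch(items: list[str], batch_size: int) -> list[list[str]]:
    """Split items into consecutive chunks of at most batch_size elements."""
    return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]

# Example: batch(["GCA_1", "GCA_2", "GCA_3"], 2) -> [["GCA_1", "GCA_2"], ["GCA_3"]]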
(Diffs for the remaining 65 changed files are not shown.)