Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run ruff's linter and fix any relevant errors raised #452

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,32 @@ show-fixes = true
[tool.ruff.format]
docstring-code-format = true

[tool.ruff.lint]
select = [
"ALL",
]
ignore = [
"COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/
"D107", # https://docs.astral.sh/ruff/rules/undocumented-public-init/
"D203", # https://docs.astral.sh/ruff/rules/one-blank-line-before-class/
"D211", # https://docs.astral.sh/ruff/rules/blank-line-before-class/
"D213", # https://docs.astral.sh/ruff/rules/multi-line-summary-second-line/
"G004", # https://docs.astral.sh/ruff/rules/logging-f-string/
"I001", # https://docs.astral.sh/ruff/rules/unsorted-imports/
"ICN001", # https://docs.astral.sh/ruff/rules/unconventional-import-alias/
"ISC001", # https://docs.astral.sh/ruff/rules/single-line-implicit-string-concatenation/
"UP035", # https://docs.astral.sh/ruff/rules/deprecated-import/
]

[tool.ruff.lint.per-file-ignores]
# Ignore `F403` (unable to detect undefined names) in all `__init__.py` files
"__init__.py" = ["F403"]
# Ignore the following checks in all pytest files
"src/python/tests/**.py" = [
"INP001", # https://docs.astral.sh/ruff/rules/implicit-namespace-package/
"PLR0913", # https://docs.astral.sh/ruff/rules/too-many-arguments/
"S101", # https://docs.astral.sh/ruff/rules/assert/
]

[tool.mypy]
mypy_path = "src/python"
Expand Down
36 changes: 24 additions & 12 deletions src/python/ensembl/io/genomio/annotation/update_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any

from sqlalchemy.orm import Session
from sqlalchemy import and_, select
Expand All @@ -39,18 +39,18 @@
"transcript": "transcript",
}

FeatStruct = Tuple[str, str, str]
FeatStruct = tuple[str, str, str]


def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> dict[str, FeatStruct]:
"""Returns the table descriptions from a core database.

Args:
session: Session open on a core database.
table: "gene" or "transcript" table from the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.
"""

"""
if table == "gene":
stmt = (
select(Gene.gene_id, Gene.stable_id, Gene.description, Xref.dbprimary_acc)
Expand Down Expand Up @@ -103,6 +103,7 @@ def load_descriptions(
report: Print the mapping of changes to perform in the standard output.
do_update: Actually update the core database.
match_xrefs: If the IDs do not match, try to match an Xref ID instead.

"""
func = get_json(func_file)
logging.info(f"{len(func)} annotations from {func_file}")
Expand All @@ -125,7 +126,13 @@ def load_descriptions(
}
# Compare, only keep the descriptions that have changed
features_to_update = _get_features_to_update(
table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs
table,
feat_func,
feat_data,
stats,
report=report,
do_update=do_update,
match_xrefs=match_xrefs,
)

# Show stats for this feature type
Expand All @@ -141,8 +148,10 @@ def load_descriptions(


def _get_cur_feat(
feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False
) -> Optional[FeatStruct]:
feat_data: dict[str, FeatStruct],
new_feat: dict[str, Any],
match_xrefs: bool = False,
) -> FeatStruct | None:
"""Match a feature ID, synonyms or xrefs to a core stable ID and return the matching core feature.

Returns None if no match.
Expand All @@ -169,14 +178,14 @@ def _get_cur_feat(

def _get_features_to_update(
table: str,
feat_func: List[Dict[str, Any]],
feat_data: Dict[str, FeatStruct],
stats: Dict[str, int],
feat_func: list[dict[str, Any]],
feat_data: dict[str, FeatStruct],
stats: dict[str, int],
*,
report: bool = False,
do_update: bool = False,
match_xrefs: bool = True,
) -> List[Dict[str, Any]]:
) -> list[dict[str, Any]]:
"""Checks a list of features and returns those whose description we want to update.

Args:
Expand All @@ -190,6 +199,7 @@ def _get_features_to_update(

Returns:
The list of features with their operation changed to update or insert.

"""
to_update = []
for new_feat in feat_func:
Expand Down Expand Up @@ -247,7 +257,9 @@ def main() -> None:
parser.add_argument("--report", action="store_true", help="Show what change would be made")
parser.add_argument("--update", action="store_true", help="Make the changes to the database")
parser.add_argument(
"--match_xrefs", action="store_true", help="Use xref IDs to match features if IDs do not work"
"--match_xrefs",
action="store_true",
help="Use xref IDs to match features if IDs do not work",
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()
Expand Down
42 changes: 25 additions & 17 deletions src/python/ensembl/io/genomio/assembly/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
from pathlib import Path
import re
import time
from typing import Dict, Optional

from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging_with_args
Expand Down Expand Up @@ -70,8 +69,8 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:

Raises:
UnsupportedFormatError: If `accession` does not follow INSDC's accession format.
"""

"""
match = re.match(r"^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})(\.[0-9]+)?$", accession)
if not match:
raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}")
Expand All @@ -89,15 +88,15 @@ def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
return ftp_conn


def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str = "md5checksums.txt") -> bool:
"""
Check all files checksums with the sums listed in a checksum file, if available.
def md5_files(dl_dir: Path, md5_path: Path | None = None, md5_filename: str = "md5checksums.txt") -> bool:
"""Check all files checksums with the sums listed in a checksum file, if available.
Return False if there is no checksum file, or a file is missing, or has a wrong checksum.

Args:
dl_dir: Path location to containing downloaded FTP files.
md5_path: Full path to an MD5 checksum file.
md5_filename: Name of a checksum file in the `dl_dir` (used if no `md5_path` is given).

"""
# Get or set md5 file to user or default setting
if md5_path is None:
Expand Down Expand Up @@ -127,14 +126,14 @@ def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str =
return True


def get_checksums(checksum_path: Path) -> Dict[str, str]:
"""
Get a dict of checksums from a file, with file names as keys and sums as values
def get_checksums(checksum_path: Path) -> dict[str, str]:
"""Get a dict of checksums from a file, with file names as keys and sums as values

Args:
checksum_path: Path location to MD5 checksum file.

"""
sums: Dict[str, str] = {}
sums: dict[str, str] = {}
if not checksum_path.is_file():
return sums
with checksum_path.open(mode="r") as fh:
Expand All @@ -147,16 +146,15 @@ def get_checksums(checksum_path: Path) -> Dict[str, str]:


def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo: int) -> None:
"""
Given an INSDC accession, download all available files from the ftp to the download dir
"""Given an INSDC accession, download all available files from the ftp to the download dir

Args:
ftp_connection: An open FTP connection object
accession: Genome assembly accession.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum FTP connection retry attempts.
"""

"""
# Get the list of assemblies for this accession
for ftp_dir, _ in ftp_connection.mlsd():
if re.search(accession, ftp_dir):
Expand All @@ -176,12 +174,16 @@ def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo:
_download_file(ftp_connection, ftp_file, md5_sums, dl_dir, max_redo)
else:
logging.warning(
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection"
f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection",
)


def _download_file(
ftp_connection: FTP, ftp_file: str, md5_sums: Dict[str, str], dl_dir: Path, max_redo: int = 0
ftp_connection: FTP,
ftp_file: str,
md5_sums: dict[str, str],
dl_dir: Path,
max_redo: int = 0,
) -> None:
"""Downloads individual files from FTP server.

Expand All @@ -191,6 +193,7 @@ def _download_file(
md5_sums: Dictionary of key value pairs filename - md5_checksums.
dl_dir: Path to downloaded FTP files.
max_redo: Maximum number of connection retry attempts.

"""
has_md5 = True
expected_sum = ""
Expand Down Expand Up @@ -238,7 +241,7 @@ def _download_file(
raise FileDownloadError(f"Could not download file {ftp_file} after {redo} tries")


def get_files_selection(dl_dir: Path) -> Dict[str, str]:
def get_files_selection(dl_dir: Path) -> dict[str, str]:
"""Returns a dictionary with the relevant downloaded files classified.

Args:
Expand All @@ -249,6 +252,7 @@ def get_files_selection(dl_dir: Path) -> Dict[str, str]:

Raises:
FileDownloadError: If `dl_dir` tree does not include a file named `*_assembly_report.txt`.

"""
files = {}
root_name = get_root_name(dl_dir)
Expand All @@ -267,6 +271,7 @@ def get_root_name(dl_dir: Path) -> str:

Args:
dl_dir: Path location of downloaded FTP files.

"""
root_name = ""
for dl_file in dl_dir.iterdir():
Expand Down Expand Up @@ -294,6 +299,7 @@ def retrieve_assembly_data(

Raises:
FileDownloadError: If no files are downloaded or if any does not match its MD5 checksum.

"""
download_dir = Path(download_dir)

Expand All @@ -304,7 +310,7 @@ def retrieve_assembly_data(
if not md5_files(download_dir, None):
logging.info(" Download the files")

for increment in range(0, max_increment + 1):
for increment in range(max_increment + 1):
if increment > 0:
logging.info(f" Increment accession version once from {accession}")
version = int(accession[-1])
Expand All @@ -331,7 +337,9 @@ def main() -> None:
parser = ArgumentParser(description="Download an assembly data files from INSDC or RefSeq.")
parser.add_argument("--accession", required=True, help="Genome assembly accession")
parser.add_argument_dst_path(
"--download_dir", default=Path.cwd(), help="Folder where the data will be downloaded"
"--download_dir",
default=Path.cwd(),
help="Folder where the data will be downloaded",
)
parser.add_log_arguments()
args = parser.parse_args()
Expand Down
Loading