From 35a07a4804bc2a37005472fa22c84db8d15fa561 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 15 Sep 2023 12:02:59 +0200 Subject: [PATCH 01/58] feat: begin update to sqlalchemy2, update models and reqs --- ebel/manager/models.py | 32 +-- ebel/manager/orientdb/biodbs/bel.py | 4 + ebel/manager/orientdb/biodbs/biogrid.py | 28 ++- ebel/manager/orientdb/biodbs/clinvar.py | 3 +- ebel/manager/orientdb/biodbs/disgenet.py | 3 +- ebel/manager/orientdb/biodbs/intact.py | 5 +- ebel/manager/orientdb/biodbs/mirtarbase.py | 3 +- ebel/manager/orientdb/biodbs/nsides.py | 4 +- ebel/manager/orientdb/biodbs/stringdb.py | 6 +- ebel/manager/orientdb/biodbs/uniprot.py | 11 +- ebel/manager/orientdb/odb_meta.py | 9 +- ebel/manager/orientdb/urls.py | 2 +- ebel/manager/rdbms/models/biogrid.py | 100 ++++---- ebel/manager/rdbms/models/chebi.py | 152 ++++++------ .../rdbms/models/clinical_trials_gov.py | 81 ++++--- ebel/manager/rdbms/models/clinvar.py | 110 +++++---- ebel/manager/rdbms/models/disgenet.py | 66 +++--- ebel/manager/rdbms/models/drugbank.py | 150 ++++++------ ebel/manager/rdbms/models/ensembl.py | 23 +- ebel/manager/rdbms/models/expression_atlas.py | 106 +++++---- ebel/manager/rdbms/models/gwas_catalog.py | 82 +++---- ebel/manager/rdbms/models/hgnc.py | 223 +++++++++--------- ebel/manager/rdbms/models/human_ortholog.py | 25 +- ebel/manager/rdbms/models/intact.py | 21 +- ebel/manager/rdbms/models/iuphar.py | 142 +++++------ ebel/manager/rdbms/models/kegg.py | 21 +- ebel/manager/rdbms/models/mirtarbase.py | 21 +- ebel/manager/rdbms/models/ncbi.py | 178 +++++++------- ebel/manager/rdbms/models/nsides.py | 27 ++- ebel/manager/rdbms/models/pathway_commons.py | 42 ++-- ebel/manager/rdbms/models/protein_atlas.py | 95 ++++---- ebel/manager/rdbms/models/reactome.py | 13 +- ebel/manager/rdbms/models/stringdb.py | 67 +++--- ebel/manager/rdbms/models/uniprot.py | 48 ++-- pyproject.toml | 28 +-- requirements.txt | 32 +-- 36 files changed, 1023 insertions(+), 940 deletions(-) diff --git 
a/ebel/manager/models.py b/ebel/manager/models.py index 719860b..1ab5587 100755 --- a/ebel/manager/models.py +++ b/ebel/manager/models.py @@ -13,9 +13,9 @@ import requests import sqlalchemy from lark import Lark, Token, Tree -from sqlalchemy import Boolean, Column, ForeignKey, Index, Integer, String +from sqlalchemy import Boolean, ForeignKey, Index, Integer, String from sqlalchemy.ext.declarative import declarative_base, declared_attr -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column from sqlalchemy.sql.expression import func from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm @@ -55,7 +55,7 @@ def foreign_key_to(table_name): :rtype: sqlalchemy.Column """ foreign_column = table_name + ".id" - return Column(Integer, ForeignKey(foreign_column)) + return mapped_column(Integer, ForeignKey(foreign_column)) class MasterModel(object): @@ -71,7 +71,7 @@ def __tablename__(self): __mapper_args__ = {"always_refresh": True} - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) def _to_dict(self): """Protected method for converting values to dictionary.""" @@ -94,10 +94,10 @@ class Namespace(Base, MasterModel): __tablename__ = "namespace" __table_args__ = (Index("idx_url", "url", mysql_length=100),) - url = Column(String(2048), nullable=False) - keyword = Column(String(255), index=True) - cacheable = Column(Boolean) - case_sensitive = Column(Boolean) + url = mapped_column(String(2048), nullable=False) + keyword = mapped_column(String(255), index=True) + cacheable = mapped_column(Boolean) + case_sensitive = mapped_column(Boolean) entries = relationship("NamespaceEntry", back_populates="namespace") @@ -108,8 +108,8 @@ class NamespaceEntry(Base, MasterModel): __tablename__ = "namespace_entry" __table_args__ = (Index("idx_name", "name", mysql_length=100),) - name = Column(String(2048), nullable=True) - encoding = Column(String(8), nullable=True) + name = 
mapped_column(String(2048), nullable=True) + encoding = mapped_column(String(8), nullable=True) namespace__id = foreign_key_to("namespace") namespace = relationship("Namespace", back_populates="entries") @@ -121,10 +121,10 @@ class Annotation(Base, MasterModel): __tablename__ = "annotation" __table_args__ = (Index("idx_url2", "url", mysql_length=100),) - url = Column(String(2048), nullable=False) - keyword = Column(String(255), index=True) - cacheable = Column(Boolean) - case_sensitive = Column(Boolean) + url = mapped_column(String(2048), nullable=False) + keyword = mapped_column(String(255), index=True) + cacheable = mapped_column(Boolean) + case_sensitive = mapped_column(Boolean) entries = relationship("AnnotationEntry", back_populates="annotation", cascade="all, delete-orphan") @@ -135,8 +135,8 @@ class AnnotationEntry(Base, MasterModel): __tablename__ = "annotation_entry" __table_args__ = (Index("idx_identifier", "identifier", mysql_length=100),) - name = Column(String(2048), nullable=True) - identifier = Column(String(255), nullable=True) + name = mapped_column(String(2048), nullable=True) + identifier = mapped_column(String(255), nullable=True) annotation__id = foreign_key_to("annotation") annotation = relationship("Annotation", back_populates="entries") diff --git a/ebel/manager/orientdb/biodbs/bel.py b/ebel/manager/orientdb/biodbs/bel.py index 10c5106..d5a0f20 100644 --- a/ebel/manager/orientdb/biodbs/bel.py +++ b/ebel/manager/orientdb/biodbs/bel.py @@ -680,3 +680,7 @@ def insert_data(self) -> Dict[str, int]: def update_interactions(self) -> int: """Abstract method.""" pass + +if __name__ == "__main__": + b = Bel() + b.clinical_trials.update() \ No newline at end of file diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index c2e7e1c..63df5a5 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -1,5 +1,5 @@ """BioGrid.""" - +import logging import typing from enum 
import Enum from typing import Dict, Tuple @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel import tools @@ -18,6 +19,8 @@ STANDARD_NAMESPACES = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} +logger = logging.getLogger(__name__) + class BioGridNode: """Custom class definition for BioGRID nodes.""" @@ -311,6 +314,8 @@ def insert_data(self) -> Dict[str, int]: df.index += 1 df.index.rename("id", inplace=True) + logger.info("Insert BIOGRID data") + df.to_sql(biogrid.Biogrid.__tablename__, self.engine, if_exists="append") return {self.biodb_name: df.shape[0]} @@ -469,13 +474,14 @@ def get_uniprot_modification_pairs(self): where m.modification != 'No Modification' and ia.uniprot IS NOT NULL and ib.uniprot IS NOT NULL group by - ia.symbol, - ia.uniprot, - ia.taxonomy_id, - ib.symbol, - ib.uniprot, - ib.taxonomy_id""" - return [dict(x) for x in self.engine.execute(sql).fetchall()] + subject_symbol, + subject_uniprot, + subject_taxonomy_id, + object_symbol, + object_uniprot, + object_taxonomy_id""" + results = self.session.execute(text(sql)).fetchall() + return [x._asdict() for x in results] def get_create_pure_protein_rid_by_uniprot(self, taxonomy_id, symbol, uniprot): """Get pure protein rid by UniProt accession ID if the protein is involved in a BEL statement.""" @@ -561,8 +567,8 @@ def update_interactions(self) -> int: object_uniprot=e["object_uniprot"], ) - for row in self.engine.execute(sql).fetchall(): - row_dict = dict(row) + for row in self.session.execute(text(sql)).fetchall(): + row_dict = row._asdict() be = BioGridEdge(subject_rid=subj_pure_rid, object_rid=obj_pure_rid, **row_dict) edge_value_dict = be.get_edge_value_dict() @@ -620,4 +626,4 @@ def create_view(self): biogrid_modification m on (m.id=b.modification_id) left join biogrid_source s on (s.id=b.source_id) left join biogrid_publication p on (p.id=b.publication_id)""" - self.engine.execute(sql) + 
self.session.execute(text(sql)) diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index a5d0f47..ef8f237 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -5,6 +5,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -176,7 +177,7 @@ def get_disease_snps_dict(self) -> Dict[str, List[Snp]]: results = dict() for kwd in disease_keywords: sql = sql_temp.format(keyword=kwd) - rows = self.engine.execute(sql) + rows = self.session.execute(text(sql)) results[kwd] = [Snp(*x) for x in rows.fetchall()] return results diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index c1d27f7..d884350 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -4,6 +4,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -202,7 +203,7 @@ def update_snps(self) -> int: results = dict() for kwd in self.disease_keywords: sql = sql_temp.format(kwd) - rows = self.engine.execute(sql) + rows = self.session.execute(text(sql)) results[kwd] = rows inserted = 0 diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 39f8bc9..0a4d57d 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -5,6 +5,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -142,7 +143,7 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: return_value = () sql = f"""Select s.symbol, u.taxid from uniprot u inner join uniprot_gene_symbol s on (u.id=s.uniprot_id) where 
u.accession='{uniprot_accession}' limit 1""" - result = self.engine.execute(sql).fetchone() + result = self.session.execute(text(sql)).fetchone() taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} if result: name, taxid = result @@ -191,7 +192,7 @@ def update_interactions(self) -> int: for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): sql = sql_temp.format(uniprot_accession=uniprot_accession) - result = self.engine.execute(sql) + result = self.session.execute(text(sql)) for ( up_a, diff --git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index 14eec42..4085b2a 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -3,6 +3,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -70,7 +71,7 @@ def update_interactions(self) -> int: species_target_gene='Homo sapiens' and support_type in ('Functional MTI', 'Non-Functional MTI')""" cols = ["mi_rna", "symbol", "support_type", "pmid", "experiments"] - df_mirtarbase = pd.DataFrame(self.engine.execute(sql).fetchall(), columns=cols) + df_mirtarbase = pd.DataFrame(self.session.execute(text(sql)).fetchall(), columns=cols) df_mirtarbase.experiments = df_mirtarbase.experiments.str.split("//") df_join = df_mirtarbase.set_index("symbol").join(df_symbol_rid.set_index("symbol"), how="inner") diff --git a/ebel/manager/orientdb/biodbs/nsides.py b/ebel/manager/orientdb/biodbs/nsides.py index 50b5909..f9cb0a7 100644 --- a/ebel/manager/orientdb/biodbs/nsides.py +++ b/ebel/manager/orientdb/biodbs/nsides.py @@ -7,6 +7,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.constants import RID @@ -171,7 +172,8 @@ def update_bel(self) -> int: updated = 0 for drugbank_id, drugbank_rid in tqdm(drugbank_id_rids.items(), 
desc=f"Update {self.biodb_name.upper()}"): - for r in self.engine.execute(sql_temp.format(drugbank_id)): + sql = sql_temp.format(drugbank_id) + for r in self.session.execute(text(sql)): ( condition_meddra_id, condition_concept_name, diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index a92c005..27272e9 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -161,7 +162,7 @@ def get_stringdb_action_hgnc_set(self): """Get unique HGNC symbols from stringdb_actions table.""" sql = f"""(Select distinct( symbol1 ) from {self.table_action}) union (Select distinct( symbol2 ) from {self.table_action})""" - return set([x[0] for x in self.engine.execute(sql).fetchall()]) + return set([x[0] for x in self.session.execute(text(sql)).fetchall()]) def update_interactions(self) -> Dict[str, int]: """Update the edges with StringDB metadata.""" @@ -294,7 +295,8 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: updated = 0 for symbol in tqdm(symbols, desc="Update has_action_st edges"): - rows = self.engine.execute(sql_temp.format(symbol=symbol)) + sql = sql_temp.format(symbol=symbol) + rows = self.engine.execute(text(sql)) for row in rows.fetchall(): action = Action(*row) diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index bfdfe1d..4e4c131 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -9,6 +9,7 @@ import pandas as pd from lxml.etree import iterparse from pyorientdb import OrientDB +from sqlalchemy import text from tqdm import tqdm from ebel.defaults import default_tax_ids @@ -145,14 +146,14 @@ def insert_data(self) -> Dict[str, int]: """Insert UniProt data depending on NCBI 
taxonomy identifier.""" dialect = self.session.bind.dialect.name if dialect == "mysql": - self.engine.execute("SET FOREIGN_KEY_CHECKS=0") + self.session.execute(text("SET FOREIGN_KEY_CHECKS=0")) inserted = self.insert_uniprot() self.add_gene_symbols() self.session.commit() if dialect == "mysql": - self.engine.execute("SET FOREIGN_KEY_CHECKS=1") + self.session.execute(text("SET FOREIGN_KEY_CHECKS=1")) return {self.biodb_name: inserted} @@ -311,7 +312,7 @@ def _get_accesssion_recname(self, taxid, gene_symbol) -> Union[Tuple[str, str], f"Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs " f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' ) - results = self.engine.execute(sql) + results = self.session.execute(text(sql)) return results.fetchone() if results else None def _update_proteins(self, namespace, taxid) -> int: @@ -338,7 +339,7 @@ def _update_proteins(self, namespace, taxid) -> int: def _get_recname_taxid_by_accession_from_uniprot_api(self, accession) -> Tuple[str, int]: """Fetch uniprot entry by accession and adds to the database. 
Returns recommended name.""" sql = f"Select recommended_name,taxid from uniprot where accession='{accession}' limit 1" - result = self.engine.execute(sql).fetchone() + result = self.session.execute(text(sql)).fetchone() if result: return result @@ -353,7 +354,7 @@ def _update_uniprot_proteins(self) -> int: ) for protein in self.query(sql_uniprot).itertuples(index=False): sql = sql_temp.format(protein.accession) - found = self.engine.execute(sql).fetchone() + found = self.session.execute(text(sql)).fetchone() if found: recommended_name, taxid = found num_updated = self.execute(sql_update.format(recommended_name, taxid, protein.accession))[0] diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index cb9b95a..10c1972 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -25,6 +25,7 @@ PyOrientIndexException, PyOrientSecurityAccessException) from pyorientdb.otypes import OrientRecord +from sqlalchemy import text from sqlalchemy.sql.schema import Table from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm @@ -240,12 +241,12 @@ def clear_and_import_data(self) -> Dict[str, int]: return inserted def create_index_rdbms(self, table_name: str, columns): - """Creates index on column(s) in RDBMS.""" + """Creates index on column(s) in RDBMS.""" if isinstance(columns, str): columns = [columns] sql_columns = ",".join(columns) index_name = f"idx_{table_name}_" + "_".join(columns) - self.engine.execute(f"CREATE INDEX {index_name} ON {table_name} ({sql_columns})") + self.session.execute(f"CREATE INDEX {index_name} ON {table_name} ({sql_columns})") def clear_edges_by_bel_doc_rid(self, bel_document_rid: str, even_if_other_doc_rids_exists=True): """Delete all edges linked to a specified BEL document rID.""" @@ -819,7 +820,7 @@ def number_of_generics(self) -> Dict[str, int]: for table_name, table in self.tables_base.metadata.tables.items(): if self.table_exists(table_name): sql = 
f"Select count(*) from `{table_name}`" - numbers[table_name] = self.engine.execute(sql).fetchone()[0] + numbers[table_name] = self.session.execute(text(sql)).fetchone()[0] else: numbers[table_name] = 0 elif self.generic_classes: @@ -1348,7 +1349,7 @@ def get_set_gene_rids_by_position( for gene_type, sql in sqls.items(): if gene_type in gene_types: - results = self.engine.execute(sql) + results = self.session.execute(sql) for (symbol,) in results.fetchall(): bel = f'g(HGNC:"{symbol}")' data = { diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index cbf84f5..a6c31d2 100755 --- a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -79,7 +79,7 @@ IUPHAR_LIGANDS = "https://www.guidetopharmacology.org/DATA/ligands.csv" # CHEBI # -CHEBI_BASE = "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/" +CHEBI_BASE = "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/" CHEBI_CHEMICALDATA = f"{CHEBI_BASE}chemical_data.tsv" CHEBI_COMMENT = f"{CHEBI_BASE}comments.tsv" CHEBI_COMPOUND = f"{CHEBI_BASE}compounds.tsv.gz" diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index c3af157..d552d56 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -1,7 +1,7 @@ """BioGRID RDBMS model definition.""" -from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text +from sqlalchemy import Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,26 +12,28 @@ class Biogrid(Base): """Class definition for the biogrid table.""" __tablename__ = "biogrid" - id = Column(Integer, primary_key=True) - - biogrid_a_id = Column(Integer, ForeignKey("biogrid_interactor.biogrid_id")) - biogrid_a = relationship("Interactor", 
foreign_keys=[biogrid_a_id]) - biogrid_b_id = Column(Integer, ForeignKey("biogrid_interactor.biogrid_id")) - biogrid_b = relationship("Interactor", foreign_keys=[biogrid_b_id]) - biogrid_id = Column(Integer, nullable=True) - experimental_system_id = Column(Integer, ForeignKey("biogrid_experimental_system.id")) - experimental_system = relationship("ExperimentalSystem", foreign_keys=[experimental_system_id]) - throughput_id = Column(Integer, ForeignKey("biogrid_throughput.id")) - throughput = relationship("Throughput", foreign_keys=[throughput_id]) - score = Column(Float, nullable=True) - modification_id = Column(Integer, ForeignKey("biogrid_modification.id")) - modification = relationship("Modification", foreign_keys=[modification_id]) - qualifications = Column(String(255), nullable=True) - source_id = Column(Integer, ForeignKey("biogrid_source.id")) - source = relationship("Source", foreign_keys=[source_id]) - publication_id = Column(Integer, ForeignKey("biogrid_publication.id")) - publication = relationship("Publication", foreign_keys=[publication_id]) - qualification = Column(Text, nullable=True) + id = mapped_column(Integer, primary_key=True) + + biogrid_a_id: Mapped[int] = mapped_column(ForeignKey("biogrid_interactor.biogrid_id")) + biogrid_a: Mapped["Interactor"] = relationship("Interactor", foreign_keys=[biogrid_a_id]) + biogrid_b_id: Mapped[int] = mapped_column(ForeignKey("biogrid_interactor.biogrid_id")) + biogrid_b: Mapped["Interactor"] = relationship("Interactor", foreign_keys=[biogrid_b_id]) + biogrid_id: Mapped[int] = mapped_column(nullable=True) + experimental_system_id: Mapped[int] = mapped_column(ForeignKey("biogrid_experimental_system.id")) + experimental_system: Mapped["ExperimentalSystem"] = relationship( + "ExperimentalSystem", foreign_keys=[experimental_system_id] + ) + throughput_id: Mapped[int] = mapped_column(ForeignKey("biogrid_throughput.id")) + throughput: Mapped["Throughput"] = relationship("Throughput", foreign_keys=[throughput_id]) + 
score: Mapped[float] = mapped_column(nullable=True) + modification_id: Mapped[int] = mapped_column(ForeignKey("biogrid_modification.id"), nullable=True) + modification: Mapped["Modification"] = relationship("Modification", foreign_keys=[modification_id]) + qualifications: Mapped[str] = mapped_column(String(255), nullable=True) + source_id: Mapped[int] = mapped_column(ForeignKey("biogrid_source.id")) + source: Mapped["Source"] = relationship("Source", foreign_keys=[source_id]) + publication_id: Mapped[int] = mapped_column(ForeignKey("biogrid_publication.id")) + publication: Mapped["Publication"] = relationship("Publication", foreign_keys=[publication_id]) + qualification: Mapped[str] = mapped_column(Text, nullable=True) def as_dict(self): """Convert object values to dictionary.""" @@ -53,11 +55,11 @@ class Publication(Base): """Class definition for the biogrid_publication table.""" __tablename__ = "biogrid_publication" - id = Column(Integer, primary_key=True) - author_name = Column(String(255), nullable=True) - publication_year = Column(Integer, nullable=True) - source = Column(String(255), nullable=True) - source_identifier = Column(String(255), nullable=True) + id: Mapped[int] = mapped_column(primary_key=True) + author_name: Mapped[str] = mapped_column(String(255), nullable=True) + publication_year: Mapped[int] = mapped_column(nullable=True) + source: Mapped[str] = mapped_column(String(255), nullable=True) + source_identifier: Mapped[str] = mapped_column(String(255), nullable=True) def as_dict(self): """Convert object values to dictionary.""" @@ -68,9 +70,9 @@ class Throughput(Base): """Class definition for the biogrid_throughput table.""" __tablename__ = "biogrid_throughput" - id = Column(Integer, primary_key=True) - throughput = Column(String(255)) - frequency = Column(Integer) + id: Mapped[int] = mapped_column(primary_key=True) + throughput: Mapped[str] = mapped_column(String(255)) + frequency: Mapped[int] = mapped_column() def as_dict(self): """Convert object 
values to dictionary.""" @@ -81,8 +83,8 @@ class Taxonomy(Base): """Class definition for the biogrid_taxonomy table.""" __tablename__ = "biogrid_taxonomy" - taxonomy_id = Column(Integer, primary_key=True) # == NCBI Taxonomy ID - organism_name = Column(String(1000)) + taxonomy_id: Mapped[int] = mapped_column(primary_key=True) # == NCBI Taxonomy ID + organism_name: Mapped[str] = mapped_column(String(1000)) def as_dict(self): """Convert object values to dictionary.""" @@ -93,10 +95,10 @@ class ExperimentalSystem(Base): """Class definition for the biogrid_experimental_system table.""" __tablename__ = "biogrid_experimental_system" - id = Column(Integer, primary_key=True) - experimental_system = Column(String(255), nullable=True) - experimental_system_type = Column(String(255), nullable=True) - frequency = Column(Integer) + id: Mapped[int] = mapped_column(primary_key=True) + experimental_system: Mapped[str] = mapped_column(String(255), nullable=True) + experimental_system_type: Mapped[str] = mapped_column(String(255), nullable=True) + frequency: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" @@ -107,15 +109,15 @@ class Interactor(Base): """Class definition for the biogrid_interactor table.""" __tablename__ = "biogrid_interactor" - biogrid_id = Column(Integer, primary_key=True) + biogrid_id: Mapped[int] = mapped_column(primary_key=True) - entrez = Column(Integer, nullable=True, index=True) - systematic_name = Column(String(255), nullable=True, index=True) - symbol = Column(String(255), nullable=True, index=True) - taxonomy_id = Column(Integer, ForeignKey("biogrid_taxonomy.taxonomy_id")) - taxonomy = relationship("Taxonomy", foreign_keys=[taxonomy_id]) - uniprot = Column(String(255), nullable=True, index=True) - trembl = Column(String(1000), nullable=True) + entrez: Mapped[int] = mapped_column(nullable=True, index=True) + systematic_name: Mapped[str] = mapped_column(String(255), nullable=True, index=True) + symbol: Mapped[str] = 
mapped_column(String(255), nullable=True, index=True) + taxonomy_id: Mapped[int] = mapped_column(ForeignKey("biogrid_taxonomy.taxonomy_id")) + taxonomy: Mapped["Taxonomy"] = relationship("Taxonomy", foreign_keys=[taxonomy_id]) + uniprot: Mapped[str] = mapped_column(String(255), nullable=True, index=True) + trembl: Mapped[str] = mapped_column(String(1000), nullable=True) def as_dict(self): """Convert object values to dictionary.""" @@ -133,8 +135,8 @@ class Source(Base): """Class definition for the biogrid_source table.""" __tablename__ = "biogrid_source" - id = Column(Integer, primary_key=True) - source = Column(String(255), nullable=True) + id: Mapped[int] = mapped_column(primary_key=True) + source: Mapped[str] = mapped_column(String(255), nullable=True) def as_dict(self): """Convert object values to dictionary.""" @@ -145,9 +147,9 @@ class Modification(Base): """Class definition for the biogrid_modification table.""" __tablename__ = "biogrid_modification" - id = Column(Integer, primary_key=True) - modification = Column(String(255), nullable=True) - frequency = Column(Integer) + id: Mapped[int] = mapped_column(primary_key=True) + modification: Mapped[str] = mapped_column(String(255), nullable=True) + frequency: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/chebi.py b/ebel/manager/rdbms/models/chebi.py index 28ea3ce..99876ff 100644 --- a/ebel/manager/rdbms/models/chebi.py +++ b/ebel/manager/rdbms/models/chebi.py @@ -1,9 +1,11 @@ """CHEBI RDBMS model definition.""" +import datetime +from typing import List -from sqlalchemy import (Column, DateTime, ForeignKey, Index, Integer, String, +from sqlalchemy import (DateTime, ForeignKey, Index, Integer, String, Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped Base = declarative_base() @@ -12,14 +14,14 @@ class 
ChemicalData(Base): """Class definition for the chebi_chemical_data table.""" __tablename__ = "chebi_chemical_data" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - chemical_data = Column(Text, nullable=True) - source = Column(Text, nullable=False) - type = Column(Text, nullable=False) + chemical_data: Mapped[str] = mapped_column(Text, nullable=True) + source: Mapped[str] = mapped_column(Text, nullable=False) + type: Mapped[str] = mapped_column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="chemicalData") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped["Compound"] = relationship("Compound", back_populates="chemicalData") def __str__(self): """Class string definition.""" @@ -38,15 +40,15 @@ class Comment(Base): """Class definition for the chebi_comment table.""" __tablename__ = "chebi_comment" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - text = Column(Text, nullable=False) - created_on = Column(DateTime, nullable=False) - datatype = Column(String(80)) - datatype_id = Column(Integer, nullable=False) + text: Mapped[str] = mapped_column(Text, nullable=False) + created_on: Mapped[datetime.datetime] = mapped_column(DateTime, nullable=False) + datatype: Mapped[str] = mapped_column(String(80)) + datatype_id: Mapped[int] = mapped_column(nullable=False) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="comments") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped["Compound"] = relationship("Compound", back_populates="comments") def __str__(self): """Class string definition.""" @@ -64,27 +66,29 @@ class Compound(Base): """Class definition for the chebi_compound table.""" __tablename__ = "chebi_compound" - id = Column(Integer, 
primary_key=True) - - name = Column(String(2000)) - source = Column(String(32), nullable=False) - parent_id = Column(Integer) - chebi_accession = Column(String(30), nullable=False) - status = Column(String(1), nullable=False) - definition = Column(Text) - star = Column(Integer, nullable=False) - modified_on = Column(Text) - created_by = Column(Text) - - chemicalData = relationship("ChemicalData", back_populates="compounds") - comments = relationship("Comment", back_populates="compounds") - database_accessions = relationship("DatabaseAccession", back_populates="compounds") - names = relationship("Name", back_populates="compounds") - references = relationship("Reference", back_populates="compounds") + id: Mapped[int] = mapped_column(primary_key=True) + + name: Mapped[str] = mapped_column(String(2000), nullable=True) + source: Mapped[str] = mapped_column(String(32), nullable=False) + parent_id: Mapped[int] = mapped_column(nullable=True) + chebi_accession: Mapped[str] = mapped_column(String(30), nullable=False) + status: Mapped[str] = mapped_column(String(1), nullable=False) + definition: Mapped[str] = mapped_column(Text, nullable=True) + star: Mapped[int] = mapped_column(nullable=False) + modified_on: Mapped[str] = mapped_column(Text, nullable=True) + created_by: Mapped[int] = mapped_column(Text, nullable=True) + + chemicalData: Mapped[List["ChemicalData"]] = relationship("ChemicalData", back_populates="compounds") + comments: Mapped[List["Comment"]] = relationship("Comment", back_populates="compounds") + database_accessions: Mapped[List["DatabaseAccession"]] = relationship( + "DatabaseAccession", back_populates="compounds" + ) + names: Mapped[List["Name"]] = relationship("Name", back_populates="compounds") + references: Mapped[List["Reference"]] = relationship("Reference", back_populates="compounds") # final_id_relations = relationship("Relation", back_populates="final_id_compounds") # init_id_relations = relationship("Relation", back_populates="init_id_compounds") - 
structures = relationship("Structure", back_populates="compounds") - inchis = relationship("Inchi", back_populates="compounds") + structures: Mapped[List["Structure"]] = relationship("Structure", back_populates="compounds") + inchis: Mapped[List["Inchi"]] = relationship("Inchi", back_populates="compounds") def __str__(self): return self.name @@ -111,12 +115,12 @@ class Inchi(Base): """Class definition for the chebi_inchi table.""" __tablename__ = "chebi_inchi" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - inchi = Column(Text) + inchi: Mapped[str] = mapped_column(Text) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="inchis") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="inchis") def __str__(self): return self.inchi @@ -130,14 +134,14 @@ class DatabaseAccession(Base): """Class definition for the chebi_database_accession table.""" __tablename__ = "chebi_database_accession" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - accession_number = Column(String(255), nullable=True) - type = Column(Text, nullable=False) - source = Column(Text, nullable=False) + accession_number: Mapped[str] = mapped_column(String(255), nullable=True) + type: Mapped[str] = mapped_column(Text, nullable=False) + source: Mapped[str] = mapped_column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="database_accessions") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="database_accessions") def __str__(self): return self.accession_number @@ -155,16 +159,16 @@ class Name(Base): """Class definition for the chebi_name table.""" 
__tablename__ = "chebi_name" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - name = Column(Text, nullable=True) - type = Column(Text, nullable=False) - source = Column(Text, nullable=False) - adapted = Column(Text, nullable=False) - language = Column(Text, nullable=False) + name: Mapped[str] = mapped_column(Text, nullable=True) + type: Mapped[str] = mapped_column(Text, nullable=False) + source: Mapped[str] = mapped_column(Text, nullable=False) + adapted: Mapped[str] = mapped_column(Text, nullable=False) + language: Mapped[str] = mapped_column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="names") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="names") def __str__(self): return self.name @@ -185,15 +189,15 @@ class Reference(Base): __tablename__ = "chebi_reference" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - reference_id = Column(String(60), nullable=False, index=True) - reference_db_name = Column(String(60), nullable=False, index=True) - location_in_ref = Column(String(90), index=True) - reference_name = Column(String(1024)) + reference_id: Mapped[str] = mapped_column(String(60), nullable=False, index=True) + reference_db_name: Mapped[str] = mapped_column(String(60), nullable=False, index=True) + location_in_ref: Mapped[str] = mapped_column(String(90), nullable=True, index=True) + reference_name: Mapped[str] = mapped_column(String(1024), nullable=True) - compound_id = Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="references") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="references") __table_args__ = 
(Index("ix_chebi_reference__reference_name", reference_name, mysql_length=500),) @@ -224,16 +228,16 @@ class Relation(Base): """Class definition for the chebi_relation table.""" __tablename__ = "chebi_relation" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - type = Column(Text, nullable=False) - status = Column(String(1), nullable=False) + type: Mapped[str] = mapped_column(Text, nullable=False) + status: Mapped[str] = mapped_column(String(1), nullable=False) - final_id = Column(Integer, ForeignKey("chebi_compound.id")) - init_id = Column(Integer, ForeignKey("chebi_compound.id")) + final_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + init_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) - final_id_compounds = relationship("Compound", foreign_keys=[final_id]) - init_id_compounds = relationship("Compound", foreign_keys=[init_id]) + final_id_compounds: Mapped[List["Compound"]] = relationship("Compound", foreign_keys=[final_id]) + init_id_compounds: Mapped[List["Compound"]] = relationship("Compound", foreign_keys=[init_id]) def __str__(self): return f"{self.type} - {self.status}" @@ -252,16 +256,16 @@ class Structure(Base): """Class definition for the chebi_structure table.""" __tablename__ = "chebi_structure" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - structure = Column(Text, nullable=False) - type = Column(Text, nullable=False) - dimension = Column(Text, nullable=False) - default_structure = Column(String(1), nullable=False) - autogen_structure = Column(String(1), nullable=False) + structure: Mapped[str] = mapped_column(Text, nullable=False) + type: Mapped[str] = mapped_column(Text, nullable=False) + dimension: Mapped[str] = mapped_column(Text, nullable=False) + default_structure: Mapped[str] = mapped_column(String(1), nullable=False) + autogen_structure: Mapped[str] = mapped_column(String(1), nullable=False) - compound_id = 
Column(Integer, ForeignKey("chebi_compound.id")) - compounds = relationship("Compound", back_populates="structures") + compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) + compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="structures") def __str__(self): return self.structure diff --git a/ebel/manager/rdbms/models/clinical_trials_gov.py b/ebel/manager/rdbms/models/clinical_trials_gov.py index f2f02ba..a94ff4f 100644 --- a/ebel/manager/rdbms/models/clinical_trials_gov.py +++ b/ebel/manager/rdbms/models/clinical_trials_gov.py @@ -1,9 +1,10 @@ """ClinicalTrials.gov RDBMS model definition.""" import re +from typing import List -from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text +from sqlalchemy import ForeignKey, Integer, String, Table, Text, Column from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -83,49 +84,49 @@ class ClinicalTrialGov(Base): __tablename__ = "clinical_trials_gov" - id = Column(Integer, primary_key=True) - nct_id = Column(String(100), index=True) - org_study_id = Column(Text) - brief_title = Column(Text) - official_title = Column(Text) - is_fda_regulated_drug = Column(Text) - brief_summary = Column(Text) - detailed_description = Column(Text) - overall_status = Column(Text) - start_date = Column(Text) - completion_date = Column(Text) - phase = Column(Text) - study_type = Column(Text) - study_design_intervention_model = Column(Text) - study_design_primary_purpose = Column(Text) - study_design_masking = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + nct_id = mapped_column(String(100), index=True) + org_study_id: Mapped[str] = mapped_column(Text) + brief_title: Mapped[str] = mapped_column(Text) + official_title: Mapped[str] = mapped_column(Text) + is_fda_regulated_drug: Mapped[str] = 
mapped_column(Text) + brief_summary: Mapped[str] = mapped_column(Text) + detailed_description: Mapped[str] = mapped_column(Text) + overall_status: Mapped[str] = mapped_column(Text) + start_date: Mapped[str] = mapped_column(Text) + completion_date: Mapped[str] = mapped_column(Text) + phase: Mapped[str] = mapped_column(Text) + study_type: Mapped[str] = mapped_column(Text) + study_design_intervention_model: Mapped[str] = mapped_column(Text) + study_design_primary_purpose: Mapped[str] = mapped_column(Text) + study_design_masking: Mapped[str] = mapped_column(Text) # primary_outcomes # secondary_outcomes - patient_data_sharing_ipd = Column(Text) - patient_data_ipd_description = Column(Text) + patient_data_sharing_ipd: Mapped[str] = mapped_column(Text) + patient_data_ipd_description: Mapped[str] = mapped_column(Text) - keywords = relationship( + keywords: Mapped[List["Keyword"]] = relationship( "Keyword", secondary=ctg_keyword_n2m, back_populates="trials", cascade="save-update", ) - conditions = relationship( + conditions: Mapped[List["Condition"]] = relationship( "Condition", secondary=ctg_condition_n2m, back_populates="trials", cascade="save-update", ) - mesh_terms = relationship( + mesh_terms: Mapped[List["MeshTerm"]] = relationship( "MeshTerm", secondary=ctg_mesh_term_n2m, back_populates="trials", cascade="save-update", ) - interventions = relationship( + interventions: Mapped[List["Intervention"]] = relationship( "Intervention", secondary=ctg_intervention_n2m, back_populates="trials", @@ -157,9 +158,11 @@ class Keyword(Base): """Class definition for the clinical_trials_gov_keyword table.""" __tablename__ = "clinical_trials_gov_keyword" - id = Column(Integer, primary_key=True) - keyword = Column(String(255), index=True) - trials = relationship("ClinicalTrialGov", secondary=ctg_keyword_n2m, back_populates="keywords") + id: Mapped[int] = mapped_column(primary_key=True) + keyword: Mapped[str] = mapped_column(String(255), index=True) + trials: 
Mapped[List["ClinicalTrialGov"]] = relationship( + "ClinicalTrialGov", secondary=ctg_keyword_n2m, back_populates="keywords" + ) def as_dict(self): """Convert object values to dictionary.""" @@ -170,9 +173,11 @@ class Condition(Base): """Class definition for the clinical_trials_gov_condition table.""" __tablename__ = "clinical_trials_gov_condition" - id = Column(Integer, primary_key=True) - condition = Column(Text) - trials = relationship("ClinicalTrialGov", secondary=ctg_condition_n2m, back_populates="conditions") + id: Mapped[int] = mapped_column(primary_key=True) + condition: Mapped[str] = mapped_column(Text) + trials: Mapped[List["ClinicalTrialGov"]] = relationship( + "ClinicalTrialGov", secondary=ctg_condition_n2m, back_populates="conditions" + ) def as_dict(self): """Convert object values to dictionary.""" @@ -183,9 +188,11 @@ class MeshTerm(Base): """Class definition for the clinical_trials_gov_mesh_term table.""" __tablename__ = "clinical_trials_gov_mesh_term" - id = Column(Integer, primary_key=True) - mesh_term = Column(String(100), unique=True) - trials = relationship("ClinicalTrialGov", secondary=ctg_mesh_term_n2m, back_populates="mesh_terms") + id: Mapped[int] = mapped_column(primary_key=True) + mesh_term: Mapped[str] = mapped_column(String(100), unique=True) + trials: Mapped[List["ClinicalTrialGov"]] = relationship( + "ClinicalTrialGov", secondary=ctg_mesh_term_n2m, back_populates="mesh_terms" + ) def as_dict(self): """Convert object values to dictionary.""" @@ -200,10 +207,10 @@ class Intervention(Base): """Class definition for the clinical_trials_gov_intervention table.""" __tablename__ = "clinical_trials_gov_intervention" - id = Column(Integer, primary_key=True) - intervention_type = Column(String(100), index=True) - intervention_name = Column(String(255), index=True) - trials = relationship( + id: Mapped[int] = mapped_column(primary_key=True) + intervention_type: Mapped[str] = mapped_column(String(100), index=True) + intervention_name: Mapped[str] = 
mapped_column(String(255), index=True) + trials: Mapped[List["ClinicalTrialGov"]] = relationship( "ClinicalTrialGov", secondary=ctg_intervention_n2m, back_populates="interventions", diff --git a/ebel/manager/rdbms/models/clinvar.py b/ebel/manager/rdbms/models/clinvar.py index a7995ab..3d91d16 100644 --- a/ebel/manager/rdbms/models/clinvar.py +++ b/ebel/manager/rdbms/models/clinvar.py @@ -1,7 +1,9 @@ """ClinVar RDBMS model definition.""" -from sqlalchemy import Column, ForeignKey, Index, Integer, String, Table, Text +from typing import List + +from sqlalchemy import ForeignKey, Index, Integer, String, Table, Text, Column from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -19,23 +21,23 @@ class ClinvarPhenotypeMedgen(Base): """Class definition for the clinvar_phenotype_medgen table.""" __tablename__ = "clinvar_phenotype_medgen" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(100), index=True) - clinvar_id = Column(Integer, ForeignKey("clinvar.id")) - clinvar = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) + identifier: Mapped[str] = mapped_column(String(100), index=True) + clinvar_id: Mapped[int] = mapped_column(Integer, ForeignKey("clinvar.id")) + clinvar: Mapped["Clinvar"] = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) class ClinvarOtherIdentifier(Base): """Class definition for the clinvar_other_identifier table.""" __tablename__ = "clinvar_other_identifier" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - db = Column(String(100), index=True) - identifier = Column(String(100), index=True) - clinvar_id = Column(Integer, ForeignKey("clinvar.id")) - clinvar = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) + db: Mapped[str] = 
mapped_column(String(100), index=True) + identifier: Mapped[str] = mapped_column(String(100), index=True) + clinvar_id: Mapped[int] = mapped_column(ForeignKey("clinvar.id")) + clinvar: Mapped["Clinvar"] = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) def as_dict(self): """Convert object values to dictionary.""" @@ -46,44 +48,50 @@ class Clinvar(Base): """Class definition for the clinvar table.""" __tablename__ = "clinvar" - id = Column(Integer, primary_key=True) - - allele_id = Column(Integer) - type = Column(String(100)) - name = Column(String(1000)) - gene_id = Column(Integer, index=True) - gene_symbol = Column(String(1000)) - hgnc_id = Column(String(100)) - clinical_significance = Column(String(100)) - clin_sig_simple = Column(Integer) - last_evaluated = Column(String(100)) - rs_db_snp = Column(Integer, index=True) - nsv_esv_db_var = Column(String(100)) - rcvaccession = Column(String(1000)) - origin = Column(Text) - origin_simple = Column(Text) - assembly = Column(String(100), index=True) - chromosome_accession = Column(Text) - chromosome = Column(Text) - start = Column(Integer) - stop = Column(Integer) - reference_allele = Column(Text) - alternate_allele = Column(Text) - cytogenetic = Column(Text) - review_status = Column(Text) - number_submitters = Column(Integer) - guidelines = Column(Text) - tested_in_gtr = Column(Text) - submitter_categories = Column(Integer) - variation_id = Column(Integer) - position_vcf = Column(Integer) - reference_allele_vcf = Column(Text(100000)) - alternate_allele_vcf = Column(Text(100000)) - - phenotypeMedgens = relationship("ClinvarPhenotypeMedgen", foreign_keys=[ClinvarPhenotypeMedgen.clinvar_id]) - otherIdentifiers = relationship("ClinvarOtherIdentifier", foreign_keys=[ClinvarOtherIdentifier.clinvar_id]) - - phenotypes = relationship("ClinvarPhenotype", secondary=clinvar__clinvar_phenotype) + id: Mapped[int] = mapped_column(primary_key=True) + + allele_id: Mapped[int] = mapped_column() + type: Mapped[str] = 
mapped_column(String(100)) + name: Mapped[str] = mapped_column(String(1000)) + gene_id: Mapped[int] = mapped_column(index=True) + gene_symbol: Mapped[str] = mapped_column(String(1000)) + hgnc_id: Mapped[str] = mapped_column(String(100)) + clinical_significance: Mapped[str] = mapped_column(String(100)) + clin_sig_simple: Mapped[int] = mapped_column() + last_evaluated: Mapped[str] = mapped_column(String(100)) + rs_db_snp: Mapped[int] = mapped_column(index=True) + nsv_esv_db_var: Mapped[str] = mapped_column(String(100)) + rcvaccession: Mapped[str] = mapped_column(String(1000)) + origin: Mapped[str] = mapped_column(Text) + origin_simple: Mapped[str] = mapped_column(Text) + assembly: Mapped[str] = mapped_column(String(100), index=True) + chromosome_accession: Mapped[str] = mapped_column(Text) + chromosome: Mapped[str] = mapped_column(Text) + start: Mapped[int] = mapped_column() + stop: Mapped[int] = mapped_column() + reference_allele: Mapped[str] = mapped_column(Text) + alternate_allele: Mapped[str] = mapped_column(Text) + cytogenetic: Mapped[str] = mapped_column(Text) + review_status: Mapped[str] = mapped_column(Text) + number_submitters: Mapped[int] = mapped_column() + guidelines: Mapped[str] = mapped_column(Text) + tested_in_gtr: Mapped[str] = mapped_column(Text) + submitter_categories: Mapped[int] = mapped_column() + variation_id: Mapped[int] = mapped_column() + position_vcf: Mapped[int] = mapped_column() + reference_allele_vcf: Mapped[str] = mapped_column(Text(100000)) + alternate_allele_vcf: Mapped[str] = mapped_column(Text(100000)) + + phenotypeMedgens: Mapped[List["ClinvarPhenotypeMedgen"]] = relationship( + "ClinvarPhenotypeMedgen", foreign_keys=[ClinvarPhenotypeMedgen.clinvar_id] + ) + otherIdentifiers: Mapped[List["ClinvarOtherIdentifier"]] = relationship( + "ClinvarOtherIdentifier", foreign_keys=[ClinvarOtherIdentifier.clinvar_id] + ) + + phenotypes: Mapped[List["ClinvarPhenotype"]] = relationship( + "ClinvarPhenotype", secondary=clinvar__clinvar_phenotype + 
) __table_args__ = (Index("ix_clinvar__gene_symbol", gene_symbol, mysql_length=500),) @@ -99,8 +107,8 @@ class ClinvarPhenotype(Base): """Class definition for the clinvar_phenotype table.""" __tablename__ = "clinvar_phenotype" - id = Column(Integer, primary_key=True) - phenotype = Column(Text) + id = mapped_column(Integer, primary_key=True) + phenotype = mapped_column(Text) clinvars = relationship("Clinvar", secondary=clinvar__clinvar_phenotype, back_populates="phenotypes") diff --git a/ebel/manager/rdbms/models/disgenet.py b/ebel/manager/rdbms/models/disgenet.py index cd32bcf..3127fd4 100644 --- a/ebel/manager/rdbms/models/disgenet.py +++ b/ebel/manager/rdbms/models/disgenet.py @@ -1,7 +1,9 @@ """DisGeNet RDBMS model definition.""" -from sqlalchemy import BigInteger, Column, Float, ForeignKey, Integer, String +from typing import List + +from sqlalchemy import BigInteger, Float, ForeignKey, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,16 +14,18 @@ class DisgenetGene(Base): """Class definition for the disgenet_gene table.""" __tablename__ = "disgenet_gene" - id = Column(Integer, primary_key=True) - - gene_id = Column(Integer, ForeignKey("disgenet_gene_symbol.gene_id")) - gene_symbol = relationship("DisgenetGeneSymbol", back_populates="gene_disease_pmid_associations") - disease_id = Column(String(100), ForeignKey("disgenet_disease.disease_id")) - disease = relationship("DisgenetDisease", foreign_keys=[disease_id]) - score = Column(Float) - pmid = Column(BigInteger) - source_id = Column(Integer, ForeignKey("disgenet_source.id")) - source = relationship("DisgenetSource", foreign_keys=[source_id]) + id: Mapped[int] = mapped_column(primary_key=True) + + gene_id: Mapped[int] = mapped_column(ForeignKey("disgenet_gene_symbol.gene_id")) + gene_symbol: Mapped["DisgenetGeneSymbol"] = 
relationship( + "DisgenetGeneSymbol", back_populates="gene_disease_pmid_associations" + ) + disease_id: Mapped[str] = mapped_column(String(100), ForeignKey("disgenet_disease.disease_id")) + disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) + score: Mapped[float] = mapped_column() + pmid: Mapped[int] = mapped_column() + source_id: Mapped[int] = mapped_column(ForeignKey("disgenet_source.id")) + source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): """Convert object values to dictionary.""" @@ -40,10 +44,12 @@ class DisgenetGeneSymbol(Base): """Class definition for the disgenet_gene_symbol table.""" __tablename__ = "disgenet_gene_symbol" - gene_id = Column(Integer, primary_key=True) - gene_symbol = Column(String(50), index=True) + gene_id: Mapped[int] = mapped_column(primary_key=True) + gene_symbol: Mapped[str] = mapped_column(String(50), index=True) - gene_disease_pmid_associations = relationship("DisgenetGene", back_populates="gene_symbol") + gene_disease_pmid_associations: Mapped[List["DisgenetGene"]] = relationship( + "DisgenetGene", back_populates="gene_symbol" + ) def as_dict(self): """Convert object values to dictionary.""" @@ -54,17 +60,17 @@ class DisgenetVariant(Base): """Class definition for the disgenet_variant table.""" __tablename__ = "disgenet_variant" - id = Column(Integer, primary_key=True) - - snp_id = Column(String(20), index=True) - chromosome = Column(String(2)) - position = Column(BigInteger) - disease_id = Column(String(100), ForeignKey("disgenet_disease.disease_id")) - disease = relationship("DisgenetDisease", foreign_keys=[disease_id]) - score = Column(Float) - pmid = Column(BigInteger, index=True) - source_id = Column(Integer, ForeignKey("disgenet_source.id")) - source = relationship("DisgenetSource", foreign_keys=[source_id]) + id: Mapped[int] = mapped_column(primary_key=True) + + snp_id: Mapped[str] = mapped_column(String(20), index=True) 
+ chromosome: Mapped[str] = mapped_column(String(2)) + position: Mapped[int] = mapped_column() + disease_id: Mapped[str] = mapped_column(String(100), ForeignKey("disgenet_disease.disease_id")) + disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) + score: Mapped[float] = mapped_column() + pmid: Mapped[int] = mapped_column(index=True) + source_id: Mapped[int] = mapped_column(ForeignKey("disgenet_source.id")) + source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): """Convert object values to dictionary.""" @@ -77,8 +83,8 @@ class DisgenetDisease(Base): """Class definition for the disgenet_disease table.""" __tablename__ = "disgenet_disease" - disease_id = Column(String(100), primary_key=True) - disease_name = Column(String(255), index=True) + disease_id: Mapped[str] = mapped_column(String(100), primary_key=True) + disease_name: Mapped[str] = mapped_column(String(255), index=True) def as_dict(self): """Convert object values to dictionary.""" @@ -89,8 +95,8 @@ class DisgenetSource(Base): """Class definition for the disgenet_source table.""" __tablename__ = "disgenet_source" - id = Column(Integer, primary_key=True) - source = Column(String(100), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + source: Mapped[str] = mapped_column(String(100), index=True) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/drugbank.py b/ebel/manager/rdbms/models/drugbank.py index e045bba..c0f1ba6 100644 --- a/ebel/manager/rdbms/models/drugbank.py +++ b/ebel/manager/rdbms/models/drugbank.py @@ -1,8 +1,10 @@ """DrugBank RDBMS model definition.""" +import datetime +from typing import List from sqlalchemy import Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped Base = 
declarative_base() @@ -11,34 +13,34 @@ class Drugbank(Base): """Class definition for the drugbank table.""" __tablename__ = "drugbank" - id = Column(Integer, primary_key=True) - drugbank_id = Column(String(10), index=True) - name = Column(String(255)) - description = Column(Text) - cas_number = Column(String(20)) - unii = Column(String(20)) - state = Column(String(20)) - indication = Column(Text) - pharmacodynamics = Column(Text) - toxicity = Column(Text) - metabolism = Column(Text) - absorption = Column(Text) - half_life = Column(Text) - route_of_elimination = Column(Text) - volume_of_distribution = Column(Text) - clearance = Column(Text) - mechanism_of_action = Column(Text) - fda_label = Column(Text) - - references = relationship("Reference", back_populates="drugbank", cascade="save-update") - synonyms = relationship("Synonym", back_populates="drugbank", cascade="save-update") - targets = relationship("Target", back_populates="drugbank", cascade="save-update") - external_identifiers = relationship("ExternalIdentifier", back_populates="drugbank", cascade="save-update") - product_names = relationship("ProductName", back_populates="drugbank", cascade="save-update") - drug_interactions = relationship("DrugInteraction", back_populates="drugbank", cascade="save-update") - statuses = relationship("Status", back_populates="drugbank", cascade="save-update") - patents = relationship("Patent", back_populates="drugbank", cascade="save-update") - pathways = relationship("Pathway", back_populates="drugbank", cascade="save-update") + id: Mapped[int] = mapped_column(primary_key=True) + drugbank_id: Mapped[str] = mapped_column(String(10), index=True) + name: Mapped[str] = mapped_column(String(255)) + description: Mapped[str] = mapped_column(Text) + cas_number: Mapped[str] = mapped_column(String(20)) + unii: Mapped[str] = mapped_column(String(20)) + state: Mapped[str] = mapped_column(String(20)) + indication: Mapped[str] = mapped_column(Text) + pharmacodynamics: Mapped[str] = 
mapped_column(Text) + toxicity: Mapped[str] = mapped_column(Text) + metabolism: Mapped[str] = mapped_column(Text) + absorption: Mapped[str] = mapped_column(Text) + half_life: Mapped[str] = mapped_column(Text) + route_of_elimination: Mapped[str] = mapped_column(Text) + volume_of_distribution: Mapped[str] = mapped_column(Text) + clearance: Mapped[str] = mapped_column(Text) + mechanism_of_action: Mapped[str] = mapped_column(Text) + fda_label: Mapped[str] = mapped_column(Text) + + references: Mapped[List["Reference"]] = relationship("Reference", back_populates="drugbank", cascade="save-update") + synonyms: Mapped[List["Synonym"]] = relationship("Synonym", back_populates="drugbank", cascade="save-update") + targets: Mapped[List["Target"]] = relationship("Target", back_populates="drugbank", cascade="save-update") + external_identifiers: Mapped[List["ExternalIdentifier"]] = relationship("ExternalIdentifier", back_populates="drugbank", cascade="save-update") + product_names: Mapped[List["ProductName"]] = relationship("ProductName", back_populates="drugbank", cascade="save-update") + drug_interactions: Mapped[List["DrugInteraction"]] = relationship("DrugInteraction", back_populates="drugbank", cascade="save-update") + statuses: Mapped[List["Status"]] = relationship("Status", back_populates="drugbank", cascade="save-update") + patents: Mapped[List["Patent"]] = relationship("Patent", back_populates="drugbank", cascade="save-update") + pathways: Mapped[List["Pathway"]] = relationship("Pathway", back_populates="drugbank", cascade="save-update") def __str__(self): """Class string definition.""" @@ -77,11 +79,11 @@ class Pathway(Base): """Class definition for the drugbank_pathway table.""" __tablename__ = "drugbank_pathway" - id = Column(Integer, primary_key=True) - smpdb_id = Column(String(255)) + id: Mapped[int] = mapped_column(primary_key=True) + smpdb_id: Mapped[str] = mapped_column(String(255)) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = 
relationship("Drugbank", back_populates="pathways") + drugbank_id: Mapped[str] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped["Drugbank"] = relationship("Drugbank", back_populates="pathways") def __str__(self): return self.smpdb_id @@ -95,15 +97,15 @@ class Patent(Base): """Class definition for the drugbank_patent table.""" __tablename__ = "drugbank_patent" - id = Column(Integer, primary_key=True) - number = Column(String(255)) - country = Column(String(255)) - approved = Column(Date) - expires = Column(Date) - pediatric_extension = Column(String(255)) + id: Mapped[int] = mapped_column(primary_key=True) + number: Mapped[str] = mapped_column(String(255)) + country: Mapped[str] = mapped_column(String(255)) + approved: Mapped[datetime.date] = mapped_column(Date) + expires: Mapped[datetime.date] = mapped_column(Date) + pediatric_extension: Mapped[str] = mapped_column(String(255)) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="patents") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="patents") def __str__(self): return self.number @@ -124,11 +126,11 @@ class Status(Base): """Class definition for the drugbank_status table.""" __tablename__ = "drugbank_status" - id = Column(Integer, primary_key=True) - status = Column(String(20), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + status: Mapped[str] = mapped_column(String(20), index=True) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="statuses") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="statuses") def __str__(self): return self.status @@ -142,12 +144,12 @@ class ExternalIdentifier(Base): """Class definition for the drugbank_external_identifier table.""" __tablename__ = 
"drugbank_external_identifier" - id = Column(Integer, primary_key=True) - resource = Column(String(255), index=True) - identifier = Column(String(255), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + resource: Mapped[str] = mapped_column(String(255), index=True) + identifier: Mapped[str] = mapped_column(String(255), index=True) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="external_identifiers") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="external_identifiers") def __str__(self): return self.identifier @@ -165,11 +167,11 @@ class Reference(Base): """Class definition for the drugbank_reference table.""" __tablename__ = "drugbank_reference" - id = Column(Integer, primary_key=True) - pmid = Column(Integer) + id: Mapped[int] = mapped_column(primary_key=True) + pmid: Mapped[int] = mapped_column() - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="references") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="references") def __str__(self): return self.pmid @@ -183,13 +185,13 @@ class Target(Base): """Class definition for the drugbank_target table.""" __tablename__ = "drugbank_target" - id = Column(Integer, primary_key=True) - uniprot = Column(String(20), index=True) - action = Column(String(50), index=True) - known_action = Column(String(20), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + uniprot: Mapped[str] = mapped_column(String(20), index=True) + action: Mapped[str] = mapped_column(String(50), index=True) + known_action: Mapped[str] = mapped_column(String(20), index=True) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="targets") + drugbank_id: 
Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="targets") def __str__(self): return self.uniprot @@ -208,13 +210,13 @@ class DrugInteraction(Base): """Class definition for the drugbank_drug_interaction table.""" __tablename__ = "drugbank_drug_interaction" - id = Column(Integer, primary_key=True) - drugbank_id = Column(String(10), index=True) - name = Column(Text) - description = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + drugbank_id: Mapped[str] = mapped_column(String(10), index=True) + name: Mapped[str] = mapped_column(Text) + description: Mapped[str] = mapped_column(Text) - db_id = Column(Integer, ForeignKey("drugbank.id")) # exception because drugbank_id is already a field - drugbank = relationship("Drugbank", back_populates="drug_interactions") + db_id: Mapped[str] = mapped_column(ForeignKey("drugbank.id")) # exception because drugbank_id is already a field + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="drug_interactions") def __str__(self): return self.drugbank_id @@ -233,11 +235,11 @@ class ProductName(Base): """Class definition for the drugbank_product_name table.""" __tablename__ = "drugbank_product_name" - id = Column(Integer, primary_key=True) - name = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(Text) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="product_names") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="product_names") def __str__(self): return self.name @@ -251,11 +253,11 @@ class Synonym(Base): """Class definition for the drugbank_synonym table.""" __tablename__ = "drugbank_synonym" - id = Column(Integer, primary_key=True) - synonym = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + 
synonym: Mapped[str] = mapped_column(Text) - drugbank_id = Column(Integer, ForeignKey("drugbank.id")) - drugbank = relationship("Drugbank", back_populates="synonyms") + drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) + drugbank: Mapped[Drugbank] = relationship("Drugbank", back_populates="synonyms") def __str__(self): return self.synonym diff --git a/ebel/manager/rdbms/models/ensembl.py b/ebel/manager/rdbms/models/ensembl.py index 6c7115e..6d88a66 100644 --- a/ebel/manager/rdbms/models/ensembl.py +++ b/ebel/manager/rdbms/models/ensembl.py @@ -2,6 +2,7 @@ from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,17 +13,17 @@ class Ensembl(Base): """Class definition for the ensembl table.""" __tablename__ = "ensembl" - id = Column(Integer, primary_key=True) - enst = Column(String(20), index=True) - version = Column(Integer) - chromosome = Column(String(10), index=True) - start = Column(Integer, index=True) - stop = Column(Integer, index=True) - orientation = Column(Integer) - gene_id = Column(String(255)) - gene_id_short = Column(String(255)) - hgnc_id = Column(String(255), index=True) - symbol = Column(String(50), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + enst: Mapped[str] = mapped_column(String(20), index=True) + version: Mapped[int] = mapped_column() + chromosome: Mapped[str] = mapped_column(String(10), index=True) + start: Mapped[int] = mapped_column(index=True) + stop: Mapped[int] = mapped_column(index=True) + orientation: Mapped[int] = mapped_column() + gene_id: Mapped[str] = mapped_column(String(255)) + gene_id_short: Mapped[str] = mapped_column(String(255)) + hgnc_id: Mapped[str] = mapped_column(String(255), index=True) + symbol: Mapped[str] = mapped_column(String(50), index=True) def as_dict(self): """Convert object values to dictionary.""" diff --git 
a/ebel/manager/rdbms/models/expression_atlas.py b/ebel/manager/rdbms/models/expression_atlas.py index ce70217..cf7afaf 100644 --- a/ebel/manager/rdbms/models/expression_atlas.py +++ b/ebel/manager/rdbms/models/expression_atlas.py @@ -1,7 +1,9 @@ """Expression Atlas RDBMS model definition.""" +from typing import List + from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -13,14 +15,14 @@ class Experiment(Base): __tablename__ = "expression_atlas_experiment" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - name = Column(String(100), index=True) - title = Column(Text) + name: Mapped[str] = mapped_column(String(100), index=True) + title: Mapped[str] = mapped_column(Text) - idfs = relationship("Idf", back_populates="experiment") - group_comparisons = relationship("GroupComparison", back_populates="experiment") - sdrf_condenseds = relationship("SdrfCondensed", back_populates="experiment") + idfs: Mapped[List["Idf"]] = relationship("Idf", back_populates="experiment") + group_comparisons: Mapped[List["GroupComparison"]] = relationship("GroupComparison", back_populates="experiment") + sdrf_condenseds: Mapped[List["SdrfCondensed"]] = relationship("SdrfCondensed", back_populates="experiment") def as_dict(self): """Convert object values to dictionary.""" @@ -36,13 +38,13 @@ class Idf(Base): __tablename__ = "expression_atlas_idf" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - key_name = Column(Text, nullable=False) - value = Column(Text, nullable=False) + key_name: Mapped[str] = mapped_column(Text, nullable=False) + value: Mapped[str] = mapped_column(Text, nullable=False) - experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) - 
experiment = relationship("Experiment", back_populates="idfs") + experiment_id: Mapped[int] = mapped_column(ForeignKey("expression_atlas_experiment.id")) + experiment: Mapped[Experiment] = relationship("Experiment", back_populates="idfs") def as_dict(self): """Convert object values to dictionary.""" @@ -54,16 +56,16 @@ class GroupComparison(Base): __tablename__ = "expression_atlas_group_comparison" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) - experiment = relationship("Experiment", back_populates="group_comparisons") + experiment_id: Mapped[int] = mapped_column(ForeignKey("expression_atlas_experiment.id")) + experiment: Mapped[Experiment] = relationship("Experiment", back_populates="group_comparisons") - group_comparison = Column(String(100)) - name = Column(Text) + group_comparison: Mapped[str] = mapped_column(String(100)) + name: Mapped[str] = mapped_column(Text) - fold_changes = relationship("FoldChange", back_populates="group_comparison") - gseas = relationship("Gsea", back_populates="group_comparison") + fold_changes: Mapped[List["FoldChange"]] = relationship("FoldChange", back_populates="group_comparison") + gseas: Mapped[List["Gsea"]] = relationship("Gsea", back_populates="group_comparison") def as_dict(self): """Convert object values to dictionary.""" @@ -75,16 +77,16 @@ class FoldChange(Base): __tablename__ = "expression_atlas_foldchange" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene_id = Column(String(255)) - gene_name = Column(String(100), index=True) - log2foldchange = Column(Float, index=True) - p_value = Column(Float, index=True) - t_statistic = Column(Float) + gene_id: Mapped[str] = mapped_column(String(255)) + gene_name: Mapped[str] = mapped_column(String(100), index=True) + log2foldchange: Mapped[float] = mapped_column(index=True) + p_value: Mapped[float] = 
mapped_column(index=True) + t_statistic: Mapped[float] = mapped_column() - group_comparison_id = Column(Integer, ForeignKey("expression_atlas_group_comparison.id")) - group_comparison = relationship("GroupComparison", back_populates="fold_changes") + group_comparison_id: Mapped[int] = mapped_column(ForeignKey("expression_atlas_group_comparison.id")) + group_comparison: Mapped[GroupComparison] = relationship("GroupComparison", back_populates="fold_changes") def as_dict(self): """Convert object values to dictionary.""" @@ -96,17 +98,17 @@ class SdrfCondensed(Base): __tablename__ = "expression_atlas_sdrf_condensed" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) - experiment = relationship("Experiment", back_populates="sdrf_condenseds") + experiment_id: Mapped[int] = mapped_column(ForeignKey("expression_atlas_experiment.id")) + experiment: Mapped[Experiment] = relationship("Experiment", back_populates="sdrf_condenseds") - method = Column(String(255)) - sample = Column(String(255)) - parameter_type = Column(String(255)) - parameter = Column(String(255)) - value = Column(String(255)) - url = Column(String(255)) + method: Mapped[str] = mapped_column(String(255)) + sample: Mapped[str] = mapped_column(String(255)) + parameter_type: Mapped[str] = mapped_column(String(255)) + parameter: Mapped[str] = mapped_column(String(255)) + value: Mapped[str] = mapped_column(String(255)) + url: Mapped[str] = mapped_column(String(255)) def as_dict(self): """Convert object values to dictionary.""" @@ -118,22 +120,22 @@ class Gsea(Base): __tablename__ = "expression_atlas_gsea" - id = Column(Integer, primary_key=True) - - group_comparison_id = Column(Integer, ForeignKey("expression_atlas_group_comparison.id")) - group_comparison = relationship("GroupComparison", back_populates="gseas") - - term = Column(String(255), index=True) - accession = Column(String(255)) - 
genes_tot = Column(Integer) - stat_non_dir_p = Column(Float) - p_adj_non_dir = Column(Float, index=True) - significant_in_gene_set = Column(Integer) - non_significant_in_gene_set = Column(Integer) - significant_not_in_gene_set = Column(Integer) - non_significant_not_in_gene_set = Column(Integer) - effect_size = Column(Float) - gsea_type = Column(String(100)) + id: Mapped[int] = mapped_column(primary_key=True) + + group_comparison_id: Mapped[int] = mapped_column(ForeignKey("expression_atlas_group_comparison.id")) + group_comparison: Mapped[GroupComparison] = relationship("GroupComparison", back_populates="gseas") + + term: Mapped[str] = mapped_column(String(255), index=True) + accession: Mapped[str] = mapped_column(String(255)) + genes_tot: Mapped[int] = mapped_column() + stat_non_dir_p: Mapped[float] = mapped_column() + p_adj_non_dir: Mapped[float] = mapped_column(index=True) + significant_in_gene_set: Mapped[int] = mapped_column() + non_significant_in_gene_set: Mapped[int] = mapped_column() + significant_not_in_gene_set: Mapped[int] = mapped_column() + non_significant_not_in_gene_set: Mapped[int] = mapped_column() + effect_size: Mapped[float] = mapped_column() + gsea_type: Mapped[str] = mapped_column(String(100)) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/gwas_catalog.py b/ebel/manager/rdbms/models/gwas_catalog.py index 97a6f8c..6c2c9a9 100644 --- a/ebel/manager/rdbms/models/gwas_catalog.py +++ b/ebel/manager/rdbms/models/gwas_catalog.py @@ -1,7 +1,9 @@ """GWAS Catalog RDBMS model definition.""" +from typing import List + from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,42 +14,42 @@ class GwasCatalog(Base): """Class definition for the gwascatalog table.""" 
__tablename__ = "gwascatalog" - id = Column(Integer, primary_key=True) - date_added_to_catalog = Column(String(255)) - pubmedid = Column(Integer) - first_author = Column(String(255)) - date = Column(String(255)) - journal = Column(String(255)) - link = Column(String(255)) - study = Column(Text) - disease_trait = Column(String(255)) - initial_sample_size = Column(Text) - replication_sample_size = Column(Text) - region = Column(String(50)) - chr_id = Column(Text) - chr_pos = Column(Text) - reported_gene_s = Column(Text) - mapped_gene = Column(Text) - upstream_gene_id = Column(String(50)) - downstream_gene_id = Column(String(50)) - upstream_gene_distance = Column(Integer) - downstream_gene_distance = Column(Integer) - strongest_snp_risk_allele = Column(Text) - snp = Column(Text) - merged = Column(Integer) - snp_id_current = Column(Text) - context = Column(Text) - intergenic = Column(Integer) - risk_allele_frequency = Column(Text) - p_value = Column(Float) - pvalue_mlog = Column(Float) - p_value_text = Column(Text) - or_or_beta = Column(Float) - _95_ci_text = Column(Text) - platform_snps_passing_qc = Column(Text) - cnv = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + date_added_to_catalog: Mapped[str] = mapped_column(String(255)) + pubmedid: Mapped[int] = mapped_column() + first_author: Mapped[str] = mapped_column(String(255)) + date: Mapped[str] = mapped_column(String(255)) + journal: Mapped[str] = mapped_column(String(255)) + link: Mapped[str] = mapped_column(String(255)) + study: Mapped[str] = mapped_column(Text) + disease_trait: Mapped[str] = mapped_column(String(255)) + initial_sample_size: Mapped[str] = mapped_column(Text) + replication_sample_size: Mapped[str] = mapped_column(Text) + region: Mapped[str] = mapped_column(String(50)) + chr_id: Mapped[str] = mapped_column(Text) + chr_pos: Mapped[str] = mapped_column(Text) + reported_gene_s: Mapped[str] = mapped_column(Text) + mapped_gene: Mapped[str] = mapped_column(Text) + upstream_gene_id: 
Mapped[str] = mapped_column(String(50)) + downstream_gene_id: Mapped[str] = mapped_column(String(50)) + upstream_gene_distance: Mapped[int] = mapped_column() + downstream_gene_distance: Mapped[int] = mapped_column() + strongest_snp_risk_allele: Mapped[str] = mapped_column(Text) + snp: Mapped[str] = mapped_column(Text) + merged: Mapped[int] = mapped_column() + snp_id_current: Mapped[str] = mapped_column(Text) + context: Mapped[str] = mapped_column(Text) + intergenic: Mapped[int] = mapped_column() + risk_allele_frequency: Mapped[str] = mapped_column(Text) + p_value: Mapped[float] = mapped_column() + pvalue_mlog: Mapped[float] = mapped_column() + p_value_text: Mapped[str] = mapped_column(Text) + or_or_beta: Mapped[float] = mapped_column() + _95_ci_text: Mapped[str] = mapped_column(Text) + platform_snps_passing_qc: Mapped[str] = mapped_column(Text) + cnv: Mapped[str] = mapped_column(Text) - snp_genes = relationship("SnpGene", back_populates="gwascatalog") + snp_genes: Mapped[List["SnpGene"]] = relationship("SnpGene", back_populates="gwascatalog") def as_dict(self): """Convert object values to dictionary.""" @@ -60,7 +62,7 @@ class SnpGene(Base): """Class definition for the gwascatalog_snpgene table.""" __tablename__ = "gwascatalog_snpgene" - id = Column(Integer, primary_key=True) - ensembl_identifier = Column(String(100), nullable=False, index=True) - gwascatalog_id = Column(Integer, ForeignKey("gwascatalog.id")) - gwascatalog = relationship("GwasCatalog", back_populates="snp_genes") + id: Mapped[int] = mapped_column(primary_key=True) + ensembl_identifier: Mapped[str] = mapped_column(String(100), nullable=False, index=True) + gwascatalog_id: Mapped[int] = mapped_column(ForeignKey("gwascatalog.id")) + gwascatalog: Mapped[GwasCatalog] = relationship("GwasCatalog", back_populates="snp_genes") diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index da21ff4..26c5a20 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ 
b/ebel/manager/rdbms/models/hgnc.py @@ -1,8 +1,11 @@ """HGNC RDBMS model definition.""" +import datetime +from typing import List + from sqlalchemy import (BigInteger, Column, Date, ForeignKey, Integer, String, Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -13,59 +16,59 @@ class Hgnc(Base): """Class definition for the hgnc table.""" __tablename__ = "hgnc" - id = Column(Integer, primary_key=True) - hgnc_id = Column(String(20)) - version = Column(BigInteger) - bioparadigms_slc = Column(String(20)) - cd = Column(String(20)) - cosmic = Column(String(50)) - date_approved_reserved = Column(Date) - date_modified = Column(Date) - date_name_changed = Column(Date) - date_symbol_changed = Column(Date) - ensembl_gene_id = Column(String(20)) - entrez_id = Column(Integer) - homeodb = Column(Integer) - horde_id = Column(String(50)) - imgt = Column(String(50)) - iuphar = Column(String(50)) - kznf_gene_catalog = Column(Integer) - lncipedia = Column(String(50)) - lncrnadb = Column(String(50)) - location = Column(String(100)) - location_sortable = Column(String(100)) - locus_group = Column(String(50)) - locus_type = Column(String(50)) - merops = Column(String(20)) - mirbase = Column(String(20)) - name = Column(String(255)) - orphanet = Column(Integer) - snornabase = Column(String(20)) - status = Column(String(50)) - symbol = Column(String(100), index=True) - ucsc_id = Column(String(50)) - uuid = Column(String(50)) - vega_id = Column(String(50)) - agr = Column(String(50)) - kznf_gene_catalog = Column(Text) - - pre_symbols = relationship("PrevSymbol", back_populates="hgnc") - alias_names = relationship("AliasName", back_populates="hgnc") - alias_symbols = relationship("AliasSymbol", back_populates="hgnc") - ccdss = relationship("Ccds", back_populates="hgnc") - enas = relationship("Ena", back_populates="hgnc") - 
enzymes = relationship("Enzyme", back_populates="hgnc") - gene_group_names = relationship("GeneGroupName", back_populates="hgnc") - gene_group_ids = relationship("GeneGroupId", back_populates="hgnc") - uniprots = relationship("UniProt", back_populates="hgnc") - rna_centrals = relationship("RnaCentral", back_populates="hgnc") - rgds = relationship("Rgd", back_populates="hgnc") - refseqs = relationship("RefSeq", back_populates="hgnc") - pubmeds = relationship("PubMed", back_populates="hgnc") - prev_names = relationship("PrevName", back_populates="hgnc") - omims = relationship("Omim", back_populates="hgnc") - mgds = relationship("Mgd", back_populates="hgnc") - lsdbs = relationship("Lsdb", back_populates="hgnc") + id: Mapped[int] = mapped_column(primary_key=True) + hgnc_id: Mapped[str] = mapped_column(String(20)) + version: Mapped[int] = mapped_column() + bioparadigms_slc: Mapped[str] = mapped_column(String(20)) + cd: Mapped[str] = mapped_column(String(20)) + cosmic: Mapped[str] = mapped_column(String(50)) + date_approved_reserved: Mapped[datetime.date] = mapped_column(Date) + date_modified: Mapped[datetime.date] = mapped_column(Date) + date_name_changed: Mapped[datetime.date] = mapped_column(Date) + date_symbol_changed: Mapped[datetime.date] = mapped_column(Date) + ensembl_gene_id: Mapped[str] = mapped_column(String(20)) + entrez_id: Mapped[int] = mapped_column() + homeodb: Mapped[int] = mapped_column() + horde_id: Mapped[str] = mapped_column(String(50)) + imgt: Mapped[str] = mapped_column(String(50)) + iuphar: Mapped[str] = mapped_column(String(50)) + kznf_gene_catalog: Mapped[int] = mapped_column() + lncipedia: Mapped[str] = mapped_column(String(50)) + lncrnadb: Mapped[str] = mapped_column(String(50)) + location: Mapped[str] = mapped_column(String(100)) + location_sortable: Mapped[str] = mapped_column(String(100)) + locus_group: Mapped[str] = mapped_column(String(50)) + locus_type: Mapped[str] = mapped_column(String(50)) + merops: Mapped[str] = 
mapped_column(String(20)) + mirbase: Mapped[str] = mapped_column(String(20)) + name: Mapped[str] = mapped_column(String(255)) + orphanet: Mapped[int] = mapped_column() + snornabase: Mapped[str] = mapped_column(String(20)) + status: Mapped[str] = mapped_column(String(50)) + symbol: Mapped[str] = mapped_column(String(100), index=True) + ucsc_id: Mapped[str] = mapped_column(String(50)) + uuid: Mapped[str] = mapped_column(String(50)) + vega_id: Mapped[str] = mapped_column(String(50)) + agr: Mapped[str] = mapped_column(String(50)) + kznf_gene_catalog: Mapped[str] = mapped_column(Text) + + pre_symbols: Mapped[List["PrevSymbol"]] = relationship("PrevSymbol", back_populates="hgnc") + alias_names: Mapped[List["AliasName"]] = relationship("AliasName", back_populates="hgnc") + alias_symbols: Mapped[List["AliasSymbol"]] = relationship("AliasSymbol", back_populates="hgnc") + ccdss: Mapped[List["Ccds"]] = relationship("Ccds", back_populates="hgnc") + enas: Mapped[List["Ena"]] = relationship("Ena", back_populates="hgnc") + enzymes: Mapped[List["Enzyme"]] = relationship("Enzyme", back_populates="hgnc") + gene_group_names: Mapped[List["GeneGroupName"]] = relationship("GeneGroupName", back_populates="hgnc") + gene_group_ids: Mapped[List["GeneGroupId"]] = relationship("GeneGroupId", back_populates="hgnc") + uniprots: Mapped[List["UniProt"]] = relationship("UniProt", back_populates="hgnc") + rna_centrals: Mapped[List["RnaCentral"]] = relationship("RnaCentral", back_populates="hgnc") + rgds: Mapped[List["Rgd"]] = relationship("Rgd", back_populates="hgnc") + refseqs: Mapped[List["RefSeq"]] = relationship("RefSeq", back_populates="hgnc") + pubmeds: Mapped[List["PubMed"]] = relationship("PubMed", back_populates="hgnc") + prev_names: Mapped[List["PrevName"]] = relationship("PrevName", back_populates="hgnc") + omims: Mapped[List["Omim"]] = relationship("Omim", back_populates="hgnc") + mgds: Mapped[List["Mgd"]] = relationship("Mgd", back_populates="hgnc") + lsdbs: Mapped[List["Lsdb"]] = 
relationship("Lsdb", back_populates="hgnc") def as_dict(self): """Convert object values to dictionary.""" @@ -127,12 +130,12 @@ class PrevSymbol(Base): """Class definition for the hgnc_prev_symbol table.""" __tablename__ = "hgnc_prev_symbol" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - prev_symbol = Column(String(50), index=True) + prev_symbol: Mapped[str] = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) - hgnc = relationship("Hgnc", back_populates="pre_symbols") + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) + hgnc: Mapped[Hgnc] = relationship("Hgnc", back_populates="pre_symbols") def __str__(self): return self.prev_symbol @@ -142,12 +145,12 @@ class AliasName(Base): """Class definition for the hgnc_alias_name table.""" __tablename__ = "hgnc_alias_name" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - alias_name = Column(String(255)) + alias_name: Mapped[str] = mapped_column(String(255)) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) - hgnc = relationship("Hgnc", back_populates="alias_names") + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) + hgnc: Mapped[Hgnc] = relationship("Hgnc", back_populates="alias_names") def __str__(self): return self.alias_name @@ -157,12 +160,12 @@ class AliasSymbol(Base): """Class definition for the hgnc_alias_symbol table.""" __tablename__ = "hgnc_alias_symbol" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - alias_symbol = Column(String(50), index=True) + alias_symbol: Mapped[str] = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) - hgnc = relationship("Hgnc", back_populates="alias_symbols") + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) + hgnc: Mapped[Hgnc] = relationship("Hgnc", back_populates="alias_symbols") def __str__(self): return self.alias_symbol @@ -172,12 
+175,12 @@ class Ccds(Base): """Class definition for the hgnc_ccds table.""" __tablename__ = "hgnc_ccds" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(50), index=True) + identifier: Mapped[str] = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) - hgnc = relationship("Hgnc", back_populates="ccdss") + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) + hgnc: Mapped[Hgnc] = relationship("Hgnc", back_populates="ccdss") def __str__(self): return self.identifier @@ -187,12 +190,12 @@ class Ena(Base): """Class definition for the hgnc_ena table.""" __tablename__ = "hgnc_ena" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(50), index=True) + identifier: Mapped[str] = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) - hgnc = relationship("Hgnc", back_populates="enas") + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) + hgnc: Mapped[Hgnc] = relationship("Hgnc", back_populates="enas") def __str__(self): return self.identifier @@ -202,11 +205,11 @@ class Enzyme(Base): """Class definition for the hgnc_enzyme table.""" __tablename__ = "hgnc_enzyme" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - ec_number = Column(String(50), index=True) + ec_number = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="enzymes") def __str__(self): @@ -217,11 +220,11 @@ class GeneGroupName(Base): """Class definition for the hgnc_gene_group_name table.""" __tablename__ = "hgnc_gene_group_name" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - name = Column(String(255)) + name = mapped_column(String(255)) - hgnc_id = 
Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="gene_group_names") def __str__(self): @@ -236,11 +239,11 @@ class GeneGroupId(Base): """Class definition for the hgnc_gene_group_id table.""" __tablename__ = "hgnc_gene_group_id" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(Integer) + identifier = mapped_column(Integer) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="gene_group_ids") def __str__(self): @@ -251,11 +254,11 @@ class UniProt(Base): """Class definition for the hgnc_uniprot table.""" __tablename__ = "hgnc_uniprot" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - accession = Column(String(50), index=True) + accession = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="uniprots") def __str__(self): @@ -266,11 +269,11 @@ class RnaCentral(Base): """Class definition for the hgnc_rna_central table.""" __tablename__ = "hgnc_rna_central" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(50), index=True) + identifier = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="rna_centrals") def __str__(self): @@ -281,11 +284,11 @@ class Rgd(Base): """Class definition for the hgnc_rgd table.""" __tablename__ = "hgnc_rgd" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(50), index=True) + identifier = mapped_column(String(50), 
index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="rgds") def __str__(self): @@ -296,11 +299,11 @@ class RefSeq(Base): """Class definition for the hgnc_refseq table.""" __tablename__ = "hgnc_refseq" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - accession = Column(String(50), index=True) + accession = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="refseqs") def __str__(self): @@ -311,11 +314,11 @@ class PubMed(Base): """Class definition for the hgnc_pubmed table.""" __tablename__ = "hgnc_pubmed" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - pmid = Column(Integer, index=True) + pmid = mapped_column(Integer, index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="pubmeds") def __str__(self): @@ -326,11 +329,11 @@ class PrevName(Base): """Class definition for the hgnc_prev_name table.""" __tablename__ = "hgnc_prev_name" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - prev_name = Column(String(255)) + prev_name = mapped_column(String(255)) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="prev_names") def __str__(self): @@ -341,11 +344,11 @@ class Omim(Base): """Class definition for the hgnc_omim table.""" __tablename__ = "hgnc_omim" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(Integer, index=True) + identifier = mapped_column(Integer, index=True) - hgnc_id = Column(Integer, 
ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="omims") def __str__(self): @@ -356,11 +359,11 @@ class Mgd(Base): """Class definition for the hgnc_mgd table.""" __tablename__ = "hgnc_mgd" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(String(50), index=True) + identifier = mapped_column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="mgds") def __str__(self): @@ -371,11 +374,11 @@ class Lsdb(Base): """Class definition for the hgnc_lsdb table.""" __tablename__ = "hgnc_lsdb" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - identifier = Column(Text) + identifier: Mapped[str] = mapped_column(Text) - hgnc_id = Column(Integer, ForeignKey("hgnc.id")) + hgnc_id: Mapped[int] = mapped_column(ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="lsdbs") def __str__(self): diff --git a/ebel/manager/rdbms/models/human_ortholog.py b/ebel/manager/rdbms/models/human_ortholog.py index a1ccd37..565d88c 100644 --- a/ebel/manager/rdbms/models/human_ortholog.py +++ b/ebel/manager/rdbms/models/human_ortholog.py @@ -1,6 +1,7 @@ """HGNC Human Ortholog RDBMS model definition.""" from sqlalchemy import Column, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,18 +13,18 @@ class HumanOrtholog(Base): __tablename__ = "human_ortholog" - id = Column(Integer, primary_key=True) - - hgnc_id = Column(String(20), index=True) - human_entrez_gene = Column(Integer) - human_ensembl_gene = Column(String(20)) - human_symbol = Column(String(50), index=True) - ortholog_species = Column(Integer, index=True) - ortholog_species_entrez_gene = 
Column(Integer) - ortholog_species_ensembl_gene = Column(String(50)) - ortholog_species_db_id = Column(String(50)) - ortholog_species_symbol = Column(String(50), index=True) - support = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + + hgnc_id: Mapped[str] = mapped_column(String(20), index=True) + human_entrez_gene: Mapped[int] = mapped_column() + human_ensembl_gene: Mapped[str] = mapped_column(String(20)) + human_symbol: Mapped[str] = mapped_column(String(50), index=True) + ortholog_species: Mapped[int] = mapped_column(index=True) + ortholog_species_entrez_gene: Mapped[int] = mapped_column() + ortholog_species_ensembl_gene: Mapped[str] = mapped_column(String(50)) + ortholog_species_db_id: Mapped[str] = mapped_column(String(50)) + ortholog_species_symbol: Mapped[str] = mapped_column(String(50), index=True) + support: Mapped[str] = mapped_column(Text) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/intact.py b/ebel/manager/rdbms/models/intact.py index ab5ac33..7067f37 100644 --- a/ebel/manager/rdbms/models/intact.py +++ b/ebel/manager/rdbms/models/intact.py @@ -1,6 +1,7 @@ """IntAct RDBMS model definition.""" from sqlalchemy import Column, Float, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -11,16 +12,16 @@ class Intact(Base): """Class definition for the intact table.""" __tablename__ = "intact" - id = Column(Integer, primary_key=True) - confidence_value = Column(Float, index=True) - detection_method = Column(String(100), index=True) - detection_method_psimi_id = Column(Integer) - int_a_uniprot_id = Column(String(50), index=True) - int_b_uniprot_id = Column(String(50), index=True) - interaction_ids = Column(Text) - interaction_type = Column(String(100), index=True) - interaction_type_psimi_id = Column(Integer) - pmid = Column(Integer) + id: Mapped[int] = 
mapped_column(primary_key=True) + confidence_value: Mapped[float] = mapped_column(index=True) + detection_method: Mapped[str] = mapped_column(String(100), index=True) + detection_method_psimi_id: Mapped[int] = mapped_column() + int_a_uniprot_id: Mapped[str] = mapped_column(String(50), index=True) + int_b_uniprot_id: Mapped[str] = mapped_column(String(50), index=True) + interaction_ids: Mapped[str] = mapped_column(Text) + interaction_type: Mapped[str] = mapped_column(String(100), index=True) + interaction_type_psimi_id: Mapped[int] = mapped_column() + pmid: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index 790a929..cc1c1b8 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -1,8 +1,10 @@ """IUPHAR RDBMS model definition.""" +from typing import List + from sqlalchemy import (BigInteger, Boolean, Column, ForeignKey, Integer, Numeric, String, Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -13,34 +15,34 @@ class IupharLigand(Base): """Class definition for the iuphar_ligand table.""" __tablename__ = "iuphar_ligand" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - name = Column(Text) - species = Column(Text) - type = Column(Text) - approved = Column(Boolean) - withdrawn = Column(Boolean) - labelled = Column(Boolean) - radioactive = Column(Boolean) - pubchem_sid = Column(BigInteger) - pubchem_cid = Column(Text) # TODO: This is a integer, but for import reasons this changed to text - uniprot_id = Column(Text) - ensembl_id = Column(Text) - ligand_subunit_ids = Column(Text) - ligand_subunit_name = Column(Text) - ligand_subunit_uni_prot_ids = Column(Text) - ligand_subunit_ensembl_ids = 
Column(Text) - iupac_name = Column(Text) - inn = Column(Text) - synonyms = Column(Text) - smiles = Column(Text) - inchi_key = Column(Text) - inchi = Column(Text) - gto_immu_pdb = Column(Boolean) - gto_mpdb = Column(Boolean) - antibacterial = Column(Boolean) + name: Mapped[str] = mapped_column(Text) + species: Mapped[str] = mapped_column(Text) + type: Mapped[str] = mapped_column(Text) + approved: Mapped[bool] = mapped_column() + withdrawn: Mapped[bool] = mapped_column() + labelled: Mapped[bool] = mapped_column() + radioactive: Mapped[bool] = mapped_column() + pubchem_sid: Mapped[int] = mapped_column() + pubchem_cid: Mapped[str] = mapped_column(Text) # TODO: This is a integer, but for import reasons this changed to text + uniprot_id: Mapped[str] = mapped_column(Text) + ensembl_id: Mapped[str] = mapped_column(Text) + ligand_subunit_ids: Mapped[str] = mapped_column(Text) + ligand_subunit_name: Mapped[str] = mapped_column(Text) + ligand_subunit_uni_prot_ids: Mapped[str] = mapped_column(Text) + ligand_subunit_ensembl_ids: Mapped[str] = mapped_column(Text) + iupac_name: Mapped[str] = mapped_column(Text) + inn: Mapped[str] = mapped_column(Text) + synonyms: Mapped[str] = mapped_column(Text) + smiles: Mapped[str] = mapped_column(Text) + inchi_key: Mapped[str] = mapped_column(Text) + inchi: Mapped[str] = mapped_column(Text) + gto_immu_pdb: Mapped[bool] = mapped_column() + gto_mpdb: Mapped[bool] = mapped_column() + antibacterial: Mapped[bool] = mapped_column() - interactions = relationship("IupharInteraction") + interactions: Mapped[List["IupharInteraction"]] = relationship("IupharInteraction") def as_dict(self): """Convert object values to dictionary.""" @@ -51,50 +53,50 @@ class IupharInteraction(Base): """Class definition for the iuphar_interaction table.""" __tablename__ = "iuphar_interaction" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - target = Column(String(255)) - target_id = Column(Integer) - target_subunit_ids = 
Column(Text) - target_gene_symbol = Column(String(100)) - target_uniprot = Column(String(100)) - target_ensembl_gene_id = Column(String(200)) - target_ligand = Column(String(100)) - target_ligand_id = Column(Integer) - target_ligand_subunit_ids = Column(Text) - target_ligand_gene_symbol = Column(String(50)) - target_ligand_uniprot_id = Column(String(200)) - target_ligand_ensembl_gene_id = Column(String(50)) - target_ligand_pubchem_sid = Column(Integer) - target_species = Column(String(100)) - ligand = Column(String(255)) - ligand_id = Column(Integer, ForeignKey("iuphar_ligand.id"), index=True) - ligand_subunit_ids = Column(Text) - ligand_gene_symbol = Column(String(50)) - ligand_species = Column(String(50)) - ligand_pubchem_sid = Column(Integer) - ligand_type = Column(Text) - approved = Column(Boolean) - type = Column(String(100)) - action = Column(String(100)) - action_comment = Column(String(255)) - selectivity = Column(String(50)) - endogenous = Column(Boolean) - primary_target = Column(Boolean) - concentration_range = Column(String(50)) - affinity_units = Column(String(10)) - affinity_high = Column(Numeric(6, 2)) - affinity_median = Column(Numeric(6, 2)) - affinity_low = Column(Numeric(6, 2)) - original_affinity_units = Column(String(10)) - original_affinity_low_nm = Column(Numeric(12, 3)) - original_affinity_median_nm = Column(Numeric(12, 3)) - original_affinity_high_nm = Column(Numeric(12, 3)) - original_affinity_relation = Column(String(1)) - assay_description = Column(Text) - receptor_site = Column(String(100)) - ligand_context = Column(String(50)) - pubmed_id = Column(Text) + target: Mapped[str] = mapped_column(String(255)) + target_id: Mapped[int] = mapped_column() + target_subunit_ids: Mapped[str] = mapped_column(Text) + target_gene_symbol: Mapped[str] = mapped_column(String(100)) + target_uniprot: Mapped[str] = mapped_column(String(100)) + target_ensembl_gene_id: Mapped[str] = mapped_column(String(200)) + target_ligand: Mapped[str] = 
mapped_column(String(100)) + target_ligand_id: Mapped[int] = mapped_column() + target_ligand_subunit_ids: Mapped[str] = mapped_column(Text) + target_ligand_gene_symbol: Mapped[str] = mapped_column(String(50)) + target_ligand_uniprot_id: Mapped[str] = mapped_column(String(200)) + target_ligand_ensembl_gene_id: Mapped[str] = mapped_column(String(50)) + target_ligand_pubchem_sid: Mapped[int] = mapped_column() + target_species: Mapped[str] = mapped_column(String(100)) + ligand: Mapped[str] = mapped_column(String(255)) + ligand_id: Mapped[int] = mapped_column(ForeignKey("iuphar_ligand.id"), index=True) + ligand_subunit_ids: Mapped[str] = mapped_column(Text) + ligand_gene_symbol: Mapped[str] = mapped_column(String(50)) + ligand_species: Mapped[str] = mapped_column(String(50)) + ligand_pubchem_sid: Mapped[int] = mapped_column() + ligand_type: Mapped[str] = mapped_column(Text) + approved: Mapped[bool] = mapped_column() + type: Mapped[str] = mapped_column(String(100)) + action: Mapped[str] = mapped_column(String(100)) + action_comment: Mapped[str] = mapped_column(String(255)) + selectivity: Mapped[str] = mapped_column(String(50)) + endogenous: Mapped[bool] = mapped_column() + primary_target: Mapped[bool] = mapped_column() + concentration_range: Mapped[str] = mapped_column(String(50)) + affinity_units: Mapped[str] = mapped_column(String(10)) + affinity_high: Mapped[float] = mapped_column(Numeric(6, 2)) + affinity_median: Mapped[float] = mapped_column(Numeric(6, 2)) + affinity_low: Mapped[float] = mapped_column(Numeric(6, 2)) + original_affinity_units: Mapped[str] = mapped_column(String(10)) + original_affinity_low_nm: Mapped[float] = mapped_column(Numeric(12, 3)) + original_affinity_median_nm: Mapped[float] = mapped_column(Numeric(12, 3)) + original_affinity_high_nm: Mapped[float] = mapped_column(Numeric(12, 3)) + original_affinity_relation: Mapped[str] = mapped_column(String(1)) + assay_description: Mapped[str] = mapped_column(Text) + receptor_site: Mapped[str] = 
mapped_column(String(100)) + ligand_context: Mapped[str] = mapped_column(String(50)) + pubmed_id: Mapped[str] = mapped_column(Text) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/kegg.py b/ebel/manager/rdbms/models/kegg.py index c5b07ee..d26d78d 100644 --- a/ebel/manager/rdbms/models/kegg.py +++ b/ebel/manager/rdbms/models/kegg.py @@ -1,6 +1,7 @@ """KEGG RDBMS model definition.""" from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -11,17 +12,17 @@ class Kegg(Base): """Class definition for the kegg table.""" __tablename__ = "kegg" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - pathway_identifier = Column(String(100)) - pathway_name = Column(String(1000)) - kegg_species_id = Column(String(100)) - kegg_gene_id_a = Column(String(100)) - gene_symbol_a = Column(String(100), index=True) - kegg_gene_id_b = Column(String(100)) - gene_symbol_b = Column(String(100), index=True) - kegg_int_type = Column(String(100)) - interaction_type = Column(String(50), index=True) + pathway_identifier: Mapped[str] = mapped_column(String(100)) + pathway_name: Mapped[str] = mapped_column(String(1000)) + kegg_species_id: Mapped[str] = mapped_column(String(100)) + kegg_gene_id_a: Mapped[str] = mapped_column(String(100)) + gene_symbol_a: Mapped[str] = mapped_column(String(100), index=True) + kegg_gene_id_b: Mapped[str] = mapped_column(String(100)) + gene_symbol_b: Mapped[str] = mapped_column(String(100), index=True) + kegg_int_type: Mapped[str] = mapped_column(String(100)) + interaction_type: Mapped[str] = mapped_column(String(50), index=True) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/mirtarbase.py b/ebel/manager/rdbms/models/mirtarbase.py index 6f5014e..700543f 100644 --- 
a/ebel/manager/rdbms/models/mirtarbase.py +++ b/ebel/manager/rdbms/models/mirtarbase.py @@ -1,6 +1,7 @@ """KEGG RDBMS model definition.""" from sqlalchemy import Column, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -11,17 +12,17 @@ class Mirtarbase(Base): """Class definition for the mirtarbase table.""" __tablename__ = "mirtarbase" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - mi_rtar_base_id = Column(String(20)) - mi_rna = Column(String(50)) - species_mi_rna = Column(String(50), index=True) - target_gene = Column(String(50), index=True) - target_gene_entrez_id = Column(Integer) - species_target_gene = Column(String(50), index=True) - experiments = Column(Text) - support_type = Column(String(50), index=True) - references_pmid = Column(Integer) + mi_rtar_base_id: Mapped[str] = mapped_column(String(20)) + mi_rna: Mapped[str] = mapped_column(String(50)) + species_mi_rna: Mapped[str] = mapped_column(String(50), index=True) + target_gene: Mapped[str] = mapped_column(String(50), index=True) + target_gene_entrez_id: Mapped[int] = mapped_column() + species_target_gene: Mapped[str] = mapped_column(String(50), index=True) + experiments: Mapped[str] = mapped_column(Text) + support_type: Mapped[str] = mapped_column(String(50), index=True) + references_pmid: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index 59a56f6..ade3a7b 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -1,7 +1,9 @@ """NCBI RDBMS model definition.""" +from typing import List + from sqlalchemy import Column, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm 
import relationship, mapped_column, Mapped from . import object_as_dict @@ -12,27 +14,35 @@ class NcbiGeneInfo(Base): """Class definition for the ncbi_gene_info table.""" __tablename__ = "ncbi_gene_info" - gene_id = Column(Integer, primary_key=True) - - tax_id = Column(Integer, index=True) - symbol = Column(String(100), index=True) - type_of_gene = Column(String(100), index=True) - locus_tag = Column(String(100)) - chromosome = Column(String(100)) - map_location = Column(String(100)) - description_id = Column(Integer, ForeignKey("ncbi_gene_info_description.id")) - description = relationship("NcbiGeneInfoDescription", foreign_keys=[description_id]) - xrefs = relationship("NcbiGeneInfoXref", back_populates="gene") - mims = relationship("NcbiGeneMim", foreign_keys="NcbiGeneMim.gene_id", back_populates="gene") - orthologs = relationship( + gene_id: Mapped[int] = mapped_column(primary_key=True) + + tax_id: Mapped[int] = mapped_column(index=True) + symbol: Mapped[str] = mapped_column(String(100), index=True) + type_of_gene: Mapped[str] = mapped_column(String(100), index=True) + locus_tag: Mapped[str] = mapped_column(String(100)) + chromosome: Mapped[str] = mapped_column(String(100)) + map_location: Mapped[str] = mapped_column(String(100)) + description_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info_description.id")) + description: Mapped["NcbiGeneInfoDescription"] = relationship( + "NcbiGeneInfoDescription", foreign_keys=[description_id] + ) + xrefs: Mapped[List["NcbiGeneInfoXref"]] = relationship("NcbiGeneInfoXref", back_populates="gene") + mims: Mapped[List["NcbiGeneMim"]] = relationship( + "NcbiGeneMim", foreign_keys="NcbiGeneMim.gene_id", back_populates="gene" + ) + orthologs: Mapped[List["NcbiGeneOrtholog"]] = relationship( "NcbiGeneOrtholog", foreign_keys="NcbiGeneOrtholog.gene_id", back_populates="gene", ) - ensembl_ids = relationship("NcbiGeneEnsembl", back_populates="genes") - gene_ids_right = relationship("NcbiGeneOnRight", 
foreign_keys="NcbiGeneOnRight.gene_id", back_populates="gene") - gene_ids_left = relationship("NcbiGeneOnLeft", foreign_keys="NcbiGeneOnLeft.gene_id", back_populates="gene") - gene_ids_overlapping = relationship( + ensembl_ids: Mapped[List["NcbiGeneEnsembl"]] = relationship("NcbiGeneEnsembl", back_populates="genes") + gene_ids_right: Mapped["NcbiGeneOnRight"] = relationship( + "NcbiGeneOnRight", foreign_keys="NcbiGeneOnRight.gene_id", back_populates="gene" + ) + gene_ids_left: Mapped["NcbiGeneOnLeft"] = relationship( + "NcbiGeneOnLeft", foreign_keys="NcbiGeneOnLeft.gene_id", back_populates="gene" + ) + gene_ids_overlapping: Mapped["NcbiGeneOverlapping"] = relationship( "NcbiGeneOverlapping", foreign_keys="NcbiGeneOverlapping.gene_id", back_populates="gene", @@ -60,76 +70,76 @@ class NcbiGeneInfoDescription(Base): """Class definition for the ncbi_gene_info_description table.""" __tablename__ = "ncbi_gene_info_description" - id = Column(Integer, primary_key=True, autoincrement=True) - description = Column(Text) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + description: Mapped[str] = mapped_column(Text) class NcbiGeneOnRight(Base): """Class definition for the ncbi_gene_on_right table.""" __tablename__ = "ncbi_gene_on_right" - id = Column(Integer, primary_key=True, autoincrement=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - gene_id_on_right = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + gene_id_on_right: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) - gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) + gene: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) class NcbiGeneOnLeft(Base): """Class definition for the ncbi_gene_on_left table.""" __tablename__ = "ncbi_gene_on_left" - id = Column(Integer, 
primary_key=True, autoincrement=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - gene_id_on_left = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + gene_id_on_left: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) - gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) + gene: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) class NcbiGeneOverlapping(Base): """Class definition for the ncbi_gene_overlapping table.""" __tablename__ = "ncbi_gene_overlapping" - id = Column(Integer, primary_key=True, autoincrement=True) - gene_id = gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - overlapping_gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + overlapping_gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) - gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) + gene: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) class NcbiGeneOrtholog(Base): """Class definition for the ncbi_gene_ortholog table.""" __tablename__ = "ncbi_gene_ortholog" - id = Column(Integer, primary_key=True, autoincrement=True) - tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - other_tax_id = Column(Integer, index=True) - other_gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + tax_id: Mapped[int] = mapped_column(index=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + other_tax_id: Mapped[int] = mapped_column(index=True) + other_gene_id: Mapped[int] = 
mapped_column(ForeignKey("ncbi_gene_info.gene_id")) - gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) + gene: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) class NcbiGenePubmed(Base): """Class definition for the ncbi_gene_pubmed table.""" __tablename__ = "ncbi_gene_pubmed" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - pub_med_id = Column(Integer) + tax_id: Mapped[int] = mapped_column(index=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + pub_med_id: Mapped[int] = mapped_column() class NcbiGeneInfoXref(Base): """Class definition for the ncbi_gene_info_xref table.""" __tablename__ = "ncbi_gene_info_xref" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - db = Column(String(100), index=True) - dbid = Column(String(100), index=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + db: Mapped[str] = mapped_column(String(100), index=True) + dbid: Mapped[str] = mapped_column(String(100), index=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) gene = relationship("NcbiGeneInfo", back_populates="xrefs") @@ -138,16 +148,16 @@ class NcbiGeneMim(Base): """Class definition for the ncbi_gene_mim table.""" __tablename__ = "ncbi_gene_mim" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - mim_number = Column(Integer) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - type = Column(String(100)) - source = Column(String(100)) - med_gen_cui = Column(String(100), index=True) - comment = Column(String(100)) + mim_number: Mapped[int] = mapped_column() + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) 
+ type: Mapped[str] = mapped_column(String(100)) + source: Mapped[str] = mapped_column(String(100)) + med_gen_cui: Mapped[str] = mapped_column(String(100), index=True) + comment: Mapped[str] = mapped_column(String(100)) - gene = relationship("NcbiGeneInfo", back_populates="mims") + gene: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", back_populates="mims") def as_dict(self): """Convert object values to dictionary.""" @@ -165,17 +175,17 @@ class NcbiGeneEnsembl(Base): """Class definition for the ncbi_gene_ensembl table.""" __tablename__ = "ncbi_gene_ensembl" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - ensembl_gene_identifier = Column(String(100)) - rna_nucleotide_accession_version = Column(String(100)) - ensembl_rna_identifier = Column(String(100)) - protein_accession_version = Column(String(100)) - ensembl_protein_identifier = Column(String(100)) + tax_id: Mapped[int] = mapped_column(index=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + ensembl_gene_identifier: Mapped[str] = mapped_column(String(100)) + rna_nucleotide_accession_version: Mapped[str] = mapped_column(String(100)) + ensembl_rna_identifier: Mapped[str] = mapped_column(String(100)) + protein_accession_version: Mapped[str] = mapped_column(String(100)) + ensembl_protein_identifier: Mapped[str] = mapped_column(String(100)) - genes = relationship("NcbiGeneInfo", back_populates="ensembl_ids") + genes: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", back_populates="ensembl_ids") def as_dict(self): """Convert object values to dictionary.""" @@ -194,15 +204,15 @@ class NcbiGeneGo(Base): """Class definition for the ncbi_gene_go table.""" __tablename__ = "ncbi_gene_go" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = 
mapped_column(primary_key=True, autoincrement=True) - tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) - go_id = Column(String(100), index=True) - evidence = Column(String(10)) - qualifier = Column(String(100)) - go_term = Column(String(255)) - category = Column(String(10)) + tax_id: Mapped[int] = mapped_column(index=True) + gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) + go_id: Mapped[str] = mapped_column(String(100), index=True) + evidence: Mapped[str] = mapped_column(String(10)) + qualifier: Mapped[str] = mapped_column(String(100)) + go_term: Mapped[str] = mapped_column(String(255)) + category: Mapped[str] = mapped_column(String(10)) pmids = relationship("NcbiGeneGoPmid", back_populates="gos") @@ -224,25 +234,25 @@ class NcbiGeneGoPmid(Base): """Class definition for the ncbi_gene_go_pmid table.""" __tablename__ = "ncbi_gene_go_pmid" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - ncbi_gene_go_id = Column(Integer, ForeignKey("ncbi_gene_go.id")) - pmid = Column(Integer) + ncbi_gene_go_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_go.id")) + pmid: Mapped[int] = mapped_column() - gos = relationship("NcbiGeneGo", back_populates="pmids") + gos: Mapped[List[NcbiGeneGo]] = relationship("NcbiGeneGo", back_populates="pmids") class NcbiMedGenName(Base): """Class definition for the ncbi_medgen_name table.""" __tablename__ = "ncbi_medgen_name" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - cui = Column(String(100)) - name = Column(Text) - source = Column(String(100)) - suppress = Column(String(1)) - pmids = relationship("NcbiMedGenPmid", back_populates="med_gen_name") + cui: Mapped[str] = mapped_column(String(100)) + name: Mapped[str] = mapped_column(Text) + source: Mapped[str] = mapped_column(String(100)) + 
suppress: Mapped[str] = mapped_column(String(1)) + pmids: Mapped[List["NcbiMedGenPmid"]] = relationship("NcbiMedGenPmid", back_populates="med_gen_name") def as_dict(self): """Convert object values to dictionary.""" @@ -255,10 +265,10 @@ class NcbiMedGenPmid(Base): """Class definition for the ncbi_medgen_pmid table.""" __tablename__ = "ncbi_medgen_pmid" - id = Column(Integer, primary_key=True, autoincrement=True) + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - ncbi_medgen_name_id = Column(Integer, ForeignKey("ncbi_medgen_name.id")) - pmid = Column(Integer, index=True) + ncbi_medgen_name_id: Mapped[int] = mapped_column(ForeignKey("ncbi_medgen_name.id")) + pmid: Mapped[int] = mapped_column(index=True) med_gen_name = relationship("NcbiMedGenName", back_populates="pmids") diff --git a/ebel/manager/rdbms/models/nsides.py b/ebel/manager/rdbms/models/nsides.py index ef5da64..2d92762 100644 --- a/ebel/manager/rdbms/models/nsides.py +++ b/ebel/manager/rdbms/models/nsides.py @@ -2,6 +2,7 @@ from sqlalchemy import Column, Float, Index, Integer, String from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -21,24 +22,24 @@ class Nsides(Base): "mean_reporting_frequency", ), ) - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - drug_rxnorn_id = Column(String(20), index=True) # This has to be a String because of mapping to drugbank ids - drug_concept_name = Column(String(255), index=True) + drug_rxnorn_id: Mapped[str] = mapped_column(String(20), index=True) # This has to be a String because of mapping to drugbank ids + drug_concept_name: Mapped[str] = mapped_column(String(255), index=True) - source = Column(String(10), index=True) + source: Mapped[str] = mapped_column(String(10), index=True) - condition_meddra_id = Column(Integer) - condition_concept_name = Column(String(255), index=True) + 
condition_meddra_id: Mapped[int] = mapped_column() + condition_concept_name: Mapped[str] = mapped_column(String(255), index=True) # OFFSIDES specific - a = Column(Integer) - b = Column(Integer) - c = Column(Integer) - d = Column(Integer) - prr = Column(Float) - prr_error = Column(Float) - mean_reporting_frequency = Column(Float, index=True) + a = mapped_column(Integer) + b = mapped_column(Integer) + c = mapped_column(Integer) + d = mapped_column(Integer) + prr = mapped_column(Float) + prr_error = mapped_column(Float) + mean_reporting_frequency = mapped_column(Float, index=True) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/pathway_commons.py b/ebel/manager/rdbms/models/pathway_commons.py index 5478aed..fde22a0 100644 --- a/ebel/manager/rdbms/models/pathway_commons.py +++ b/ebel/manager/rdbms/models/pathway_commons.py @@ -1,7 +1,9 @@ """Pathway Commons RDBMS model definition.""" +from typing import List + from sqlalchemy import BigInteger, Column, ForeignKey, Integer, String, Table from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -36,21 +38,23 @@ class PathwayCommons(Base): """Class definition for the pathway_commons table.""" __tablename__ = "pathway_commons" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - participant_a = Column(String(50), index=True) - interaction_type = Column(String(50), index=True) - participant_b = Column(String(50), index=True) + participant_a: Mapped[str] = mapped_column(String(50), index=True) + interaction_type: Mapped[str] = mapped_column(String(50), index=True) + participant_b: Mapped[str] = mapped_column(String(50), index=True) - pmids = relationship("Pmid", back_populates="pathway_commons") + pmids: Mapped[List["Pmid"]] = relationship("Pmid", 
back_populates="pathway_commons") - pathway_names = relationship( + pathway_names: Mapped[List["PathwayName"]] = relationship( "PathwayName", secondary=pathway_commons__pathway_name, back_populates="pathway_commonses", ) - sources = relationship("Source", secondary=pathway_commons__source, back_populates="pathway_commonses") + sources: Mapped[List["Source"]] = relationship( + "Source", secondary=pathway_commons__source, back_populates="pathway_commonses" + ) def __str__(self): return f"{self.participant_a} {self.interaction_type} {self.participant_b}" @@ -68,11 +72,11 @@ class PathwayName(Base): """Class definition for the pathway_commons_pathway_name table.""" __tablename__ = "pathway_commons_pathway_name" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - name = Column(String(255), index=True) + name: Mapped[str] = mapped_column(String(255), index=True) - pathway_commonses = relationship( + pathway_commonses: Mapped[List[PathwayCommons]] = relationship( "PathwayCommons", secondary=pathway_commons__pathway_name, back_populates="pathway_names", @@ -87,12 +91,12 @@ class Pmid(Base): """Class definition for the pathway_commons_pmid table.""" __tablename__ = "pathway_commons_pmid" - id = Column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - pmid = Column(BigInteger, index=True) + pmid: Mapped[int] = mapped_column(index=True) - pathway_commons_id = Column(Integer, ForeignKey("pathway_commons.id"), index=True) - pathway_commons = relationship("PathwayCommons", back_populates="pmids") + pathway_commons_id: Mapped[int] = mapped_column(ForeignKey("pathway_commons.id"), index=True) + pathway_commons: Mapped[List[PathwayCommons]] = relationship("PathwayCommons", back_populates="pmids") def __str__(self): """Class string definition.""" @@ -103,11 +107,13 @@ class Source(Base): """Class definition for the pathway_commons_source table.""" __tablename__ = "pathway_commons_source" - id = Column(Integer, 
primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - source = Column(String(50)) + source: Mapped[str] = mapped_column(String(50)) - pathway_commonses = relationship("PathwayCommons", secondary=pathway_commons__source, back_populates="sources") + pathway_commonses: Mapped[List[PathwayCommons]] = relationship( + "PathwayCommons", secondary=pathway_commons__source, back_populates="sources" + ) def __str__(self): """Class string definition.""" diff --git a/ebel/manager/rdbms/models/protein_atlas.py b/ebel/manager/rdbms/models/protein_atlas.py index 167a33a..a1a57e7 100644 --- a/ebel/manager/rdbms/models/protein_atlas.py +++ b/ebel/manager/rdbms/models/protein_atlas.py @@ -1,6 +1,7 @@ """Protein Atlas RDBMS model definition.""" from sqlalchemy import Column, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column Base = declarative_base() @@ -9,14 +10,14 @@ class ProteinAtlasNormalTissue(Base): """Class definition for the protein_atlas_normal_tissue table.""" __tablename__ = "protein_atlas_normal_tissue" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - gene = Column(String(100), index=True) - gene_name = Column(String(100)) - tissue = Column(String(100)) - cell_type = Column(String(100)) - level = Column(String(100), index=True) - reliability = Column(String(100), index=True) + gene = mapped_column(String(100), index=True) + gene_name = mapped_column(String(100)) + tissue = mapped_column(String(100)) + cell_type = mapped_column(String(100)) + level = mapped_column(String(100), index=True) + reliability = mapped_column(String(100), index=True) def as_dict(self): """Convert object values to dictionary.""" @@ -34,22 +35,22 @@ class ProteinAtlasSubcellularLocation(Base): """Class definition for the protein_atlas_subcellular_location table.""" __tablename__ = "protein_atlas_subcellular_location" - id = Column(Integer, primary_key=True) - - 
gene = Column(String(100)) - gene_name = Column(String(100)) - reliability = Column(String(100)) - main_location = Column(String(100)) - additional_location = Column(String(100)) - extracellular_location = Column(String(100)) - enhanced = Column(String(100)) - supported = Column(String(100)) - approved = Column(String(100)) - uncertain = Column(String(100)) - single_cell_variation_intensity = Column(String(100)) - single_cell_variation_spatial = Column(String(100)) - cell_cycle_dependency = Column(Text) - go_id = Column(Text) + id = mapped_column(Integer, primary_key=True) + + gene = mapped_column(String(100)) + gene_name = mapped_column(String(100)) + reliability = mapped_column(String(100)) + main_location = mapped_column(String(100)) + additional_location = mapped_column(String(100)) + extracellular_location = mapped_column(String(100)) + enhanced = mapped_column(String(100)) + supported = mapped_column(String(100)) + approved = mapped_column(String(100)) + uncertain = mapped_column(String(100)) + single_cell_variation_intensity = mapped_column(String(100)) + single_cell_variation_spatial = mapped_column(String(100)) + cell_cycle_dependency = mapped_column(Text) + go_id = mapped_column(Text) def as_dict(self): """Convert object values to dictionary.""" @@ -75,12 +76,12 @@ class ProteinAtlasRnaTissueConsensus(Base): """Class definition for the protein_atlas_rna_tissue_consensus table.""" __tablename__ = "protein_atlas_rna_tissue_consensus" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - gene = Column(String(100), index=True) - gene_name = Column(String(100), index=True) - tissue = Column(String(100), index=True) - n_tpm = Column(Numeric(8, 1)) + gene = mapped_column(String(100), index=True) + gene_name = mapped_column(String(100), index=True) + tissue = mapped_column(String(100), index=True) + n_tpm = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" @@ -96,14 +97,14 @@ class 
ProteinAtlasRnaBrainGtex(Base): """Class definition for the protein_atlas_rna_brain_gtex table.""" __tablename__ = "protein_atlas_rna_brain_gtex" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - gene = Column(String(100), index=True) - gene_name = Column(String(100), index=True) - brain_region = Column(String(100), index=True) - tpm = Column(Numeric(8, 1)) - p_tpm = Column(Numeric(8, 1)) - n_tpm = Column(Numeric(8, 1)) + gene = mapped_column(String(100), index=True) + gene_name = mapped_column(String(100), index=True) + brain_region = mapped_column(String(100), index=True) + tpm = mapped_column(Numeric(8, 1)) + p_tpm = mapped_column(Numeric(8, 1)) + n_tpm = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" @@ -121,14 +122,14 @@ class ProteinAtlasRnaBrainFantom(Base): """Class definition for the protein_atlas_rna_brain_fantom table.""" __tablename__ = "protein_atlas_rna_brain_fantom" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - gene = Column(String(100)) - gene_name = Column(String(100)) - brain_region = Column(String(100)) - tags_per_million = Column(String(100)) - scaled_tags_per_million = Column(String(100)) - n_tpm = Column(String(100)) + gene = mapped_column(String(100)) + gene_name = mapped_column(String(100)) + brain_region = mapped_column(String(100)) + tags_per_million = mapped_column(String(100)) + scaled_tags_per_million = mapped_column(String(100)) + n_tpm = mapped_column(String(100)) def as_dict(self): """Convert object values to dictionary.""" @@ -146,12 +147,12 @@ class ProteinAtlasRnaMouseBrainAllen(Base): """Class definition for the protein_atlas_rna_mouse_brain_allen table.""" __tablename__ = "protein_atlas_rna_mouse_brain_allen" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - gene = Column(String(100)) - gene_name = Column(String(100)) - brain_region = Column(String(100)) - 
expression_energy = Column(Numeric(8, 1)) + gene = mapped_column(String(100)) + gene_name = mapped_column(String(100)) + brain_region = mapped_column(String(100)) + expression_energy = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/reactome.py b/ebel/manager/rdbms/models/reactome.py index 0624899..e986cb1 100644 --- a/ebel/manager/rdbms/models/reactome.py +++ b/ebel/manager/rdbms/models/reactome.py @@ -1,6 +1,7 @@ """Reactome RDBMS model definition.""" from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column from ebel.manager.rdbms.models import object_as_dict @@ -11,12 +12,12 @@ class Reactome(Base): """Class definition for the reactome table.""" __tablename__ = "reactome" - id = Column(Integer, primary_key=True) - identifier = Column(String(50), index=True) - uniprot_accession = Column(String(50), index=True) - organism = Column(String(255)) - name = Column(String(255)) - evidence_type = Column(String(255)) + id = mapped_column(Integer, primary_key=True) + identifier = mapped_column(String(50), index=True) + uniprot_accession = mapped_column(String(50), index=True) + organism = mapped_column(String(255)) + name = mapped_column(String(255)) + evidence_type = mapped_column(String(255)) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/stringdb.py b/ebel/manager/rdbms/models/stringdb.py index 31842a5..df1fedf 100644 --- a/ebel/manager/rdbms/models/stringdb.py +++ b/ebel/manager/rdbms/models/stringdb.py @@ -2,6 +2,7 @@ from sqlalchemy import Boolean, Column, Integer, SmallInteger, String from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import mapped_column from ebel.manager.rdbms.models import object_as_dict @@ -13,26 +14,26 @@ class StringDb(Base): __tablename__ = "stringdb" - id = Column(Integer, primary_key=True) - - 
protein1 = Column(String(50), nullable=False) - protein2 = Column(String(50), nullable=False) - symbol1 = Column(String(50), nullable=False, index=True) - symbol2 = Column(String(50), nullable=False, index=True) - neighborhood = Column(Integer) - neighborhood_transferred = Column(SmallInteger) - fusion = Column(SmallInteger) - cooccurence = Column(SmallInteger) - homology = Column(SmallInteger) - coexpression = Column(SmallInteger) - coexpression_transferred = Column(SmallInteger) - experiments = Column(SmallInteger, index=True) - experiments_transferred = Column(SmallInteger) - database = Column(Integer) - database_transferred = Column(SmallInteger) - textmining = Column(SmallInteger) - textmining_transferred = Column(SmallInteger) - combined_score = Column(SmallInteger) + id = mapped_column(Integer, primary_key=True) + + protein1 = mapped_column(String(50), nullable=False) + protein2 = mapped_column(String(50), nullable=False) + symbol1 = mapped_column(String(50), nullable=False, index=True) + symbol2 = mapped_column(String(50), nullable=False, index=True) + neighborhood = mapped_column(Integer) + neighborhood_transferred = mapped_column(SmallInteger) + fusion = mapped_column(SmallInteger) + cooccurence = mapped_column(SmallInteger) + homology = mapped_column(SmallInteger) + coexpression = mapped_column(SmallInteger) + coexpression_transferred = mapped_column(SmallInteger) + experiments = mapped_column(SmallInteger, index=True) + experiments_transferred = mapped_column(SmallInteger) + database = mapped_column(Integer) + database_transferred = mapped_column(SmallInteger) + textmining = mapped_column(SmallInteger) + textmining_transferred = mapped_column(SmallInteger) + combined_score = mapped_column(SmallInteger) def as_dict(self): """Convert object values to dictionary.""" @@ -44,9 +45,9 @@ class StringDbProtein(Base): __tablename__ = "stringdb_protein" - id = Column(Integer, primary_key=True) - string_protein_id = Column(String(50), nullable=False, index=True) - 
preferred_name = Column(String(50), nullable=False, index=True) + id = mapped_column(Integer, primary_key=True) + string_protein_id = mapped_column(String(50), nullable=False, index=True) + preferred_name = mapped_column(String(50), nullable=False, index=True) def as_dict(self): """Convert object values to dictionary.""" @@ -57,16 +58,16 @@ class StringDbAction(Base): """Class definition for the stringdb_action table.""" __tablename__ = "stringdb_action" - id = Column(Integer, primary_key=True) - item_id_a = Column(String(50), nullable=False) - item_id_b = Column(String(50), nullable=False) - symbol1 = Column(String(50), nullable=False, index=True) - symbol2 = Column(String(50), nullable=False, index=True) - mode = Column(String(20), nullable=False, index=True) - action = Column(String(20)) - is_directional = Column(Boolean, nullable=False, index=True) - a_is_acting = Column(Boolean, nullable=False, index=True) - score = Column(SmallInteger) + id = mapped_column(Integer, primary_key=True) + item_id_a = mapped_column(String(50), nullable=False) + item_id_b = mapped_column(String(50), nullable=False) + symbol1 = mapped_column(String(50), nullable=False, index=True) + symbol2 = mapped_column(String(50), nullable=False, index=True) + mode = mapped_column(String(20), nullable=False, index=True) + action = mapped_column(String(20)) + is_directional = mapped_column(Boolean, nullable=False, index=True) + a_is_acting = mapped_column(Boolean, nullable=False, index=True) + score = mapped_column(SmallInteger) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/uniprot.py b/ebel/manager/rdbms/models/uniprot.py index 39a86f3..331ea82 100644 --- a/ebel/manager/rdbms/models/uniprot.py +++ b/ebel/manager/rdbms/models/uniprot.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm 
import relationship, mapped_column Base = declarative_base() @@ -45,16 +45,16 @@ class Uniprot(Base): __tablename__ = "uniprot" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - accession = Column(String(20), unique=True) - name = Column(String(100), nullable=False, unique=True) - recommended_name = Column(String(255), nullable=True) + accession = mapped_column(String(20), unique=True) + name = mapped_column(String(100), nullable=False, unique=True) + recommended_name = mapped_column(String(255), nullable=True) - taxid = Column(Integer, ForeignKey("uniprot_organism.taxid"), nullable=False, index=True) + taxid = mapped_column(Integer, ForeignKey("uniprot_organism.taxid"), nullable=False, index=True) organism = relationship("Organism") - function_id = Column(Integer, ForeignKey("uniprot_function.id"), nullable=True) + function_id = mapped_column(Integer, ForeignKey("uniprot_function.id"), nullable=True) function = relationship("Function") gene_names = relationship("Gene", back_populates="uniprot") @@ -103,9 +103,9 @@ class GeneSymbol(Base): """Class definition for the uniprot_gene_symbol table.""" __tablename__ = "uniprot_gene_symbol" - id = Column(Integer, primary_key=True) - symbol = Column(String(100), nullable=False, index=True) - uniprot_id = Column(Integer, ForeignKey("uniprot.id")) + id = mapped_column(Integer, primary_key=True) + symbol = mapped_column(String(100), nullable=False, index=True) + uniprot_id = mapped_column(Integer, ForeignKey("uniprot.id")) uniprot = relationship("Uniprot", back_populates="gene_symbol") def __repr__(self): @@ -117,9 +117,9 @@ class Gene(Base): """Class definition for the uniprot_gene table.""" __tablename__ = "uniprot_gene" - id = Column(Integer, primary_key=True) - name = Column(String(100), nullable=False, index=True) - uniprot_id = Column(Integer, ForeignKey("uniprot.id")) + id = mapped_column(Integer, primary_key=True) + name = mapped_column(String(100), nullable=False, index=True) + 
uniprot_id = mapped_column(Integer, ForeignKey("uniprot.id")) uniprot = relationship("Uniprot", back_populates="gene_names") @@ -128,8 +128,8 @@ class Keyword(Base): __tablename__ = "uniprot_keyword" - keywordid = Column(Integer, primary_key=True) - keyword_name = Column(String(100), index=True) + keywordid = mapped_column(Integer, primary_key=True) + keyword_name = mapped_column(String(100), index=True) uniprots = relationship("Uniprot", secondary=uniprot__uniprot_keyword, back_populates="keywords") @@ -143,8 +143,8 @@ class Organism(Base): __tablename__ = "uniprot_organism" - taxid = Column(Integer, primary_key=True) - scientific_name = Column(String(255)) # TODO:Check if index=True with is possible + taxid = mapped_column(Integer, primary_key=True) + scientific_name = mapped_column(String(255)) # TODO:Check if index=True with is possible uniprots = relationship("Uniprot", secondary=uniprot__uniprot_host, back_populates="hosts") @@ -154,9 +154,9 @@ class SubcellularLocation(Base): __tablename__ = "uniprot_subcellular_location" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - name = Column(String(100), index=True) + name = mapped_column(String(100), index=True) uniprots = relationship( "Uniprot", @@ -170,10 +170,10 @@ class Xref(Base): __tablename__ = "uniprot_xref" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - db = Column(String(50), index=True) - identifier = Column(String(100), index=True) + db = mapped_column(String(50), index=True) + identifier = mapped_column(String(100), index=True) uniprots = relationship("Uniprot", secondary=uniprot__uniprot_xref, back_populates="xrefs") @@ -183,8 +183,8 @@ class Function(Base): __tablename__ = "uniprot_function" - id = Column(Integer, primary_key=True) + id = mapped_column(Integer, primary_key=True) - description = Column(Text) + description = mapped_column(Text) uniprots = relationship("Uniprot", back_populates="function") diff --git 
a/pyproject.toml b/pyproject.toml index 030a032..82b66a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,27 +39,27 @@ Issues = 'https://github.com/e-bel/ebel/issues' Documentation = 'https://ebel.readthedocs.io/en/latest/' [tool.poetry.dependencies] -lark-parser = "^0.11.2" +lark-parser = "^0.11.3" click = "^7.1.2" -requests = "^2.25.1" -tqdm = "^4.59.0" -pandas = "^1.2.4" -sqlalchemy = "^1.4.46" -SQLAlchemy-Utils = "^0.37.7" +requests = "^2.31.0" +tqdm = "^4.66.1" +pandas = "^1.5.3" +sqlalchemy = "^2.0.20" +SQLAlchemy-Utils = "^0.37.9" xlwt = "^1.3.0" xlrd = "^2.0.1" -xlsxwriter = "^1.3.8" +xlsxwriter = "^1.4.5" xmltodict = "^0.12.0" -GitPython = "^3.1.14" -lxml = "^4.6.5" -flask = "^2.0.1" +GitPython = "^3.1.36" +lxml = "^4.9.3" +flask = "^2.2.5" flask_cors = "^3.0.10" -connexion = {version = "^2.14.1", extras = ["swagger-ui"]} -cryptography = "^3.4.7" -openpyxl = "^3.0.10" +connexion = {version = "^2.14.2", extras = ["swagger-ui"]} +cryptography = "^3.4.8" +openpyxl = "^3.1.2" graphviz = "0.20" pyorientdb = "^1.0.0" -PyMySQL = "^1.0.2" +PyMySQL = "^1.1.0" python = "^3.9" mkdocstrings = {version = "^0.18", extras = ["python"]} diff --git a/requirements.txt b/requirements.txt index f00f628..920280b 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,21 @@ -lark-parser==0.11.2 +lark-parser==0.11.3 click>=7.1.2 -requests>=2.25.1 -tqdm>=4.59.0 -pandas>=1.2.4 -sqlalchemy>=1.4.15 -SQLAlchemy-Utils==0.37.7 +requests>=2.31.0 +tqdm>=4.66.1 +pandas>=1.5.3 +sqlalchemy>=2.0.20 +SQLAlchemy-Utils==0.37.9 xlwt==1.3.0 xlrd==2.0.1 -xlsxwriter==1.3.8 -pymysql==1.0.2 +xlsxwriter==1.4.5 +pymysql==1.1.0 xmltodict==0.12.0 -GitPython==3.1.14 -lxml>=4.6.5 -flask==2.0.1 +GitPython==3.1.36 +lxml>=4.9.3 +flask==2.2.5 flask_cors==3.0.10 -connexion[swagger-ui]==2.14.1 -cryptography==3.4.7 -openpyxl==3.0.7 -graphviz -pyorientdb \ No newline at end of file +connexion[swagger-ui]==2.14.2 +cryptography==3.4.8 +openpyxl==3.1.2 +graphviz==0.20 +pyorientdb==1.0.0 \ No newline 
at end of file From 86022d86fc8afcdb444327affc58eccc8aeaba83 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 15 Sep 2023 13:04:57 +0200 Subject: [PATCH 02/58] feat: finish updating models for sqla2 and blacken code --- ebel/database.py | 8 +- ebel/manager/models.py | 3 +- ebel/manager/orientdb/biodbs/bel.py | 35 +++++-- ebel/manager/orientdb/biodbs/biogrid.py | 1 + ebel/manager/orientdb/odb_meta.py | 7 +- ebel/manager/orientdb/odb_structure.py | 3 +- ebel/manager/rdbms/models/chebi.py | 25 +++-- .../rdbms/models/clinical_trials_gov.py | 34 +++---- ebel/manager/rdbms/models/drugbank.py | 12 ++- ebel/manager/rdbms/models/hgnc.py | 3 +- ebel/manager/rdbms/models/iuphar.py | 7 +- ebel/manager/rdbms/models/nsides.py | 4 +- ebel/manager/rdbms/models/protein_atlas.py | 96 +++++++++---------- ebel/manager/rdbms/models/reactome.py | 14 +-- ebel/manager/rdbms/models/stringdb.py | 68 ++++++------- ebel/manager/rdbms/models/uniprot.py | 86 +++++++++-------- ebel/web/api/ebel/v1/bel.py | 3 +- .../web/api/ebel/v1/bel_against_expression.py | 3 +- ebel/web/api/ebel/v1/biogrid.py | 12 ++- ebel/web/api/ebel/v1/clinical_trials_gov.py | 3 +- ebel/web/api/ebel/v1/clinvar.py | 9 +- ebel/web/api/ebel/v1/disgenet.py | 8 +- ebel/web/api/ebel/v1/drugbank.py | 3 +- ebel/web/api/ebel/v1/expression_atlas.py | 11 ++- ebel/web/api/ebel/v1/intact.py | 3 +- ebel/web/api/ebel/v1/kegg.py | 3 +- ebel/web/api/ebel/v1/pathway_commons.py | 12 ++- ebel/web/api/ebel/v1/uniprot.py | 3 +- 28 files changed, 259 insertions(+), 220 deletions(-) diff --git a/ebel/database.py b/ebel/database.py index 7a006ec..a972cbe 100644 --- a/ebel/database.py +++ b/ebel/database.py @@ -6,9 +6,11 @@ import pymysql from pyorientdb import OrientDB -from pyorientdb.exceptions import (PyOrientCommandException, - PyOrientConnectionException, - PyOrientSecurityAccessException) +from pyorientdb.exceptions import ( + PyOrientCommandException, + PyOrientConnectionException, + PyOrientSecurityAccessException, +) from 
ebel.config import get_config_as_dict, write_to_config from ebel.constants import TerminalFormatting as TF diff --git a/ebel/manager/models.py b/ebel/manager/models.py index 1ab5587..010ef16 100755 --- a/ebel/manager/models.py +++ b/ebel/manager/models.py @@ -20,8 +20,7 @@ from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm -from ebel.constants import (FILE, GRAMMAR_NS_ANNO_PATH, GRAMMAR_START_ANNO, - GRAMMAR_START_NS, URL) +from ebel.constants import FILE, GRAMMAR_NS_ANNO_PATH, GRAMMAR_START_ANNO, GRAMMAR_START_NS, URL from ebel.tools import BelRdb Base = declarative_base() diff --git a/ebel/manager/orientdb/biodbs/bel.py b/ebel/manager/orientdb/biodbs/bel.py index d5a0f20..126eab6 100644 --- a/ebel/manager/orientdb/biodbs/bel.py +++ b/ebel/manager/orientdb/biodbs/bel.py @@ -30,14 +30,29 @@ from ebel.manager.orientdb.biodbs.reactome import Reactome from ebel.manager.orientdb.biodbs.stringdb import StringDb from ebel.manager.orientdb.biodbs.uniprot import UniProt -from ebel.manager.orientdb.constants import (BIOGRID, CHEBI, CLINICAL_TRIALS, - CLINVAR, DISGENET, DRUGBANK, - ENSEMBL, EXPRESSION_ATLAS, - GWAS_CATALOG, HGNC, INTACT, - IUPHAR, KEGG, MIRTARBASE, NCBI, - NSIDES, PATHWAY_COMMONS, - PROTEIN_ATLAS, REACTOME, STRINGDB, - UNIPROT) +from ebel.manager.orientdb.constants import ( + BIOGRID, + CHEBI, + CLINICAL_TRIALS, + CLINVAR, + DISGENET, + DRUGBANK, + ENSEMBL, + EXPRESSION_ATLAS, + GWAS_CATALOG, + HGNC, + INTACT, + IUPHAR, + KEGG, + MIRTARBASE, + NCBI, + NSIDES, + PATHWAY_COMMONS, + PROTEIN_ATLAS, + REACTOME, + STRINGDB, + UNIPROT, +) from ebel.manager.orientdb.importer import _BelImporter from ebel.manager.orientdb.odb_defaults import bel_func_short from ebel.manager.orientdb.odb_meta import Graph @@ -681,6 +696,8 @@ def update_interactions(self) -> int: """Abstract method.""" pass + if __name__ == "__main__": b = Bel() - b.clinical_trials.update() \ No newline at end of file + b.clinical_trials.recreate_tables() + 
b.clinical_trials.update() diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index 63df5a5..277998f 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -21,6 +21,7 @@ logger = logging.getLogger(__name__) + class BioGridNode: """Custom class definition for BioGRID nodes.""" diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 10c1972..db5cff7 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -21,9 +21,7 @@ import sqlalchemy as sqla import xmltodict from pyorientdb import OrientDB, orient -from pyorientdb.exceptions import (PyOrientCommandException, - PyOrientIndexException, - PyOrientSecurityAccessException) +from pyorientdb.exceptions import PyOrientCommandException, PyOrientIndexException, PyOrientSecurityAccessException from pyorientdb.otypes import OrientRecord from sqlalchemy import text from sqlalchemy.sql.schema import Table @@ -35,8 +33,7 @@ from ebel.config import get_config_as_dict, get_config_value, write_to_config from ebel.constants import DEFAULT_ODB, RID from ebel.manager.orientdb import urls as default_urls -from ebel.manager.orientdb.odb_structure import (Edge, Generic, Node, OClass, - OIndex, OProperty) +from ebel.manager.orientdb.odb_structure import Edge, Generic, Node, OClass, OIndex, OProperty from ebel.tools import BelRdb, chunks, get_file_path, get_standard_name type_map_inverse = {v: k for k, v in orient.type_map.items()} diff --git a/ebel/manager/orientdb/odb_structure.py b/ebel/manager/orientdb/odb_structure.py index a0fceb3..e4e830b 100755 --- a/ebel/manager/orientdb/odb_structure.py +++ b/ebel/manager/orientdb/odb_structure.py @@ -9,8 +9,7 @@ from enum import Enum from typing import Dict, List, Optional, Tuple -from ebel.manager.orientdb.odb_defaults import (ODataType, OIndexType, - normalized_pmod) +from ebel.manager.orientdb.odb_defaults import ODataType, OIndexType, 
normalized_pmod class OClassType(Enum): diff --git a/ebel/manager/rdbms/models/chebi.py b/ebel/manager/rdbms/models/chebi.py index 99876ff..52d3120 100644 --- a/ebel/manager/rdbms/models/chebi.py +++ b/ebel/manager/rdbms/models/chebi.py @@ -1,9 +1,8 @@ """CHEBI RDBMS model definition.""" import datetime -from typing import List +from typing import List, Optional -from sqlalchemy import (DateTime, ForeignKey, Index, Integer, String, - Text) +from sqlalchemy import DateTime, ForeignKey, Index, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship, mapped_column, Mapped @@ -16,7 +15,7 @@ class ChemicalData(Base): __tablename__ = "chebi_chemical_data" id: Mapped[int] = mapped_column(primary_key=True) - chemical_data: Mapped[str] = mapped_column(Text, nullable=True) + chemical_data: Mapped[Optional[str]] = mapped_column(Text) source: Mapped[str] = mapped_column(Text, nullable=False) type: Mapped[str] = mapped_column(Text, nullable=False) @@ -68,15 +67,15 @@ class Compound(Base): __tablename__ = "chebi_compound" id: Mapped[int] = mapped_column(primary_key=True) - name: Mapped[str] = mapped_column(String(2000), nullable=True) + name: Mapped[Optional[str]] = mapped_column(String(2000)) source: Mapped[str] = mapped_column(String(32), nullable=False) - parent_id: Mapped[int] = mapped_column(nullable=True) + parent_id: Mapped[Optional[int]] = mapped_column() chebi_accession: Mapped[str] = mapped_column(String(30), nullable=False) status: Mapped[str] = mapped_column(String(1), nullable=False) - definition: Mapped[str] = mapped_column(Text, nullable=True) + definition: Mapped[Optional[str]] = mapped_column(Text) star: Mapped[int] = mapped_column(nullable=False) - modified_on: Mapped[str] = mapped_column(Text, nullable=True) - created_by: Mapped[int] = mapped_column(Text, nullable=True) + modified_on: Mapped[Optional[str]] = mapped_column(Text) + created_by: Mapped[Optional[str]] = mapped_column(Text) chemicalData: 
Mapped[List["ChemicalData"]] = relationship("ChemicalData", back_populates="compounds") comments: Mapped[List["Comment"]] = relationship("Comment", back_populates="compounds") @@ -136,7 +135,7 @@ class DatabaseAccession(Base): __tablename__ = "chebi_database_accession" id: Mapped[int] = mapped_column(primary_key=True) - accession_number: Mapped[str] = mapped_column(String(255), nullable=True) + accession_number: Mapped[Optional[str]] = mapped_column(String(255)) type: Mapped[str] = mapped_column(Text, nullable=False) source: Mapped[str] = mapped_column(Text, nullable=False) @@ -161,7 +160,7 @@ class Name(Base): __tablename__ = "chebi_name" id: Mapped[int] = mapped_column(primary_key=True) - name: Mapped[str] = mapped_column(Text, nullable=True) + name: Mapped[Optional[str]] = mapped_column(Text) type: Mapped[str] = mapped_column(Text, nullable=False) source: Mapped[str] = mapped_column(Text, nullable=False) adapted: Mapped[str] = mapped_column(Text, nullable=False) @@ -193,8 +192,8 @@ class Reference(Base): reference_id: Mapped[str] = mapped_column(String(60), nullable=False, index=True) reference_db_name: Mapped[str] = mapped_column(String(60), nullable=False, index=True) - location_in_ref: Mapped[str] = mapped_column(String(90), nullable=True, index=True) - reference_name: Mapped[str] = mapped_column(String(1024), nullable=True) + location_in_ref: Mapped[Optional[str]] = mapped_column(String(90), index=True) + reference_name: Mapped[Optional[str]] = mapped_column(String(1024)) compound_id: Mapped[int] = mapped_column(ForeignKey("chebi_compound.id")) compounds: Mapped[List["Compound"]] = relationship("Compound", back_populates="references") diff --git a/ebel/manager/rdbms/models/clinical_trials_gov.py b/ebel/manager/rdbms/models/clinical_trials_gov.py index a94ff4f..c38cf31 100644 --- a/ebel/manager/rdbms/models/clinical_trials_gov.py +++ b/ebel/manager/rdbms/models/clinical_trials_gov.py @@ -1,6 +1,6 @@ """ClinicalTrials.gov RDBMS model definition.""" import re 
-from typing import List +from typing import List, Optional from sqlalchemy import ForeignKey, Integer, String, Table, Text, Column from sqlalchemy.ext.declarative import declarative_base @@ -86,24 +86,24 @@ class ClinicalTrialGov(Base): id: Mapped[int] = mapped_column(primary_key=True) nct_id = mapped_column(String(100), index=True) - org_study_id: Mapped[str] = mapped_column(Text) - brief_title: Mapped[str] = mapped_column(Text) - official_title: Mapped[str] = mapped_column(Text) - is_fda_regulated_drug: Mapped[str] = mapped_column(Text) - brief_summary: Mapped[str] = mapped_column(Text) - detailed_description: Mapped[str] = mapped_column(Text) - overall_status: Mapped[str] = mapped_column(Text) - start_date: Mapped[str] = mapped_column(Text) - completion_date: Mapped[str] = mapped_column(Text) - phase: Mapped[str] = mapped_column(Text) - study_type: Mapped[str] = mapped_column(Text) - study_design_intervention_model: Mapped[str] = mapped_column(Text) - study_design_primary_purpose: Mapped[str] = mapped_column(Text) - study_design_masking: Mapped[str] = mapped_column(Text) + org_study_id: Mapped[Optional[str]] = mapped_column(Text) + brief_title: Mapped[Optional[str]] = mapped_column(Text) + official_title: Mapped[Optional[str]] = mapped_column(Text) + is_fda_regulated_drug: Mapped[Optional[str]] = mapped_column(Text) + brief_summary: Mapped[Optional[str]] = mapped_column(Text) + detailed_description: Mapped[Optional[str]] = mapped_column(Text) + overall_status: Mapped[Optional[str]] = mapped_column(Text) + start_date: Mapped[Optional[str]] = mapped_column(Text) + completion_date: Mapped[Optional[str]] = mapped_column(Text) + phase: Mapped[Optional[str]] = mapped_column(Text) + study_type: Mapped[Optional[str]] = mapped_column(Text) + study_design_intervention_model: Mapped[Optional[str]] = mapped_column(Text) + study_design_primary_purpose: Mapped[Optional[str]] = mapped_column(Text) + study_design_masking: Mapped[Optional[str]] = mapped_column(Text) # 
primary_outcomes # secondary_outcomes - patient_data_sharing_ipd: Mapped[str] = mapped_column(Text) - patient_data_ipd_description: Mapped[str] = mapped_column(Text) + patient_data_sharing_ipd: Mapped[Optional[str]] = mapped_column(Text) + patient_data_ipd_description: Mapped[Optional[str]] = mapped_column(Text) keywords: Mapped[List["Keyword"]] = relationship( "Keyword", diff --git a/ebel/manager/rdbms/models/drugbank.py b/ebel/manager/rdbms/models/drugbank.py index c0f1ba6..8877527 100644 --- a/ebel/manager/rdbms/models/drugbank.py +++ b/ebel/manager/rdbms/models/drugbank.py @@ -35,9 +35,15 @@ class Drugbank(Base): references: Mapped[List["Reference"]] = relationship("Reference", back_populates="drugbank", cascade="save-update") synonyms: Mapped[List["Synonym"]] = relationship("Synonym", back_populates="drugbank", cascade="save-update") targets: Mapped[List["Target"]] = relationship("Target", back_populates="drugbank", cascade="save-update") - external_identifiers: Mapped[List["ExternalIdentifier"]] = relationship("ExternalIdentifier", back_populates="drugbank", cascade="save-update") - product_names: Mapped[List["ProductName"]] = relationship("ProductName", back_populates="drugbank", cascade="save-update") - drug_interactions: Mapped[List["DrugInteraction"]] = relationship("DrugInteraction", back_populates="drugbank", cascade="save-update") + external_identifiers: Mapped[List["ExternalIdentifier"]] = relationship( + "ExternalIdentifier", back_populates="drugbank", cascade="save-update" + ) + product_names: Mapped[List["ProductName"]] = relationship( + "ProductName", back_populates="drugbank", cascade="save-update" + ) + drug_interactions: Mapped[List["DrugInteraction"]] = relationship( + "DrugInteraction", back_populates="drugbank", cascade="save-update" + ) statuses: Mapped[List["Status"]] = relationship("Status", back_populates="drugbank", cascade="save-update") patents: Mapped[List["Patent"]] = relationship("Patent", back_populates="drugbank", 
cascade="save-update") pathways: Mapped[List["Pathway"]] = relationship("Pathway", back_populates="drugbank", cascade="save-update") diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index 26c5a20..56c0b64 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ b/ebel/manager/rdbms/models/hgnc.py @@ -2,8 +2,7 @@ import datetime from typing import List -from sqlalchemy import (BigInteger, Column, Date, ForeignKey, Integer, String, - Text) +from sqlalchemy import BigInteger, Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship, mapped_column, Mapped diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index cc1c1b8..11d2c4d 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -1,8 +1,7 @@ """IUPHAR RDBMS model definition.""" from typing import List -from sqlalchemy import (BigInteger, Boolean, Column, ForeignKey, Integer, - Numeric, String, Text) +from sqlalchemy import BigInteger, Boolean, Column, ForeignKey, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship, mapped_column, Mapped @@ -25,7 +24,9 @@ class IupharLigand(Base): labelled: Mapped[bool] = mapped_column() radioactive: Mapped[bool] = mapped_column() pubchem_sid: Mapped[int] = mapped_column() - pubchem_cid: Mapped[str] = mapped_column(Text) # TODO: This is a integer, but for import reasons this changed to text + pubchem_cid: Mapped[str] = mapped_column( + Text + ) # TODO: This is a integer, but for import reasons this changed to text uniprot_id: Mapped[str] = mapped_column(Text) ensembl_id: Mapped[str] = mapped_column(Text) ligand_subunit_ids: Mapped[str] = mapped_column(Text) diff --git a/ebel/manager/rdbms/models/nsides.py b/ebel/manager/rdbms/models/nsides.py index 2d92762..aceb587 100644 --- a/ebel/manager/rdbms/models/nsides.py +++ 
b/ebel/manager/rdbms/models/nsides.py @@ -24,7 +24,9 @@ class Nsides(Base): ) id: Mapped[int] = mapped_column(primary_key=True) - drug_rxnorn_id: Mapped[str] = mapped_column(String(20), index=True) # This has to be a String because of mapping to drugbank ids + drug_rxnorn_id: Mapped[str] = mapped_column( + String(20), index=True + ) # This has to be a String because of mapping to drugbank ids drug_concept_name: Mapped[str] = mapped_column(String(255), index=True) source: Mapped[str] = mapped_column(String(10), index=True) diff --git a/ebel/manager/rdbms/models/protein_atlas.py b/ebel/manager/rdbms/models/protein_atlas.py index a1a57e7..cce2936 100644 --- a/ebel/manager/rdbms/models/protein_atlas.py +++ b/ebel/manager/rdbms/models/protein_atlas.py @@ -1,7 +1,7 @@ """Protein Atlas RDBMS model definition.""" from sqlalchemy import Column, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column +from sqlalchemy.orm import mapped_column, Mapped Base = declarative_base() @@ -10,14 +10,14 @@ class ProteinAtlasNormalTissue(Base): """Class definition for the protein_atlas_normal_tissue table.""" __tablename__ = "protein_atlas_normal_tissue" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene = mapped_column(String(100), index=True) - gene_name = mapped_column(String(100)) - tissue = mapped_column(String(100)) - cell_type = mapped_column(String(100)) - level = mapped_column(String(100), index=True) - reliability = mapped_column(String(100), index=True) + gene: Mapped[str] = mapped_column(String(100), index=True) + gene_name: Mapped[str] = mapped_column(String(100)) + tissue: Mapped[str] = mapped_column(String(100)) + cell_type: Mapped[str] = mapped_column(String(100)) + level: Mapped[str] = mapped_column(String(100), index=True) + reliability: Mapped[str] = mapped_column(String(100), index=True) def as_dict(self): """Convert object values to 
dictionary.""" @@ -35,22 +35,22 @@ class ProteinAtlasSubcellularLocation(Base): """Class definition for the protein_atlas_subcellular_location table.""" __tablename__ = "protein_atlas_subcellular_location" - id = mapped_column(Integer, primary_key=True) - - gene = mapped_column(String(100)) - gene_name = mapped_column(String(100)) - reliability = mapped_column(String(100)) - main_location = mapped_column(String(100)) - additional_location = mapped_column(String(100)) - extracellular_location = mapped_column(String(100)) - enhanced = mapped_column(String(100)) - supported = mapped_column(String(100)) - approved = mapped_column(String(100)) - uncertain = mapped_column(String(100)) - single_cell_variation_intensity = mapped_column(String(100)) - single_cell_variation_spatial = mapped_column(String(100)) - cell_cycle_dependency = mapped_column(Text) - go_id = mapped_column(Text) + id: Mapped[int] = mapped_column(primary_key=True) + + gene: Mapped[str] = mapped_column(String(100)) + gene_name: Mapped[str] = mapped_column(String(100)) + reliability: Mapped[str] = mapped_column(String(100)) + main_location: Mapped[str] = mapped_column(String(100)) + additional_location: Mapped[str] = mapped_column(String(100)) + extracellular_location: Mapped[str] = mapped_column(String(100)) + enhanced: Mapped[str] = mapped_column(String(100)) + supported: Mapped[str] = mapped_column(String(100)) + approved: Mapped[str] = mapped_column(String(100)) + uncertain: Mapped[str] = mapped_column(String(100)) + single_cell_variation_intensity: Mapped[str] = mapped_column(String(100)) + single_cell_variation_spatial: Mapped[str] = mapped_column(String(100)) + cell_cycle_dependency: Mapped[str] = mapped_column(Text) + go_id: Mapped[str] = mapped_column(Text) def as_dict(self): """Convert object values to dictionary.""" @@ -76,12 +76,12 @@ class ProteinAtlasRnaTissueConsensus(Base): """Class definition for the protein_atlas_rna_tissue_consensus table.""" __tablename__ = 
"protein_atlas_rna_tissue_consensus" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene = mapped_column(String(100), index=True) - gene_name = mapped_column(String(100), index=True) - tissue = mapped_column(String(100), index=True) - n_tpm = mapped_column(Numeric(8, 1)) + gene: Mapped[str] = mapped_column(String(100), index=True) + gene_name: Mapped[str] = mapped_column(String(100), index=True) + tissue: Mapped[str] = mapped_column(String(100), index=True) + n_tpm: Mapped[float] = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" @@ -97,14 +97,14 @@ class ProteinAtlasRnaBrainGtex(Base): """Class definition for the protein_atlas_rna_brain_gtex table.""" __tablename__ = "protein_atlas_rna_brain_gtex" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene = mapped_column(String(100), index=True) - gene_name = mapped_column(String(100), index=True) - brain_region = mapped_column(String(100), index=True) - tpm = mapped_column(Numeric(8, 1)) - p_tpm = mapped_column(Numeric(8, 1)) - n_tpm = mapped_column(Numeric(8, 1)) + gene: Mapped[str] = mapped_column(String(100), index=True) + gene_name: Mapped[str] = mapped_column(String(100), index=True) + brain_region: Mapped[str] = mapped_column(String(100), index=True) + tpm: Mapped[float] = mapped_column(Numeric(8, 1)) + p_tpm: Mapped[float] = mapped_column(Numeric(8, 1)) + n_tpm: Mapped[float] = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" @@ -122,14 +122,14 @@ class ProteinAtlasRnaBrainFantom(Base): """Class definition for the protein_atlas_rna_brain_fantom table.""" __tablename__ = "protein_atlas_rna_brain_fantom" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene = mapped_column(String(100)) - gene_name = mapped_column(String(100)) - brain_region = mapped_column(String(100)) - 
tags_per_million = mapped_column(String(100)) - scaled_tags_per_million = mapped_column(String(100)) - n_tpm = mapped_column(String(100)) + gene: Mapped[str] = mapped_column(String(100)) + gene_name: Mapped[str] = mapped_column(String(100)) + brain_region: Mapped[str] = mapped_column(String(100)) + tags_per_million: Mapped[str] = mapped_column(String(100)) + scaled_tags_per_million: Mapped[str] = mapped_column(String(100)) + n_tpm: Mapped[str] = mapped_column(String(100)) def as_dict(self): """Convert object values to dictionary.""" @@ -147,12 +147,12 @@ class ProteinAtlasRnaMouseBrainAllen(Base): """Class definition for the protein_atlas_rna_mouse_brain_allen table.""" __tablename__ = "protein_atlas_rna_mouse_brain_allen" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - gene = mapped_column(String(100)) - gene_name = mapped_column(String(100)) - brain_region = mapped_column(String(100)) - expression_energy = mapped_column(Numeric(8, 1)) + gene: Mapped[str] = mapped_column(String(100)) + gene_name: Mapped[str] = mapped_column(String(100)) + brain_region: Mapped[str] = mapped_column(String(100)) + expression_energy: Mapped[float] = mapped_column(Numeric(8, 1)) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/reactome.py b/ebel/manager/rdbms/models/reactome.py index e986cb1..3852882 100644 --- a/ebel/manager/rdbms/models/reactome.py +++ b/ebel/manager/rdbms/models/reactome.py @@ -1,7 +1,7 @@ """Reactome RDBMS model definition.""" from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -12,12 +12,12 @@ class Reactome(Base): """Class definition for the reactome table.""" __tablename__ = "reactome" - id = mapped_column(Integer, primary_key=True) - identifier = 
mapped_column(String(50), index=True) - uniprot_accession = mapped_column(String(50), index=True) - organism = mapped_column(String(255)) - name = mapped_column(String(255)) - evidence_type = mapped_column(String(255)) + id: Mapped[int] = mapped_column(primary_key=True) + identifier: Mapped[str] = mapped_column(String(50), index=True) + uniprot_accession: Mapped[str] = mapped_column(String(50), index=True) + organism: Mapped[str] = mapped_column(String(255)) + name: Mapped[str] = mapped_column(String(255)) + evidence_type: Mapped[str] = mapped_column(String(255)) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/stringdb.py b/ebel/manager/rdbms/models/stringdb.py index df1fedf..c56b9f9 100644 --- a/ebel/manager/rdbms/models/stringdb.py +++ b/ebel/manager/rdbms/models/stringdb.py @@ -2,7 +2,7 @@ from sqlalchemy import Boolean, Column, Integer, SmallInteger, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column +from sqlalchemy.orm import mapped_column, Mapped from ebel.manager.rdbms.models import object_as_dict @@ -14,26 +14,26 @@ class StringDb(Base): __tablename__ = "stringdb" - id = mapped_column(Integer, primary_key=True) - - protein1 = mapped_column(String(50), nullable=False) - protein2 = mapped_column(String(50), nullable=False) - symbol1 = mapped_column(String(50), nullable=False, index=True) - symbol2 = mapped_column(String(50), nullable=False, index=True) - neighborhood = mapped_column(Integer) - neighborhood_transferred = mapped_column(SmallInteger) - fusion = mapped_column(SmallInteger) - cooccurence = mapped_column(SmallInteger) - homology = mapped_column(SmallInteger) - coexpression = mapped_column(SmallInteger) - coexpression_transferred = mapped_column(SmallInteger) - experiments = mapped_column(SmallInteger, index=True) - experiments_transferred = mapped_column(SmallInteger) - database = mapped_column(Integer) - database_transferred = 
mapped_column(SmallInteger) - textmining = mapped_column(SmallInteger) - textmining_transferred = mapped_column(SmallInteger) - combined_score = mapped_column(SmallInteger) + id: Mapped[int] = mapped_column(primary_key=True) + + protein1: Mapped[str] = mapped_column(String(50), nullable=False) + protein2: Mapped[str] = mapped_column(String(50), nullable=False) + symbol1: Mapped[str] = mapped_column(String(50), nullable=False, index=True) + symbol2: Mapped[str] = mapped_column(String(50), nullable=False, index=True) + neighborhood: Mapped[int] = mapped_column() + neighborhood_transferred: Mapped[int] = mapped_column(SmallInteger) + fusion: Mapped[int] = mapped_column(SmallInteger) + cooccurence: Mapped[int] = mapped_column(SmallInteger) + homology: Mapped[int] = mapped_column(SmallInteger) + coexpression: Mapped[int] = mapped_column(SmallInteger) + coexpression_transferred: Mapped[int] = mapped_column(SmallInteger) + experiments: Mapped[int] = mapped_column(SmallInteger, index=True) + experiments_transferred: Mapped[int] = mapped_column(SmallInteger) + database: Mapped[int] = mapped_column() + database_transferred: Mapped[int] = mapped_column(SmallInteger) + textmining: Mapped[int] = mapped_column(SmallInteger) + textmining_transferred: Mapped[int] = mapped_column(SmallInteger) + combined_score: Mapped[int] = mapped_column(SmallInteger) def as_dict(self): """Convert object values to dictionary.""" @@ -45,9 +45,9 @@ class StringDbProtein(Base): __tablename__ = "stringdb_protein" - id = mapped_column(Integer, primary_key=True) - string_protein_id = mapped_column(String(50), nullable=False, index=True) - preferred_name = mapped_column(String(50), nullable=False, index=True) + id: Mapped[int] = mapped_column(primary_key=True) + string_protein_id: Mapped[str] = mapped_column(String(50), nullable=False, index=True) + preferred_name: Mapped[str] = mapped_column(String(50), nullable=False, index=True) def as_dict(self): """Convert object values to dictionary.""" @@ -58,16 
+58,16 @@ class StringDbAction(Base): """Class definition for the stringdb_action table.""" __tablename__ = "stringdb_action" - id = mapped_column(Integer, primary_key=True) - item_id_a = mapped_column(String(50), nullable=False) - item_id_b = mapped_column(String(50), nullable=False) - symbol1 = mapped_column(String(50), nullable=False, index=True) - symbol2 = mapped_column(String(50), nullable=False, index=True) - mode = mapped_column(String(20), nullable=False, index=True) - action = mapped_column(String(20)) - is_directional = mapped_column(Boolean, nullable=False, index=True) - a_is_acting = mapped_column(Boolean, nullable=False, index=True) - score = mapped_column(SmallInteger) + id: Mapped[int] = mapped_column(primary_key=True) + item_id_a: Mapped[str] = mapped_column(String(50), nullable=False) + item_id_b: Mapped[str] = mapped_column(String(50), nullable=False) + symbol1: Mapped[str] = mapped_column(String(50), nullable=False, index=True) + symbol2: Mapped[str] = mapped_column(String(50), nullable=False, index=True) + mode: Mapped[str] = mapped_column(String(20), nullable=False, index=True) + action: Mapped[str] = mapped_column(String(20)) + is_directional: Mapped[bool] = mapped_column(Boolean, nullable=False, index=True) + a_is_acting: Mapped[bool] = mapped_column(Boolean, nullable=False, index=True) + score: Mapped[int] = mapped_column(SmallInteger) def as_dict(self): """Convert object values to dictionary.""" diff --git a/ebel/manager/rdbms/models/uniprot.py b/ebel/manager/rdbms/models/uniprot.py index 331ea82..e4cfd9a 100644 --- a/ebel/manager/rdbms/models/uniprot.py +++ b/ebel/manager/rdbms/models/uniprot.py @@ -1,9 +1,10 @@ """UniProt RDBMS model definition.""" from collections import defaultdict +from typing import List from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column +from sqlalchemy.orm import relationship, 
mapped_column, Mapped Base = declarative_base() @@ -45,29 +46,33 @@ class Uniprot(Base): __tablename__ = "uniprot" - id = mapped_column(Integer, primary_key=True) + id: Mapped[str] = mapped_column(primary_key=True) - accession = mapped_column(String(20), unique=True) - name = mapped_column(String(100), nullable=False, unique=True) - recommended_name = mapped_column(String(255), nullable=True) + accession: Mapped[str] = mapped_column(String(20), unique=True) + name: Mapped[str] = mapped_column(String(100), nullable=False, unique=True) + recommended_name: Mapped[str] = mapped_column(String(255), nullable=True) - taxid = mapped_column(Integer, ForeignKey("uniprot_organism.taxid"), nullable=False, index=True) - organism = relationship("Organism") + taxid: Mapped[int] = mapped_column(ForeignKey("uniprot_organism.taxid"), nullable=False, index=True) + organism: Mapped["Organism"] = relationship("Organism") - function_id = mapped_column(Integer, ForeignKey("uniprot_function.id"), nullable=True) - function = relationship("Function") + function_id: Mapped[int] = mapped_column(ForeignKey("uniprot_function.id"), nullable=True) + function: Mapped["Function"] = relationship("Function") - gene_names = relationship("Gene", back_populates="uniprot") + gene_names: Mapped[List["Gene"]] = relationship("Gene", back_populates="uniprot") - gene_symbol = relationship("GeneSymbol", uselist=False, back_populates="uniprot") + gene_symbol: Mapped["GeneSymbol"] = relationship("GeneSymbol", uselist=False, back_populates="uniprot") - keywords = relationship("Keyword", secondary=uniprot__uniprot_keyword, back_populates="uniprots") + keywords: Mapped[List["Keyword"]] = relationship( + "Keyword", secondary=uniprot__uniprot_keyword, back_populates="uniprots" + ) - hosts = relationship("Organism", secondary=uniprot__uniprot_host, back_populates="uniprots") + hosts: Mapped[List["Organism"]] = relationship( + "Organism", secondary=uniprot__uniprot_host, back_populates="uniprots" + ) - xrefs = 
relationship("Xref", secondary=uniprot__uniprot_xref, back_populates="uniprots") + xrefs: Mapped[List["Xref"]] = relationship("Xref", secondary=uniprot__uniprot_xref, back_populates="uniprots") - subcellular_locations = relationship( + subcellular_locations: Mapped[List["SubcellularLocation"]] = relationship( "SubcellularLocation", secondary=uniprot__uniprot_subcellular_location, back_populates="uniprots", @@ -103,10 +108,10 @@ class GeneSymbol(Base): """Class definition for the uniprot_gene_symbol table.""" __tablename__ = "uniprot_gene_symbol" - id = mapped_column(Integer, primary_key=True) - symbol = mapped_column(String(100), nullable=False, index=True) - uniprot_id = mapped_column(Integer, ForeignKey("uniprot.id")) - uniprot = relationship("Uniprot", back_populates="gene_symbol") + id: Mapped[int] = mapped_column(primary_key=True) + symbol: Mapped[str] = mapped_column(String(100), nullable=False, index=True) + uniprot_id: Mapped[int] = mapped_column(ForeignKey("uniprot.id")) + uniprot: Mapped[Uniprot] = relationship("Uniprot", back_populates="gene_symbol") def __repr__(self): """Define repr.""" @@ -117,10 +122,10 @@ class Gene(Base): """Class definition for the uniprot_gene table.""" __tablename__ = "uniprot_gene" - id = mapped_column(Integer, primary_key=True) - name = mapped_column(String(100), nullable=False, index=True) - uniprot_id = mapped_column(Integer, ForeignKey("uniprot.id")) - uniprot = relationship("Uniprot", back_populates="gene_names") + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(String(100), nullable=False, index=True) + uniprot_id: Mapped[int] = mapped_column(ForeignKey("uniprot.id")) + uniprot: Mapped[Uniprot] = relationship("Uniprot", back_populates="gene_names") class Keyword(Base): @@ -128,10 +133,12 @@ class Keyword(Base): __tablename__ = "uniprot_keyword" - keywordid = mapped_column(Integer, primary_key=True) - keyword_name = mapped_column(String(100), index=True) + keywordid: Mapped[int] = 
mapped_column(primary_key=True) + keyword_name: Mapped[str] = mapped_column(String(100), index=True) - uniprots = relationship("Uniprot", secondary=uniprot__uniprot_keyword, back_populates="keywords") + uniprots: Mapped[List[Uniprot]] = relationship( + "Uniprot", secondary=uniprot__uniprot_keyword, back_populates="keywords" + ) def __repr__(self): """Define repr.""" @@ -143,10 +150,10 @@ class Organism(Base): __tablename__ = "uniprot_organism" - taxid = mapped_column(Integer, primary_key=True) - scientific_name = mapped_column(String(255)) # TODO:Check if index=True with is possible + taxid: Mapped[int] = mapped_column(primary_key=True) + scientific_name: Mapped[str] = mapped_column(String(255)) # TODO:Check if index=True with is possible - uniprots = relationship("Uniprot", secondary=uniprot__uniprot_host, back_populates="hosts") + uniprots: Mapped[List[Uniprot]] = relationship("Uniprot", secondary=uniprot__uniprot_host, back_populates="hosts") class SubcellularLocation(Base): @@ -154,11 +161,10 @@ class SubcellularLocation(Base): __tablename__ = "uniprot_subcellular_location" - id = mapped_column(Integer, primary_key=True) - - name = mapped_column(String(100), index=True) + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(String(100), index=True) - uniprots = relationship( + uniprots: Mapped[List[Uniprot]] = relationship( "Uniprot", secondary=uniprot__uniprot_subcellular_location, back_populates="subcellular_locations", @@ -170,12 +176,12 @@ class Xref(Base): __tablename__ = "uniprot_xref" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - db = mapped_column(String(50), index=True) - identifier = mapped_column(String(100), index=True) + db: Mapped[str] = mapped_column(String(50), index=True) + identifier: Mapped[str] = mapped_column(String(100), index=True) - uniprots = relationship("Uniprot", secondary=uniprot__uniprot_xref, back_populates="xrefs") + uniprots: 
Mapped[List[Uniprot]] = relationship("Uniprot", secondary=uniprot__uniprot_xref, back_populates="xrefs") class Function(Base): @@ -183,8 +189,8 @@ class Function(Base): __tablename__ = "uniprot_function" - id = mapped_column(Integer, primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) - description = mapped_column(Text) + description: Mapped[str] = mapped_column(Text) - uniprots = relationship("Uniprot", back_populates="function") + uniprots: Mapped[List[Uniprot]] = relationship("Uniprot", back_populates="function") diff --git a/ebel/web/api/ebel/v1/bel.py b/ebel/web/api/ebel/v1/bel.py index 23c8d72..e14c76b 100644 --- a/ebel/web/api/ebel/v1/bel.py +++ b/ebel/web/api/ebel/v1/bel.py @@ -17,8 +17,7 @@ from graphviz import Digraph from ebel import Bel -from ebel.manager.orientdb.odb_structure import (get_columns, - get_node_view_labels) +from ebel.manager.orientdb.odb_structure import get_columns, get_node_view_labels from ebel.validate import validate_bel_file from ebel.web.api.ebel.v1 import DataType, OrientDbSqlOperator, _get_pagination diff --git a/ebel/web/api/ebel/v1/bel_against_expression.py b/ebel/web/api/ebel/v1/bel_against_expression.py index 7ee4e39..da7696f 100644 --- a/ebel/web/api/ebel/v1/bel_against_expression.py +++ b/ebel/web/api/ebel/v1/bel_against_expression.py @@ -8,8 +8,7 @@ from sqlalchemy.sql import func from ebel import Bel -from ebel.manager.rdbms.models.expression_atlas import (Experiment, FoldChange, - GroupComparison) +from ebel.manager.rdbms.models.expression_atlas import Experiment, FoldChange, GroupComparison from ebel.web.api.ebel.v1 import _get_pagination Relation = namedtuple( diff --git a/ebel/web/api/ebel/v1/biogrid.py b/ebel/web/api/ebel/v1/biogrid.py index ebc6563..25b43d3 100644 --- a/ebel/web/api/ebel/v1/biogrid.py +++ b/ebel/web/api/ebel/v1/biogrid.py @@ -9,9 +9,15 @@ from ebel import Bel from ebel.manager.orientdb.biodbs.biogrid import MODIFICATIONS -from ebel.manager.rdbms.models.biogrid import (Biogrid, 
ExperimentalSystem, - Interactor, Modification, - Publication, Source, Taxonomy) +from ebel.manager.rdbms.models.biogrid import ( + Biogrid, + ExperimentalSystem, + Interactor, + Modification, + Publication, + Source, + Taxonomy, +) from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data diff --git a/ebel/web/api/ebel/v1/clinical_trials_gov.py b/ebel/web/api/ebel/v1/clinical_trials_gov.py index 88d91d2..5332ab9 100644 --- a/ebel/web/api/ebel/v1/clinical_trials_gov.py +++ b/ebel/web/api/ebel/v1/clinical_trials_gov.py @@ -5,8 +5,7 @@ from ebel.manager.rdbms.models import clinical_trials_gov as ct from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_paginated_query_result, - _get_terms_from_model_starts_with) +from ebel.web.api.ebel.v1 import _get_paginated_query_result, _get_terms_from_model_starts_with def get_ct_by_nct_id(): diff --git a/ebel/web/api/ebel/v1/clinvar.py b/ebel/web/api/ebel/v1/clinvar.py index 46feeac..5502e7c 100644 --- a/ebel/web/api/ebel/v1/clinvar.py +++ b/ebel/web/api/ebel/v1/clinvar.py @@ -7,9 +7,12 @@ from ebel import Bel from ebel.manager.rdbms.models import clinvar from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_query_result, - _get_pagination, - _get_terms_from_model_starts_with) +from ebel.web.api.ebel.v1 import ( + _get_data, + _get_paginated_query_result, + _get_pagination, + _get_terms_from_model_starts_with, +) def get_clinvar(): diff --git a/ebel/web/api/ebel/v1/disgenet.py b/ebel/web/api/ebel/v1/disgenet.py index 71dbab4..62ac705 100644 --- a/ebel/web/api/ebel/v1/disgenet.py +++ b/ebel/web/api/ebel/v1/disgenet.py @@ -3,9 +3,11 @@ from ebel.manager.rdbms.models import disgenet from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_paginated_ebel_query_result, - _get_paginated_query_result, - _get_terms_from_model_starts_with) +from ebel.web.api.ebel.v1 import ( + _get_paginated_ebel_query_result, + _get_paginated_query_result, + 
_get_terms_from_model_starts_with, +) def get_sources(): diff --git a/ebel/web/api/ebel/v1/drugbank.py b/ebel/web/api/ebel/v1/drugbank.py index 6b55fa9..fd1205c 100644 --- a/ebel/web/api/ebel/v1/drugbank.py +++ b/ebel/web/api/ebel/v1/drugbank.py @@ -4,8 +4,7 @@ from ebel.manager.rdbms.models import drugbank from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, - _get_paginated_query_result) +from ebel.web.api.ebel.v1 import _get_data, _get_paginated_ebel_query_result, _get_paginated_query_result def get_by_id(): diff --git a/ebel/web/api/ebel/v1/expression_atlas.py b/ebel/web/api/ebel/v1/expression_atlas.py index 9302eaa..e1c9c23 100644 --- a/ebel/web/api/ebel/v1/expression_atlas.py +++ b/ebel/web/api/ebel/v1/expression_atlas.py @@ -8,9 +8,14 @@ from sqlalchemy import inspect from ebel import Bel -from ebel.manager.rdbms.models.expression_atlas import (Experiment, FoldChange, - GroupComparison, Gsea, - Idf, SdrfCondensed) +from ebel.manager.rdbms.models.expression_atlas import ( + Experiment, + FoldChange, + GroupComparison, + Gsea, + Idf, + SdrfCondensed, +) from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data diff --git a/ebel/web/api/ebel/v1/intact.py b/ebel/web/api/ebel/v1/intact.py index f625920..75d8d5d 100644 --- a/ebel/web/api/ebel/v1/intact.py +++ b/ebel/web/api/ebel/v1/intact.py @@ -5,8 +5,7 @@ from ebel.manager.orientdb.odb_structure import intact_edges from ebel.manager.rdbms.models.intact import Intact from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, - _get_paginated_query_result) +from ebel.web.api.ebel.v1 import _get_data, _get_paginated_ebel_query_result, _get_paginated_query_result def get_intact(): diff --git a/ebel/web/api/ebel/v1/kegg.py b/ebel/web/api/ebel/v1/kegg.py index e0bd768..f2be69b 100644 --- a/ebel/web/api/ebel/v1/kegg.py +++ b/ebel/web/api/ebel/v1/kegg.py @@ -5,8 +5,7 @@ from 
ebel.manager.rdbms.models.kegg import Kegg from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, - _get_paginated_query_result) +from ebel.web.api.ebel.v1 import _get_data, _get_paginated_ebel_query_result, _get_paginated_query_result def get_kegg(): diff --git a/ebel/web/api/ebel/v1/pathway_commons.py b/ebel/web/api/ebel/v1/pathway_commons.py index 3f5d1f4..6c68070 100644 --- a/ebel/web/api/ebel/v1/pathway_commons.py +++ b/ebel/web/api/ebel/v1/pathway_commons.py @@ -3,12 +3,14 @@ from flask import request from sqlalchemy import or_ -from ebel.manager.rdbms.models.pathway_commons import ( - PathwayCommons, PathwayName, Pmid, pathway_commons__pathway_name) +from ebel.manager.rdbms.models.pathway_commons import PathwayCommons, PathwayName, Pmid, pathway_commons__pathway_name from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, - _get_paginated_query_result, - _get_terms_from_model_starts_with) +from ebel.web.api.ebel.v1 import ( + _get_data, + _get_paginated_ebel_query_result, + _get_paginated_query_result, + _get_terms_from_model_starts_with, +) def get_pathway_commons(): diff --git a/ebel/web/api/ebel/v1/uniprot.py b/ebel/web/api/ebel/v1/uniprot.py index 0f231e5..51858c3 100644 --- a/ebel/web/api/ebel/v1/uniprot.py +++ b/ebel/web/api/ebel/v1/uniprot.py @@ -6,8 +6,7 @@ from ebel import Bel from ebel.manager.rdbms.models import uniprot from ebel.web.api import RDBMS -from ebel.web.api.ebel.v1 import (_get_paginated_query_result, - _get_terms_from_model_starts_with) +from ebel.web.api.ebel.v1 import _get_paginated_query_result, _get_terms_from_model_starts_with from . 
import add_query_filters From b61e876d4aad5e93290bfe9775e8db5b3edc395b Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 15 Sep 2023 13:56:27 +0200 Subject: [PATCH 03/58] fix: add optional to clinvar props --- ebel/manager/orientdb/biodbs/bel.py | 6 ------ ebel/manager/orientdb/biodbs/clinvar.py | 1 + ebel/manager/rdbms/models/clinvar.py | 4 ++-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/bel.py b/ebel/manager/orientdb/biodbs/bel.py index 126eab6..ed7241c 100644 --- a/ebel/manager/orientdb/biodbs/bel.py +++ b/ebel/manager/orientdb/biodbs/bel.py @@ -695,9 +695,3 @@ def insert_data(self) -> Dict[str, int]: def update_interactions(self) -> int: """Abstract method.""" pass - - -if __name__ == "__main__": - b = Bel() - b.clinical_trials.recreate_tables() - b.clinical_trials.update() diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index ef8f237..613ce5e 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -57,6 +57,7 @@ def __contains__(self, item): def insert_data(self) -> Dict[str, int]: """Insert data.""" inserted = {} + logger.info("Insert data for ClinVar") self.recreate_tables() df = pd.read_csv(self.file_path, sep="\t", low_memory=False) self._standardize_dataframe(df) diff --git a/ebel/manager/rdbms/models/clinvar.py b/ebel/manager/rdbms/models/clinvar.py index 3d91d16..e70a738 100644 --- a/ebel/manager/rdbms/models/clinvar.py +++ b/ebel/manager/rdbms/models/clinvar.py @@ -1,5 +1,5 @@ """ClinVar RDBMS model definition.""" -from typing import List +from typing import List, Optional from sqlalchemy import ForeignKey, Index, Integer, String, Table, Text, Column from sqlalchemy.ext.declarative import declarative_base @@ -58,7 +58,7 @@ class Clinvar(Base): hgnc_id: Mapped[str] = mapped_column(String(100)) clinical_significance: Mapped[str] = mapped_column(String(100)) clin_sig_simple: Mapped[int] = mapped_column() - 
last_evaluated: Mapped[str] = mapped_column(String(100)) + last_evaluated: Mapped[Optional[str]] = mapped_column(String(100)) rs_db_snp: Mapped[int] = mapped_column(index=True) nsv_esv_db_var: Mapped[str] = mapped_column(String(100)) rcvaccession: Mapped[str] = mapped_column(String(1000)) From f87311af3911f54df41c12f1c6181a1f4d4bc2eb Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 10:13:54 +0200 Subject: [PATCH 04/58] build: upgrade click version to work with flask deps --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 82b66a4..a0b9f80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ Documentation = 'https://ebel.readthedocs.io/en/latest/' [tool.poetry.dependencies] lark-parser = "^0.11.3" -click = "^7.1.2" +click = "^8.1.7" requests = "^2.31.0" tqdm = "^4.66.1" pandas = "^1.5.3" diff --git a/requirements.txt b/requirements.txt index 920280b..d128e4b 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ lark-parser==0.11.3 -click>=7.1.2 +click>=8.1.7 requests>=2.31.0 tqdm>=4.66.1 pandas>=1.5.3 From 05a42148e021f6b9dcfcfcd16d8f22c09121a64a Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 10:19:59 +0200 Subject: [PATCH 05/58] fix: disgenet URLs and merge method --- ebel/manager/orientdb/biodbs/disgenet.py | 25 ++++++++++++++++++------ ebel/manager/orientdb/urls.py | 4 ++-- ebel/manager/rdbms/models/disgenet.py | 6 +++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index d884350..149242c 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -4,10 +4,11 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, 
odb_structure, urls +from ebel.manager.orientdb.biodbs.ensembl import Ensembl from ebel.manager.orientdb.constants import DISGENET from ebel.manager.rdbms.models import disgenet from ebel.tools import get_disease_trait_keywords_from_config, get_file_path @@ -51,6 +52,12 @@ def __repr__(self) -> str: def insert_data(self) -> Dict[str, int]: """Insert data into database.""" logger.info(f"Import {self.biodb_name.upper()}") + + # Update EnSembl first since DisGeNet is dependent on it + ens = Ensembl() + ens.update() + + # Insert data inserted = dict() inserted["sources"] = self._insert_sources() inserted["gene_symbols"] = self._insert_gene_symbols() @@ -74,8 +81,8 @@ def file_path_variant(self): return self.__get_file_for_model(disgenet.DisgenetVariant) def _insert_sources(self): - df_g = pd.read_csv(self.file_path_gene, sep="\t", usecols=["source"]).drop_duplicates() - df_v = pd.read_csv(self.file_path_variant, sep="\t", usecols=["source"]).drop_duplicates() + df_g = pd.read_csv(self.file_path_gene, sep="\t", usecols=["source"]) + df_v = pd.read_csv(self.file_path_variant, sep="\t", usecols=["source"]) df = pd.concat([df_g, df_v]).drop_duplicates() df.reset_index(inplace=True, drop=True) df.index += 1 @@ -116,9 +123,10 @@ def _insert_gene_symbols(self) -> int: return df.shape[0] def _merge_with_source(self, df): - df_sources = pd.read_sql_table(disgenet.DisgenetSource.__tablename__, self.engine).rename( - columns={"id": "source_id"} - ) + with self.engine.connect() as conn: + stmt = select(disgenet.DisgenetSource) + df_sources = pd.read_sql(stmt, conn).rename(columns={"id": "source_id"}) + return pd.merge(df, df_sources, on="source").drop(columns=["source"]) def _insert_gene_disease_pmid_associations(self) -> int: @@ -245,3 +253,8 @@ def update_snps(self) -> int: inserted += 1 return inserted + + +if __name__ == "__main__": + dis = DisGeNet() + dis.update() diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index a6c31d2..4e1a06f 100755 --- 
a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -52,8 +52,8 @@ WIKIPATHWAYS = "http://data.wikipathways.org/20180710/gpml/wikipathways-20180710-gpml-Homo_sapiens.zip" # Ensembl # -ENSEMBL_FASTA_PEP = "ftp://ftp.ensembl.org/pub/release-94/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz" -ENSEMBL_CDS = "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz" +ENSEMBL_FASTA_PEP = "https://ftp.ensembl.org/pub/release-94/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz" +ENSEMBL_CDS = "https://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz" # SIDER # SIDER_ATC = "http://sideeffects.embl.de/media/download/drug_atc.tsv" diff --git a/ebel/manager/rdbms/models/disgenet.py b/ebel/manager/rdbms/models/disgenet.py index 3127fd4..9ff23fe 100644 --- a/ebel/manager/rdbms/models/disgenet.py +++ b/ebel/manager/rdbms/models/disgenet.py @@ -1,5 +1,5 @@ """DisGeNet RDBMS model definition.""" -from typing import List +from typing import List, Optional from sqlalchemy import BigInteger, Float, ForeignKey, Integer, String from sqlalchemy.ext.declarative import declarative_base @@ -23,7 +23,7 @@ class DisgenetGene(Base): disease_id: Mapped[str] = mapped_column(String(100), ForeignKey("disgenet_disease.disease_id")) disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) score: Mapped[float] = mapped_column() - pmid: Mapped[int] = mapped_column() + pmid: Mapped[Optional[int]] = mapped_column() source_id: Mapped[int] = mapped_column(ForeignKey("disgenet_source.id")) source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) @@ -68,7 +68,7 @@ class DisgenetVariant(Base): disease_id: Mapped[str] = mapped_column(String(100), ForeignKey("disgenet_disease.disease_id")) disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) score: Mapped[float] = mapped_column() - 
pmid: Mapped[int] = mapped_column(index=True) + pmid: Mapped[Optional[int]] = mapped_column(index=True) source_id: Mapped[int] = mapped_column(ForeignKey("disgenet_source.id")) source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) From 84da7eb26902e5e4211535bd3a7d989d704a5f3f Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 10:47:47 +0200 Subject: [PATCH 06/58] fix: disgenet nullable fields, now working --- ebel/manager/orientdb/biodbs/disgenet.py | 5 ----- ebel/manager/orientdb/odb_meta.py | 2 +- ebel/manager/rdbms/models/disgenet.py | 4 ++-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index 149242c..bb3259d 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -253,8 +253,3 @@ def update_snps(self) -> int: inserted += 1 return inserted - - -if __name__ == "__main__": - dis = DisGeNet() - dis.update() diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index db5cff7..8773bad 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -1346,7 +1346,7 @@ def get_set_gene_rids_by_position( for gene_type, sql in sqls.items(): if gene_type in gene_types: - results = self.session.execute(sql) + results = self.session.execute(text(sql)) for (symbol,) in results.fetchall(): bel = f'g(HGNC:"{symbol}")' data = { diff --git a/ebel/manager/rdbms/models/disgenet.py b/ebel/manager/rdbms/models/disgenet.py index 9ff23fe..3ee071c 100644 --- a/ebel/manager/rdbms/models/disgenet.py +++ b/ebel/manager/rdbms/models/disgenet.py @@ -24,7 +24,7 @@ class DisgenetGene(Base): disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) score: Mapped[float] = mapped_column() pmid: Mapped[Optional[int]] = mapped_column() - source_id: Mapped[int] = 
mapped_column(ForeignKey("disgenet_source.id")) + source_id: Mapped[Optional[int]] = mapped_column(ForeignKey("disgenet_source.id")) source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): @@ -69,7 +69,7 @@ class DisgenetVariant(Base): disease: Mapped["DisgenetDisease"] = relationship("DisgenetDisease", foreign_keys=[disease_id]) score: Mapped[float] = mapped_column() pmid: Mapped[Optional[int]] = mapped_column(index=True) - source_id: Mapped[int] = mapped_column(ForeignKey("disgenet_source.id")) + source_id: Mapped[Optional[int]] = mapped_column(ForeignKey("disgenet_source.id")) source: Mapped["DisgenetSource"] = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): From 02142f9248b687c810960d7929cf9db2ff07973a Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 11:22:52 +0200 Subject: [PATCH 07/58] fix: hgnc sqla2 import --- ebel/manager/orientdb/biodbs/hgnc.py | 105 +++++++++++++++++---------- ebel/manager/orientdb/urls.py | 6 +- ebel/manager/rdbms/models/hgnc.py | 51 ++++++------- 3 files changed, 96 insertions(+), 66 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/hgnc.py b/ebel/manager/orientdb/biodbs/hgnc.py index 8140544..13c7598 100644 --- a/ebel/manager/orientdb/biodbs/hgnc.py +++ b/ebel/manager/orientdb/biodbs/hgnc.py @@ -9,11 +9,32 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import HGNC from ebel.manager.rdbms.models import hgnc +from ebel.manager.rdbms.models.hgnc import ( + Hgnc as HgncDb, + PrevSymbol, + AliasSymbol, + AliasName, + Ccds, + Ena, + Enzyme, + GeneGroupName, + GeneGroupId, + UniProt, + RnaCentral, + Rgd, + RefSeq, + PubMed, + PrevName, + Omim, + Mgd, + Lsdb, +) from ebel.tools import get_file_path logger = logging.getLogger(__name__) @@ -117,28 +138,28 @@ def 
import_hgnc_into_rdbms(self) -> int: df["id"] = pd.to_numeric(df.hgnc_id.str.split(":").str[1]) df.set_index("id", inplace=True) - df[columns].to_sql(hgnc.Hgnc.__tablename__, self.engine, if_exists="append") + df[columns].to_sql(HgncDb.__tablename__, self.engine, if_exists="append") df.hgnc_id = pd.to_numeric(df.hgnc_id.str.split(":").str[1]) for df_col, model, m_col in ( - ("prev_symbol", hgnc.PrevSymbol, None), - ("alias_symbol", hgnc.AliasSymbol, None), - ("alias_name", hgnc.AliasName, None), - ("ccds_id", hgnc.Ccds, "identifier"), - ("ena", hgnc.Ena, "identifier"), - ("enzyme_id", hgnc.Enzyme, "ec_number"), - ("gene_group", hgnc.GeneGroupName, "name"), - ("gene_group_id", hgnc.GeneGroupId, "identifier"), - ("uniprot_ids", hgnc.UniProt, "accession"), - ("rna_central_id", hgnc.RnaCentral, "identifier"), - ("rgd_id", hgnc.Rgd, "identifier"), - ("refseq_accession", hgnc.RefSeq, "accession"), - ("pubmed_id", hgnc.PubMed, "pmid"), - ("prev_name", hgnc.PrevName, None), - ("omim_id", hgnc.Omim, "identifier"), - ("mgd_id", hgnc.Mgd, "identifier"), - ("lsdb", hgnc.Lsdb, "identifier"), + ("prev_symbol", PrevSymbol, None), + ("alias_symbol", AliasSymbol, None), + ("alias_name", AliasName, None), + ("ccds_id", Ccds, "identifier"), + ("ena", Ena, "identifier"), + ("enzyme_id", Enzyme, "ec_number"), + ("gene_group", GeneGroupName, "name"), + ("gene_group_id", GeneGroupId, "identifier"), + ("uniprot_ids", UniProt, "accession"), + ("rna_central_id", RnaCentral, "identifier"), + ("rgd_id", Rgd, "identifier"), + ("refseq_accession", RefSeq, "accession"), + ("pubmed_id", PubMed, "pmid"), + ("prev_name", PrevName, None), + ("omim_id", Omim, "identifier"), + ("mgd_id", Mgd, "identifier"), + ("lsdb", Lsdb, "identifier"), ): df_1n_table = df[[df_col, "hgnc_id"]].explode(df_col).dropna() if m_col: @@ -264,11 +285,10 @@ def get_bel_symbols_all(self): def get_correct_symbol(self, symbol: str): """Checks if symbol is valid otherwise checks previsous symbols.""" - result_in_symbol = 
self.session.query(hgnc.Hgnc).filter(hgnc.Hgnc.symbol == symbol).first() + symbol_query = select(HgncDb).where(HgncDb.symbol == symbol) + result_in_symbol = self.session.execute(symbol_query).first() if not result_in_symbol: - result_in_prev_symbol = ( - self.session.query(hgnc.PrevSymbol).filter(hgnc.PrevSymbol.prev_symbol == symbol).first() - ) + result_in_prev_symbol = self.session.query(PrevSymbol).filter(PrevSymbol.prev_symbol == symbol).first() if result_in_prev_symbol: symbol = result_in_prev_symbol.hgnc.symbol else: @@ -277,7 +297,7 @@ def get_correct_symbol(self, symbol: str): def correct_wrong_symbol(self, symbol, bel_symbols_all: set): """Corrects the symbol of the node and relinks all edges to existing node if needed.""" - result = self.session.query(hgnc.PrevSymbol).filter_by(prev_symbol=symbol).first() + result = self.session.query(PrevSymbol).filter_by(prev_symbol=symbol).first() if result: correct_symbol = result.hgnc.symbol if correct_symbol not in bel_symbols_all: @@ -300,7 +320,10 @@ def update_bel(self) -> int: bel_symbols_all = self.get_bel_symbols_all() symbols_without_hgnc = self.get_bel_symbols_without_hgnc_link() - hgnc_symbols = {x[0] for x in self.session.query(hgnc.Hgnc.symbol).all()} + + symbol_query = select(HgncDb.symbol) + symbol_results = self.session.execute(symbol_query).all() + hgnc_symbols = {x[0] for x in symbol_results} for wrong_symbol in symbols_without_hgnc - hgnc_symbols: self.correct_wrong_symbol(wrong_symbol, bel_symbols_all) @@ -377,28 +400,28 @@ def update_protein(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_ def update_nodes_by_symbol(self, symbol) -> dict: """Update all nodes by HGNC symbol.""" - hgnc = self.get_basic_entry_by_symbol(symbol) + hgnc_results = self.get_basic_entry_by_symbol(symbol) - if hgnc: - suggest = json.dumps(hgnc.suggested_corrections) if hgnc.suggested_corrections else None + if hgnc_results: + suggest = json.dumps(hgnc_results.suggested_corrections) if 
hgnc_results.suggested_corrections else None num_update_genes = self.update_gene( - hgnc_symbol=hgnc.symbol, - hgnc_rid=hgnc.hgnc_rid, - label=hgnc.label, - location=hgnc.location, + hgnc_symbol=hgnc_results.symbol, + hgnc_rid=hgnc_results.hgnc_rid, + label=hgnc_results.label, + location=hgnc_results.location, suggested_corrections=suggest, ) num_update_rnas = self.update_rna( - hgnc_symbol=hgnc.symbol, - hgnc_rid=hgnc.hgnc_rid, - label=hgnc.label, + hgnc_symbol=hgnc_results.symbol, + hgnc_rid=hgnc_results.hgnc_rid, + label=hgnc_results.label, suggested_corrections=suggest, ) num_update_proteins = self.update_protein( - hgnc_symbol=hgnc.symbol, - hgnc_rid=hgnc.hgnc_rid, - label=hgnc.label, + hgnc_symbol=hgnc_results.symbol, + hgnc_rid=hgnc_results.hgnc_rid, + label=hgnc_results.label, suggested_corrections=suggest, ) return { @@ -409,9 +432,15 @@ def update_nodes_by_symbol(self, symbol) -> dict: def get_symbol_entrez_dict(self) -> Dict[str, int]: """Return dictionary with gene symbols as keys and entrez IDs as values.""" - query = self.session.query(hgnc.Hgnc.symbol, hgnc.Hgnc.entrez_id).filter(hgnc.Hgnc.entrez_id.isnot(None)) + query = self.session.query(HgncDb.symbol, HgncDb.entrez_id).filter(HgncDb.entrez_id.isnot(None)) return {r.symbol: r.entrez_id for r in query.all()} def update_interactions(self) -> int: """Abstract method.""" pass + + +if __name__ == "__main__": + hgncdb = Hgnc() + hgncdb.recreate_tables() + hgncdb.update() diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index 4e1a06f..0e42a79 100755 --- a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -1,9 +1,9 @@ """URLs to download files.""" # HGNC # -HGNC_JSON = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json" -HGNC_TSV = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" -HCOP_GZIP = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz" +HGNC_JSON = 
"https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json" +HGNC_TSV = "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" +HCOP_GZIP = "https://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz" # UniProt # UNIPROT_SPROT = ( diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index 56c0b64..b356682 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ b/ebel/manager/rdbms/models/hgnc.py @@ -1,6 +1,6 @@ """HGNC RDBMS model definition.""" import datetime -from typing import List +from typing import List, Optional from sqlalchemy import BigInteger, Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base @@ -15,41 +15,42 @@ class Hgnc(Base): """Class definition for the hgnc table.""" __tablename__ = "hgnc" + id: Mapped[int] = mapped_column(primary_key=True) hgnc_id: Mapped[str] = mapped_column(String(20)) version: Mapped[int] = mapped_column() - bioparadigms_slc: Mapped[str] = mapped_column(String(20)) - cd: Mapped[str] = mapped_column(String(20)) - cosmic: Mapped[str] = mapped_column(String(50)) + bioparadigms_slc: Mapped[Optional[str]] = mapped_column(String(20)) + cd: Mapped[Optional[str]] = mapped_column(String(20)) + cosmic: Mapped[Optional[str]] = mapped_column(String(50)) date_approved_reserved: Mapped[datetime.date] = mapped_column(Date) - date_modified: Mapped[datetime.date] = mapped_column(Date) - date_name_changed: Mapped[datetime.date] = mapped_column(Date) - date_symbol_changed: Mapped[datetime.date] = mapped_column(Date) - ensembl_gene_id: Mapped[str] = mapped_column(String(20)) - entrez_id: Mapped[int] = mapped_column() - homeodb: Mapped[int] = mapped_column() - horde_id: Mapped[str] = mapped_column(String(50)) - imgt: Mapped[str] = mapped_column(String(50)) - iuphar: Mapped[str] = mapped_column(String(50)) + date_modified: Mapped[Optional[datetime.date]] = mapped_column(Date) + date_name_changed: 
Mapped[Optional[datetime.date]] = mapped_column(Date) + date_symbol_changed: Mapped[Optional[datetime.date]] = mapped_column(Date) + ensembl_gene_id: Mapped[Optional[str]] = mapped_column(String(20)) + entrez_id: Mapped[Optional[int]] = mapped_column() + homeodb: Mapped[Optional[int]] = mapped_column() + horde_id: Mapped[Optional[str]] = mapped_column(String(50)) + imgt: Mapped[Optional[str]] = mapped_column(String(50)) + iuphar: Mapped[Optional[str]] = mapped_column(String(50)) kznf_gene_catalog: Mapped[int] = mapped_column() - lncipedia: Mapped[str] = mapped_column(String(50)) - lncrnadb: Mapped[str] = mapped_column(String(50)) - location: Mapped[str] = mapped_column(String(100)) - location_sortable: Mapped[str] = mapped_column(String(100)) + lncipedia: Mapped[Optional[str]] = mapped_column(String(50)) + lncrnadb: Mapped[Optional[str]] = mapped_column(String(50)) + location: Mapped[Optional[str]] = mapped_column(String(100)) + location_sortable: Mapped[Optional[str]] = mapped_column(String(100)) locus_group: Mapped[str] = mapped_column(String(50)) locus_type: Mapped[str] = mapped_column(String(50)) - merops: Mapped[str] = mapped_column(String(20)) - mirbase: Mapped[str] = mapped_column(String(20)) + merops: Mapped[Optional[str]] = mapped_column(String(20)) + mirbase: Mapped[Optional[str]] = mapped_column(String(20)) name: Mapped[str] = mapped_column(String(255)) - orphanet: Mapped[int] = mapped_column() - snornabase: Mapped[str] = mapped_column(String(20)) + orphanet: Mapped[Optional[int]] = mapped_column() + snornabase: Mapped[Optional[str]] = mapped_column(String(20)) status: Mapped[str] = mapped_column(String(50)) symbol: Mapped[str] = mapped_column(String(100), index=True) - ucsc_id: Mapped[str] = mapped_column(String(50)) + ucsc_id: Mapped[Optional[str]] = mapped_column(String(50)) uuid: Mapped[str] = mapped_column(String(50)) - vega_id: Mapped[str] = mapped_column(String(50)) - agr: Mapped[str] = mapped_column(String(50)) - kznf_gene_catalog: Mapped[str] = 
mapped_column(Text) + vega_id: Mapped[Optional[str]] = mapped_column(String(50)) + agr: Mapped[Optional[str]] = mapped_column(String(50)) + kznf_gene_catalog: Mapped[Optional[str]] = mapped_column(Text) pre_symbols: Mapped[List["PrevSymbol"]] = relationship("PrevSymbol", back_populates="hgnc") alias_names: Mapped[List["AliasName"]] = relationship("AliasName", back_populates="hgnc") From fce52806351870471b46c99d81c96cda6470f813 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 11:39:41 +0200 Subject: [PATCH 08/58] feat: drugbank sqla2 import --- ebel/manager/orientdb/biodbs/hgnc.py | 6 ---- ebel/manager/rdbms/models/drugbank.py | 44 ++++++++++++++++----------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/hgnc.py b/ebel/manager/orientdb/biodbs/hgnc.py index 13c7598..91d1a65 100644 --- a/ebel/manager/orientdb/biodbs/hgnc.py +++ b/ebel/manager/orientdb/biodbs/hgnc.py @@ -438,9 +438,3 @@ def get_symbol_entrez_dict(self) -> Dict[str, int]: def update_interactions(self) -> int: """Abstract method.""" pass - - -if __name__ == "__main__": - hgncdb = Hgnc() - hgncdb.recreate_tables() - hgncdb.update() diff --git a/ebel/manager/rdbms/models/drugbank.py b/ebel/manager/rdbms/models/drugbank.py index 8877527..6cd9b52 100644 --- a/ebel/manager/rdbms/models/drugbank.py +++ b/ebel/manager/rdbms/models/drugbank.py @@ -1,6 +1,6 @@ """DrugBank RDBMS model definition.""" import datetime -from typing import List +from typing import List, Optional from sqlalchemy import Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base @@ -13,24 +13,25 @@ class Drugbank(Base): """Class definition for the drugbank table.""" __tablename__ = "drugbank" + id: Mapped[int] = mapped_column(primary_key=True) drugbank_id: Mapped[str] = mapped_column(String(10), index=True) name: Mapped[str] = mapped_column(String(255)) - description: Mapped[str] = mapped_column(Text) - cas_number: Mapped[str] = 
mapped_column(String(20)) - unii: Mapped[str] = mapped_column(String(20)) - state: Mapped[str] = mapped_column(String(20)) - indication: Mapped[str] = mapped_column(Text) - pharmacodynamics: Mapped[str] = mapped_column(Text) - toxicity: Mapped[str] = mapped_column(Text) - metabolism: Mapped[str] = mapped_column(Text) - absorption: Mapped[str] = mapped_column(Text) - half_life: Mapped[str] = mapped_column(Text) - route_of_elimination: Mapped[str] = mapped_column(Text) - volume_of_distribution: Mapped[str] = mapped_column(Text) - clearance: Mapped[str] = mapped_column(Text) - mechanism_of_action: Mapped[str] = mapped_column(Text) - fda_label: Mapped[str] = mapped_column(Text) + description: Mapped[Optional[str]] = mapped_column(Text) + cas_number: Mapped[Optional[str]] = mapped_column(String(20)) + unii: Mapped[Optional[str]] = mapped_column(String(20)) + state: Mapped[Optional[str]] = mapped_column(String(20)) + indication: Mapped[Optional[str]] = mapped_column(Text) + pharmacodynamics: Mapped[Optional[str]] = mapped_column(Text) + toxicity: Mapped[Optional[str]] = mapped_column(Text) + metabolism: Mapped[Optional[str]] = mapped_column(Text) + absorption: Mapped[Optional[str]] = mapped_column(Text) + half_life: Mapped[Optional[str]] = mapped_column(Text) + route_of_elimination: Mapped[Optional[str]] = mapped_column(Text) + volume_of_distribution: Mapped[Optional[str]] = mapped_column(Text) + clearance: Mapped[Optional[str]] = mapped_column(Text) + mechanism_of_action: Mapped[Optional[str]] = mapped_column(Text) + fda_label: Mapped[Optional[str]] = mapped_column(Text) references: Mapped[List["Reference"]] = relationship("Reference", back_populates="drugbank", cascade="save-update") synonyms: Mapped[List["Synonym"]] = relationship("Synonym", back_populates="drugbank", cascade="save-update") @@ -85,6 +86,7 @@ class Pathway(Base): """Class definition for the drugbank_pathway table.""" __tablename__ = "drugbank_pathway" + id: Mapped[int] = mapped_column(primary_key=True) 
smpdb_id: Mapped[str] = mapped_column(String(255)) @@ -103,6 +105,7 @@ class Patent(Base): """Class definition for the drugbank_patent table.""" __tablename__ = "drugbank_patent" + id: Mapped[int] = mapped_column(primary_key=True) number: Mapped[str] = mapped_column(String(255)) country: Mapped[str] = mapped_column(String(255)) @@ -132,6 +135,7 @@ class Status(Base): """Class definition for the drugbank_status table.""" __tablename__ = "drugbank_status" + id: Mapped[int] = mapped_column(primary_key=True) status: Mapped[str] = mapped_column(String(20), index=True) @@ -150,6 +154,7 @@ class ExternalIdentifier(Base): """Class definition for the drugbank_external_identifier table.""" __tablename__ = "drugbank_external_identifier" + id: Mapped[int] = mapped_column(primary_key=True) resource: Mapped[str] = mapped_column(String(255), index=True) identifier: Mapped[str] = mapped_column(String(255), index=True) @@ -173,6 +178,7 @@ class Reference(Base): """Class definition for the drugbank_reference table.""" __tablename__ = "drugbank_reference" + id: Mapped[int] = mapped_column(primary_key=True) pmid: Mapped[int] = mapped_column() @@ -191,9 +197,10 @@ class Target(Base): """Class definition for the drugbank_target table.""" __tablename__ = "drugbank_target" + id: Mapped[int] = mapped_column(primary_key=True) uniprot: Mapped[str] = mapped_column(String(20), index=True) - action: Mapped[str] = mapped_column(String(50), index=True) + action: Mapped[Optional[str]] = mapped_column(String(50), index=True) known_action: Mapped[str] = mapped_column(String(20), index=True) drugbank_id: Mapped[int] = mapped_column(ForeignKey("drugbank.id")) @@ -216,6 +223,7 @@ class DrugInteraction(Base): """Class definition for the drugbank_drug_interaction table.""" __tablename__ = "drugbank_drug_interaction" + id: Mapped[int] = mapped_column(primary_key=True) drugbank_id: Mapped[str] = mapped_column(String(10), index=True) name: Mapped[str] = mapped_column(Text) @@ -241,6 +249,7 @@ class 
ProductName(Base): """Class definition for the drugbank_product_name table.""" __tablename__ = "drugbank_product_name" + id: Mapped[int] = mapped_column(primary_key=True) name: Mapped[str] = mapped_column(Text) @@ -259,6 +268,7 @@ class Synonym(Base): """Class definition for the drugbank_synonym table.""" __tablename__ = "drugbank_synonym" + id: Mapped[int] = mapped_column(primary_key=True) synonym: Mapped[str] = mapped_column(Text) From dfe6bcd1097dc5defc961729fbcb9736298b7fb5 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 12:00:32 +0200 Subject: [PATCH 09/58] feat: gwas catalog sqla2 import --- ebel/manager/rdbms/models/gwas_catalog.py | 40 +++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ebel/manager/rdbms/models/gwas_catalog.py b/ebel/manager/rdbms/models/gwas_catalog.py index 6c2c9a9..c846ead 100644 --- a/ebel/manager/rdbms/models/gwas_catalog.py +++ b/ebel/manager/rdbms/models/gwas_catalog.py @@ -1,5 +1,5 @@ """GWAS Catalog RDBMS model definition.""" -from typing import List +from typing import List, Optional from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base @@ -23,29 +23,29 @@ class GwasCatalog(Base): link: Mapped[str] = mapped_column(String(255)) study: Mapped[str] = mapped_column(Text) disease_trait: Mapped[str] = mapped_column(String(255)) - initial_sample_size: Mapped[str] = mapped_column(Text) - replication_sample_size: Mapped[str] = mapped_column(Text) - region: Mapped[str] = mapped_column(String(50)) - chr_id: Mapped[str] = mapped_column(Text) - chr_pos: Mapped[str] = mapped_column(Text) - reported_gene_s: Mapped[str] = mapped_column(Text) - mapped_gene: Mapped[str] = mapped_column(Text) - upstream_gene_id: Mapped[str] = mapped_column(String(50)) - downstream_gene_id: Mapped[str] = mapped_column(String(50)) - upstream_gene_distance: Mapped[int] = mapped_column() - downstream_gene_distance: Mapped[int] = 
mapped_column() + initial_sample_size: Mapped[Optional[str]] = mapped_column(Text) + replication_sample_size: Mapped[Optional[str]] = mapped_column(Text) + region: Mapped[Optional[str]] = mapped_column(String(50)) + chr_id: Mapped[Optional[str]] = mapped_column(Text) + chr_pos: Mapped[Optional[str]] = mapped_column(Text) + reported_gene_s: Mapped[Optional[str]] = mapped_column(Text) + mapped_gene: Mapped[Optional[str]] = mapped_column(Text) + upstream_gene_id: Mapped[Optional[str]] = mapped_column(String(50)) + downstream_gene_id: Mapped[Optional[str]] = mapped_column(String(50)) + upstream_gene_distance: Mapped[Optional[int]] = mapped_column() + downstream_gene_distance: Mapped[Optional[int]] = mapped_column() strongest_snp_risk_allele: Mapped[str] = mapped_column(Text) snp: Mapped[str] = mapped_column(Text) - merged: Mapped[int] = mapped_column() - snp_id_current: Mapped[str] = mapped_column(Text) - context: Mapped[str] = mapped_column(Text) - intergenic: Mapped[int] = mapped_column() - risk_allele_frequency: Mapped[str] = mapped_column(Text) + merged: Mapped[Optional[int]] = mapped_column() + snp_id_current: Mapped[Optional[str]] = mapped_column(Text) + context: Mapped[Optional[str]] = mapped_column(Text) + intergenic: Mapped[Optional[int]] = mapped_column() + risk_allele_frequency: Mapped[Optional[str]] = mapped_column(Text) p_value: Mapped[float] = mapped_column() pvalue_mlog: Mapped[float] = mapped_column() - p_value_text: Mapped[str] = mapped_column(Text) - or_or_beta: Mapped[float] = mapped_column() - _95_ci_text: Mapped[str] = mapped_column(Text) + p_value_text: Mapped[Optional[str]] = mapped_column(Text) + or_or_beta: Mapped[Optional[float]] = mapped_column() + _95_ci_text: Mapped[Optional[str]] = mapped_column(Text) platform_snps_passing_qc: Mapped[str] = mapped_column(Text) cnv: Mapped[str] = mapped_column(Text) From c0baf7171110e0da911985af2b64bfd2c99685f3 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 12:19:34 +0200 Subject: 
[PATCH 10/58] feat: clinvar sqla2 import --- ebel/manager/rdbms/models/clinvar.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ebel/manager/rdbms/models/clinvar.py b/ebel/manager/rdbms/models/clinvar.py index e70a738..8f84af7 100644 --- a/ebel/manager/rdbms/models/clinvar.py +++ b/ebel/manager/rdbms/models/clinvar.py @@ -54,14 +54,14 @@ class Clinvar(Base): type: Mapped[str] = mapped_column(String(100)) name: Mapped[str] = mapped_column(String(1000)) gene_id: Mapped[int] = mapped_column(index=True) - gene_symbol: Mapped[str] = mapped_column(String(1000)) - hgnc_id: Mapped[str] = mapped_column(String(100)) + gene_symbol: Mapped[Optional[str]] = mapped_column(String(1000)) + hgnc_id: Mapped[Optional[str]] = mapped_column(String(100)) clinical_significance: Mapped[str] = mapped_column(String(100)) clin_sig_simple: Mapped[int] = mapped_column() last_evaluated: Mapped[Optional[str]] = mapped_column(String(100)) rs_db_snp: Mapped[int] = mapped_column(index=True) - nsv_esv_db_var: Mapped[str] = mapped_column(String(100)) - rcvaccession: Mapped[str] = mapped_column(String(1000)) + nsv_esv_db_var: Mapped[Optional[str]] = mapped_column(String(100)) + rcvaccession: Mapped[Optional[str]] = mapped_column(String(1000)) origin: Mapped[str] = mapped_column(Text) origin_simple: Mapped[str] = mapped_column(Text) assembly: Mapped[str] = mapped_column(String(100), index=True) @@ -69,18 +69,18 @@ class Clinvar(Base): chromosome: Mapped[str] = mapped_column(Text) start: Mapped[int] = mapped_column() stop: Mapped[int] = mapped_column() - reference_allele: Mapped[str] = mapped_column(Text) - alternate_allele: Mapped[str] = mapped_column(Text) - cytogenetic: Mapped[str] = mapped_column(Text) + reference_allele: Mapped[Optional[str]] = mapped_column(Text) + alternate_allele: Mapped[Optional[str]] = mapped_column(Text) + cytogenetic: Mapped[Optional[str]] = mapped_column(Text) review_status: Mapped[str] = mapped_column(Text) number_submitters: 
Mapped[int] = mapped_column() - guidelines: Mapped[str] = mapped_column(Text) + guidelines: Mapped[Optional[str]] = mapped_column(Text) tested_in_gtr: Mapped[str] = mapped_column(Text) submitter_categories: Mapped[int] = mapped_column() variation_id: Mapped[int] = mapped_column() position_vcf: Mapped[int] = mapped_column() - reference_allele_vcf: Mapped[str] = mapped_column(Text(100000)) - alternate_allele_vcf: Mapped[str] = mapped_column(Text(100000)) + reference_allele_vcf: Mapped[Optional[str]] = mapped_column(Text(100000)) + alternate_allele_vcf: Mapped[Optional[str]] = mapped_column(Text(100000)) phenotypeMedgens: Mapped[List["ClinvarPhenotypeMedgen"]] = relationship( "ClinvarPhenotypeMedgen", foreign_keys=[ClinvarPhenotypeMedgen.clinvar_id] From f8502aedb420b42946e503a724621779bb79cb54 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 14:16:13 +0200 Subject: [PATCH 11/58] feat: mirtarbase sqla2 import --- ebel/manager/rdbms/models/mirtarbase.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ebel/manager/rdbms/models/mirtarbase.py b/ebel/manager/rdbms/models/mirtarbase.py index 700543f..15df19a 100644 --- a/ebel/manager/rdbms/models/mirtarbase.py +++ b/ebel/manager/rdbms/models/mirtarbase.py @@ -1,4 +1,6 @@ """KEGG RDBMS model definition.""" +from typing import Optional + from sqlalchemy import Column, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import mapped_column, Mapped @@ -21,7 +23,7 @@ class Mirtarbase(Base): target_gene_entrez_id: Mapped[int] = mapped_column() species_target_gene: Mapped[str] = mapped_column(String(50), index=True) experiments: Mapped[str] = mapped_column(Text) - support_type: Mapped[str] = mapped_column(String(50), index=True) + support_type: Mapped[Optional[str]] = mapped_column(String(50), index=True) references_pmid: Mapped[int] = mapped_column() def as_dict(self): From 9bf20461b67cb31ea5810ecda98935ed7e872223 Mon Sep 17 00:00:00 2001 From: 
Bruce Schultz Date: Mon, 18 Sep 2023 15:17:31 +0200 Subject: [PATCH 12/58] feat: intact sqla2 import --- ebel/manager/orientdb/urls.py | 2 +- ebel/manager/rdbms/models/intact.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index 0e42a79..b5f3036 100755 --- a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -32,7 +32,7 @@ BIOGRID = ( "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.215/BIOGRID-ALL-4.4.215.tab3.zip" ) -INTACT = "ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip" +INTACT = "https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip" STITCH = "http://stitch.embl.de/download/protein_chemical.links.transfer.v5.0.tsv.gz" # String # diff --git a/ebel/manager/rdbms/models/intact.py b/ebel/manager/rdbms/models/intact.py index 7067f37..fbbb3d5 100644 --- a/ebel/manager/rdbms/models/intact.py +++ b/ebel/manager/rdbms/models/intact.py @@ -1,4 +1,6 @@ """IntAct RDBMS model definition.""" +from typing import Optional + from sqlalchemy import Column, Float, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import mapped_column, Mapped @@ -21,7 +23,7 @@ class Intact(Base): interaction_ids: Mapped[str] = mapped_column(Text) interaction_type: Mapped[str] = mapped_column(String(100), index=True) interaction_type_psimi_id: Mapped[int] = mapped_column() - pmid: Mapped[int] = mapped_column() + pmid: Mapped[Optional[int]] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" From 6b060d3916656be17445caf94508f62273256721 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 16:20:26 +0200 Subject: [PATCH 13/58] feat: pc sqla2 import --- .../orientdb/biodbs/pathway_commons.py | 30 ++++++++++++------- ebel/manager/rdbms/models/pathway_commons.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git 
a/ebel/manager/orientdb/biodbs/pathway_commons.py b/ebel/manager/orientdb/biodbs/pathway_commons.py index 1fcc12b..34e36a0 100644 --- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -5,6 +5,7 @@ import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import select from tqdm import tqdm from ebel.constants import RID @@ -104,13 +105,16 @@ def create_pmids_table(self, df): inplace=True, ) df_pmids.pmid = pd.to_numeric(df_pmids.pmid, errors="coerce") - df_pmids.to_sql( - pc.Pmid.__tablename__, - con=self.engine, - index=False, - if_exists="append", - chunksize=10000, - ) + df_pmids = df_pmids[df_pmids.pmid.notna()] + + with self.engine.connect() as conn: + df_pmids.to_sql( + pc.Pmid.__tablename__, + con=conn, + index=False, + if_exists="append", + chunksize=10000, + ) del df_pmids def create_joining_table_names(self, df, df_pc_names): @@ -232,9 +236,13 @@ def update_interactions(self) -> Dict[str, int]: for edge_type in edge_types: inserted[edge_type] = 0 - sql = f"""Select id, participant_a, participant_b from - pathway_commons where interaction_type='{edge_type}'""" - df_ppi_of = pd.read_sql(sql, self.engine) + sql = select(pc.PathwayCommons.id, pc.PathwayCommons.participant_a, pc.PathwayCommons.participant_b).where( + pc.PathwayCommons.interaction_type == edge_type + ) + + with self.engine.connect() as conn: + df_ppi_of = pd.read_sql(sql, conn) + df_join = ( df_ppi_of.set_index("participant_a") .join(df_all.set_index("symbol")) @@ -289,7 +297,7 @@ def update_interactions(self) -> Dict[str, int]: def get_pathway_pmids_sources(self, pc_id, pc_pathway_name_rid_dict) -> tuple: """Return all pathway, PMIDs, and their sources.""" - pc_obj = self.session.query(pc.PathwayCommons).get(pc_id) + pc_obj = self.session.get(pc.PathwayCommons, pc_id) sources = [x.source for x in pc_obj.sources] pmids = [x.pmid for x in pc_obj.pmids] pathways = [pc_pathway_name_rid_dict[x.name] for x in pc_obj.pathway_names] 
diff --git a/ebel/manager/rdbms/models/pathway_commons.py b/ebel/manager/rdbms/models/pathway_commons.py index fde22a0..a27b49f 100644 --- a/ebel/manager/rdbms/models/pathway_commons.py +++ b/ebel/manager/rdbms/models/pathway_commons.py @@ -1,5 +1,5 @@ """Pathway Commons RDBMS model definition.""" -from typing import List +from typing import List, Optional from sqlalchemy import BigInteger, Column, ForeignKey, Integer, String, Table from sqlalchemy.ext.declarative import declarative_base From 6ab734f9b3e294711629b5ab9c82e334530d325b Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 16:37:31 +0200 Subject: [PATCH 14/58] feat: stringdb sqla2 import --- ebel/manager/orientdb/biodbs/stringdb.py | 19 +++++++++++-------- ebel/manager/rdbms/models/stringdb.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 27272e9..b5c6774 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select, or_ from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -280,13 +280,10 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: ("inhibition", "inhibition"): "inhibits_st", } + sdbaction = stringdb.StringDbAction Action = namedtuple("Action", ("symbol1", "symbol2", "mode", "action", "score")) - columns = ", ".join(Action._fields) - sql_temp = f"""Select {columns} from {self.table_action} - where mode in ('activation', 'inhibition', 'ptmod', 'expression') - and (symbol1='{{symbol}}' or symbol2='{{symbol}}') - and is_directional=1 and a_is_acting=1""" + modes = ("activation", "inhibition", "ptmod", "expression") symbols_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") symbols = tuple(symbols_rid_dict.keys()) @@ 
-295,8 +292,14 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: updated = 0 for symbol in tqdm(symbols, desc="Update has_action_st edges"): - sql = sql_temp.format(symbol=symbol) - rows = self.engine.execute(text(sql)) + sql = ( + select(sdbaction.symbol1, sdbaction.symbol2, sdbaction.mode, sdbaction.action, sdbaction.score) + .where(sdbaction.mode.in_(modes)) + .where(or_(sdbaction.symbol1 == symbol, sdbaction.symbol2 == symbol)) + .where(sdbaction.is_directional == 1) + .where(sdbaction.a_is_acting == 1) + ) + rows = self.session.execute(sql) for row in rows.fetchall(): action = Action(*row) diff --git a/ebel/manager/rdbms/models/stringdb.py b/ebel/manager/rdbms/models/stringdb.py index c56b9f9..47eccac 100644 --- a/ebel/manager/rdbms/models/stringdb.py +++ b/ebel/manager/rdbms/models/stringdb.py @@ -1,4 +1,5 @@ """StringDB RDBMS model definition.""" +from typing import Optional from sqlalchemy import Boolean, Column, Integer, SmallInteger, String from sqlalchemy.ext.declarative import declarative_base @@ -64,7 +65,7 @@ class StringDbAction(Base): symbol1: Mapped[str] = mapped_column(String(50), nullable=False, index=True) symbol2: Mapped[str] = mapped_column(String(50), nullable=False, index=True) mode: Mapped[str] = mapped_column(String(20), nullable=False, index=True) - action: Mapped[str] = mapped_column(String(20)) + action: Mapped[Optional[str]] = mapped_column(String(20)) is_directional: Mapped[bool] = mapped_column(Boolean, nullable=False, index=True) a_is_acting: Mapped[bool] = mapped_column(Boolean, nullable=False, index=True) score: Mapped[int] = mapped_column(SmallInteger) From 0dc24ef1d7f7674075342eeebc5436863c252507 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 18 Sep 2023 16:43:57 +0200 Subject: [PATCH 15/58] feat: protein atlas sqla2 import --- ebel/manager/orientdb/biodbs/protein_atlas.py | 4 ++- ebel/manager/rdbms/models/protein_atlas.py | 28 ++++++++++--------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git 
a/ebel/manager/orientdb/biodbs/protein_atlas.py b/ebel/manager/orientdb/biodbs/protein_atlas.py index 53481b5..efd77eb 100644 --- a/ebel/manager/orientdb/biodbs/protein_atlas.py +++ b/ebel/manager/orientdb/biodbs/protein_atlas.py @@ -170,7 +170,9 @@ def update_interactions(self) -> int: location_rid_cache = {x["bel"]: x["rid"] for x in self.query_class("location", columns=["bel"])} - for ensembl_gene_id, data in tqdm(rid_ensembl_gene_ids.items()): + for ensembl_gene_id, data in tqdm( + rid_ensembl_gene_ids.items(), desc=f"Update {self.biodb_name.upper()} interactions" + ): ns_location = "PROTEIN_ATLAS" pure_protein = data.oRecordData ns = pure_protein["namespace"] diff --git a/ebel/manager/rdbms/models/protein_atlas.py b/ebel/manager/rdbms/models/protein_atlas.py index cce2936..07ef8fc 100644 --- a/ebel/manager/rdbms/models/protein_atlas.py +++ b/ebel/manager/rdbms/models/protein_atlas.py @@ -1,4 +1,6 @@ """Protein Atlas RDBMS model definition.""" +from typing import Optional + from sqlalchemy import Column, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import mapped_column, Mapped @@ -14,9 +16,9 @@ class ProteinAtlasNormalTissue(Base): gene: Mapped[str] = mapped_column(String(100), index=True) gene_name: Mapped[str] = mapped_column(String(100)) - tissue: Mapped[str] = mapped_column(String(100)) - cell_type: Mapped[str] = mapped_column(String(100)) - level: Mapped[str] = mapped_column(String(100), index=True) + tissue: Mapped[Optional[str]] = mapped_column(String(100)) + cell_type: Mapped[Optional[str]] = mapped_column(String(100)) + level: Mapped[Optional[str]] = mapped_column(String(100), index=True) reliability: Mapped[str] = mapped_column(String(100), index=True) def as_dict(self): @@ -40,16 +42,16 @@ class ProteinAtlasSubcellularLocation(Base): gene: Mapped[str] = mapped_column(String(100)) gene_name: Mapped[str] = mapped_column(String(100)) reliability: Mapped[str] = mapped_column(String(100)) - 
main_location: Mapped[str] = mapped_column(String(100)) - additional_location: Mapped[str] = mapped_column(String(100)) - extracellular_location: Mapped[str] = mapped_column(String(100)) - enhanced: Mapped[str] = mapped_column(String(100)) - supported: Mapped[str] = mapped_column(String(100)) - approved: Mapped[str] = mapped_column(String(100)) - uncertain: Mapped[str] = mapped_column(String(100)) - single_cell_variation_intensity: Mapped[str] = mapped_column(String(100)) - single_cell_variation_spatial: Mapped[str] = mapped_column(String(100)) - cell_cycle_dependency: Mapped[str] = mapped_column(Text) + main_location: Mapped[Optional[str]] = mapped_column(String(100)) + additional_location: Mapped[Optional[str]] = mapped_column(String(100)) + extracellular_location: Mapped[Optional[str]] = mapped_column(String(100)) + enhanced: Mapped[Optional[str]] = mapped_column(String(100)) + supported: Mapped[Optional[str]] = mapped_column(String(100)) + approved: Mapped[Optional[str]] = mapped_column(String(100)) + uncertain: Mapped[Optional[str]] = mapped_column(String(100)) + single_cell_variation_intensity: Mapped[Optional[str]] = mapped_column(String(100)) + single_cell_variation_spatial: Mapped[Optional[str]] = mapped_column(String(100)) + cell_cycle_dependency: Mapped[Optional[str]] = mapped_column(Text) go_id: Mapped[str] = mapped_column(Text) def as_dict(self): From fb2c89e46d5573859725fd68e90e02668d2346d9 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 20 Sep 2023 15:34:55 +0200 Subject: [PATCH 16/58] feat: get ncbi and pc working --- ebel/constants.py | 30 +++++++++---------- ebel/defaults.py | 12 ++++---- .../orientdb/biodbs/pathway_commons.py | 7 +++-- ebel/manager/orientdb/biodbs/uniprot.py | 18 ++++++----- ebel/manager/rdbms/models/hgnc.py | 4 +-- ebel/manager/rdbms/models/ncbi.py | 10 +++---- ebel/manager/rdbms/models/uniprot.py | 2 +- 7 files changed, 44 insertions(+), 39 deletions(-) diff --git a/ebel/constants.py b/ebel/constants.py index 
4d8c28b..86be228 100755 --- a/ebel/constants.py +++ b/ebel/constants.py @@ -2,38 +2,36 @@ # -*- coding: utf-8 -*- import os +from pathlib import Path -THIS_DIR = os.path.dirname(__file__) +THIS_DIR = Path(__file__) PROJECT_NAME = "ebel" -HOME = os.path.expanduser("~") +HOME = Path.home() LIBRARY_NAME = PROJECT_NAME # Path to folder -PROJECT_DIR = os.path.join(HOME, f".{PROJECT_NAME}") -if not os.path.exists(PROJECT_DIR): - os.mkdir(PROJECT_DIR) +PROJECT_DIR = Path(HOME, f".{PROJECT_NAME}") +PROJECT_DIR.mkdir(parents=True, exist_ok=True) # Path to data folder -DATA_DIR = os.path.join(PROJECT_DIR, "data") -if not os.path.exists(DATA_DIR): - os.mkdir(DATA_DIR) +DATA_DIR = Path(PROJECT_DIR, "data") +DATA_DIR.mkdir(parents=True, exist_ok=True) # Path to logs folder -LOG_DIR = os.path.join(PROJECT_DIR, "logs") -if not os.path.exists(LOG_DIR): - os.mkdir(LOG_DIR) +LOG_DIR = Path(PROJECT_DIR, "logs") +LOG_DIR.mkdir(parents=True, exist_ok=True) # Default database name and location -DB_NAME = "{}.db".format(PROJECT_NAME) -DB_PATH = os.path.join(DATA_DIR, DB_NAME) +DB_NAME = f"{PROJECT_NAME}.db" +DB_PATH = Path(DATA_DIR, DB_NAME) GRAMMAR_BEL_PATH = { - "2": os.path.join(THIS_DIR, "grammar", "grammar_bel_2.bnf"), - "2_1": os.path.join(THIS_DIR, "grammar", "grammar_bel_2_1.bnf"), + "2": THIS_DIR.joinpath("grammar", "grammar_bel_2.bnf"), + "2_1": THIS_DIR.joinpath("grammar", "grammar_bel_2_1.bnf"), } -GRAMMAR_NS_ANNO_PATH = os.path.join(THIS_DIR, "grammar", "grammar_belns_belanno_1__2.bnf") +GRAMMAR_NS_ANNO_PATH = THIS_DIR.joinpath("grammar", "grammar_belns_belanno_1__2.bnf") GRAMMAR_START_NS = "belns" GRAMMAR_START_ANNO = "belanno" GRAMMAR_START_LINE = "script_line_by_line" diff --git a/ebel/defaults.py b/ebel/defaults.py index 566da81..3ff9bb7 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -24,16 +24,16 @@ SQLITE_DATABASE_NAME = "ebel.db" SQLITE_TEST_DATABASE_NAME = "ebel_test.db" -DATABASE_LOCATION = os.path.join(DATA_DIR, SQLITE_DATABASE_NAME) 
-DEFAULT_TEST_DATABASE_LOCATION = os.path.join(DATA_DIR, SQLITE_TEST_DATABASE_NAME) +DATABASE_LOCATION = DATA_DIR.joinpath(SQLITE_DATABASE_NAME) +DEFAULT_TEST_DATABASE_LOCATION = DATA_DIR.joinpath(SQLITE_TEST_DATABASE_NAME) ############################################################################### # SQLAlchemy connection strings # ============================= # SQLite # ------ -CONN_STR_DEFAULT = "sqlite:///" + DATABASE_LOCATION -CONN_STR_TESTS = "sqlite:///" + SQLITE_TEST_DATABASE_NAME +CONN_STR_DEFAULT = "sqlite:///" + DATABASE_LOCATION.name +CONN_STR_TESTS = "sqlite:///" + DEFAULT_TEST_DATABASE_LOCATION.name # MySQL # ----- CONN_STR_MYSQL_PREFIX = "mysql+pymysql://ebel:ebel@localhost/" @@ -42,12 +42,12 @@ ############################################################################### # Config -config_file_path = os.path.join(PROJECT_DIR, "config.ini") +config_file_path = PROJECT_DIR.joinpath("config.ini") ############################################################################### # Log Handling logHandler = handlers.RotatingFileHandler( - filename=os.path.join(LOG_DIR, "ebel.log"), + filename=LOG_DIR.joinpath("ebel.log"), mode="a", maxBytes=4098 * 10, # 4MB file max backupCount=0, diff --git a/ebel/manager/orientdb/biodbs/pathway_commons.py b/ebel/manager/orientdb/biodbs/pathway_commons.py index 34e36a0..88a23fe 100644 --- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -63,12 +63,12 @@ def insert_data(self) -> Dict[str, int]: "INTERACTION_PUBMED_ID", "PATHWAY_NAMES", ] - df = pd.read_csv(self.file_path, sep="\t", low_memory=True, usecols=usecols) # Because 2 tables are in file, we have to identify where second table starts and slice the dataframe df = df.iloc[: df[df["PARTICIPANT_A"] == "PARTICIPANT"].index[0]] df.columns = self._standardize_column_names(df.columns) + df.pathway_names = df.pathway_names.str.split(";") df.interaction_data_source = df.interaction_data_source.str.split(";") 
df.interaction_pubmed_id = df.interaction_pubmed_id.str.split(";") @@ -104,7 +104,7 @@ def create_pmids_table(self, df): columns={"id": "pathway_commons_id", "interaction_pubmed_id": "pmid"}, inplace=True, ) - df_pmids.pmid = pd.to_numeric(df_pmids.pmid, errors="coerce") + df_pmids.pmid = pd.to_numeric(df_pmids.pmid, errors="coerce", downcast="integer") df_pmids = df_pmids[df_pmids.pmid.notna()] with self.engine.connect() as conn: @@ -216,6 +216,9 @@ def update_interactions(self) -> Dict[str, int]: inserted = {} pc_pathway_name_rid_dict = self.get_pathway_name_rid_dict() + + # Update HGNC in case not in DB + self.hgnc.update() valid_hgnc_symbols = {x[0] for x in self.session.query(hgnc.Hgnc).with_entities(hgnc.Hgnc.symbol).all()} cols = ["symbol", "rid"] diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index 4e4c131..1bdb0be 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -4,6 +4,7 @@ import os import re from collections import namedtuple +from pathlib import Path from typing import Dict, List, Tuple, Union import pandas as pd @@ -479,10 +480,13 @@ def insert_uniprot(self) -> int: logger.info("Drop and create Uniprot table in RDBMS") logger.info("Insert data linked to Uniprot entry into RDBMS") - # avoid to use old gunzipped file - if os.path.exists(self.file_path_gunzipped): - os.remove(self.file_path_gunzipped) - if not os.path.exists(self.file_path_gunzipped): + + gunzipped_file = Path(self.file_path_gunzipped) + # Remove previous gunzipped file if present + if gunzipped_file.is_file(): + gunzipped_file.unlink() + + if not gunzipped_file.is_file(): # Gunzip compressed uniprot file gunzip(self.file_path, self.file_path_gunzipped) ( @@ -496,9 +500,9 @@ def insert_uniprot(self) -> int: self.__insert_linked_data(keywords, hosts, xrefs, functions, sclocations) inserted = self.__insert_uniprot_data(xrefs, functions, sclocations, number_of_entries) - # save storage space 
- if os.path.exists(self.file_path_gunzipped): - os.remove(self.file_path_gunzipped) + # save storage space by deleting uncompressed XML file + if gunzipped_file.is_file(): + gunzipped_file.unlink() # return number_of_entries return inserted diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index b356682..6f6f80a 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ b/ebel/manager/rdbms/models/hgnc.py @@ -2,7 +2,7 @@ import datetime from typing import List, Optional -from sqlalchemy import BigInteger, Column, Date, ForeignKey, Integer, String, Text +from sqlalchemy import BigInteger, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship, mapped_column, Mapped @@ -18,7 +18,7 @@ class Hgnc(Base): id: Mapped[int] = mapped_column(primary_key=True) hgnc_id: Mapped[str] = mapped_column(String(20)) - version: Mapped[int] = mapped_column() + version: Mapped[int] = mapped_column(BigInteger) bioparadigms_slc: Mapped[Optional[str]] = mapped_column(String(20)) cd: Mapped[Optional[str]] = mapped_column(String(20)) cosmic: Mapped[Optional[str]] = mapped_column(String(50)) diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index ade3a7b..3c83f6b 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -1,5 +1,5 @@ """NCBI RDBMS model definition.""" -from typing import List +from typing import List, Optional from sqlalchemy import Column, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base @@ -19,10 +19,10 @@ class NcbiGeneInfo(Base): tax_id: Mapped[int] = mapped_column(index=True) symbol: Mapped[str] = mapped_column(String(100), index=True) type_of_gene: Mapped[str] = mapped_column(String(100), index=True) - locus_tag: Mapped[str] = mapped_column(String(100)) - chromosome: Mapped[str] = mapped_column(String(100)) - map_location: Mapped[str] = mapped_column(String(100)) 
- description_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info_description.id")) + locus_tag: Mapped[Optional[str]] = mapped_column(String(100)) + chromosome: Mapped[Optional[str]] = mapped_column(String(100)) + map_location: Mapped[Optional[str]] = mapped_column(String(100)) + description_id: Mapped[Optional[int]] = mapped_column(ForeignKey("ncbi_gene_info_description.id")) description: Mapped["NcbiGeneInfoDescription"] = relationship( "NcbiGeneInfoDescription", foreign_keys=[description_id] ) diff --git a/ebel/manager/rdbms/models/uniprot.py b/ebel/manager/rdbms/models/uniprot.py index e4cfd9a..a63484a 100644 --- a/ebel/manager/rdbms/models/uniprot.py +++ b/ebel/manager/rdbms/models/uniprot.py @@ -46,7 +46,7 @@ class Uniprot(Base): __tablename__ = "uniprot" - id: Mapped[str] = mapped_column(primary_key=True) + id: Mapped[int] = mapped_column(primary_key=True) accession: Mapped[str] = mapped_column(String(20), unique=True) name: Mapped[str] = mapped_column(String(100), nullable=False, unique=True) From 608c4b641880cce463f3e3a8b852dafc3fc25b1d Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 20 Sep 2023 15:36:07 +0200 Subject: [PATCH 17/58] fix: ncbi constraint --- ebel/manager/rdbms/models/ncbi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index 3c83f6b..a26ac3a 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -17,8 +17,8 @@ class NcbiGeneInfo(Base): gene_id: Mapped[int] = mapped_column(primary_key=True) tax_id: Mapped[int] = mapped_column(index=True) - symbol: Mapped[str] = mapped_column(String(100), index=True) - type_of_gene: Mapped[str] = mapped_column(String(100), index=True) + symbol: Mapped[Optional[str]] = mapped_column(String(100), index=True) + type_of_gene: Mapped[Optional[str]] = mapped_column(String(100), index=True) locus_tag: Mapped[Optional[str]] = mapped_column(String(100)) chromosome: 
Mapped[Optional[str]] = mapped_column(String(100)) map_location: Mapped[Optional[str]] = mapped_column(String(100)) From d4f66702079e301fdfe5cf4dabce43dc65989c63 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 09:01:21 +0200 Subject: [PATCH 18/58] feat: update iuphar to sqla 2 --- ebel/manager/orientdb/biodbs/iuphar.py | 44 ++++++++-- ebel/manager/orientdb/urls.py | 2 +- ebel/manager/rdbms/models/iuphar.py | 117 +++++++++++++------------ 3 files changed, 97 insertions(+), 66 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/iuphar.py b/ebel/manager/orientdb/biodbs/iuphar.py index 5f03d50..71e1476 100644 --- a/ebel/manager/orientdb/biodbs/iuphar.py +++ b/ebel/manager/orientdb/biodbs/iuphar.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB +from sqlalchemy import select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -132,13 +133,30 @@ def update_interactions(self) -> int: "Gating inhibitor": "inhibits_gating__iu", } - sql = """select i.pubmed_id, i.assay_description, i.affinity_units, i.affinity_low, i.affinity_median, - i.affinity_high, i.type, - i.action,i.target_uniprot, l.name as ligand_name, l.pubchem_sid, i.ligand_gene_symbol, i.ligand_species - from iuphar_interaction as i inner join iuphar_ligand as l - on (i.ligand_id=l.id) where i.target_uniprot IS NOT NULL and pubchem_sid IS NOT NULL""" + i_int = iuphar.IupharInteraction + lig = iuphar.IupharLigand + sql = ( + select( + i_int.pubmed_id, + i_int.assay_description, + i_int.affinity_units, + i_int.affinity_low, + i_int.affinity_median, + i_int.affinity_high, + i_int.type, + i_int.action, + i_int.target_uniprot, + lig.name.label("ligand_name"), + lig.pubchem_sid, + ) + .join(lig) + .where(i_int.target_uniprot.isnot(None)) + .where(lig.pubchem_sid.isnot(None)) + ) + + with self.engine.connect() as conn: + df_iuphar = pd.read_sql(sql, conn).replace({np.nan: None}) - df_iuphar = pd.read_sql(sql, 
self.engine).replace({np.nan: None}) df_iuphar.set_index("target_uniprot", inplace=True) df_graph = pd.DataFrame( uniprot.get_pure_uniprot_rid_dict_in_bel_context().items(), @@ -152,7 +170,11 @@ def update_interactions(self) -> int: total=df_join.shape[0], desc=f"Update {self.biodb_name.upper()} interactions", ): - if data.ligand_gene_symbol and data.ligand_species and "Human" in data.ligand_species: + if ( + "ligand_gene_symbol" in data.index + and "ligand_species" in data.index + and "Human" in data.ligand_species + ): symbol = data.ligand_gene_symbol.split("|")[0] # human seems to always the first a_value_dict = { "pure": True, @@ -161,6 +183,7 @@ def update_interactions(self) -> int: "name": symbol, } a_class = "protein" + else: a_value_dict = { "pure": True, @@ -170,6 +193,7 @@ def update_interactions(self) -> int: "label": data.ligand_name, } a_class = "abundance" + a_rid = self.get_create_rid(a_class, value_dict=a_value_dict, check_for="bel") i_value_dict = { @@ -189,3 +213,9 @@ def update_interactions(self) -> int: # Hgnc(self.client).update_bel() return df_join.shape[0] + + +if __name__ == "__main__": + hgncdb = Iuphar() + # hgncdb.recreate_tables() + hgncdb.update() diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index b5f3036..a83b81b 100755 --- a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -60,7 +60,7 @@ SIDER_SE = "http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz" # Expression Atlas # -EXPRESSION_ATLAS_BASE = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/" +EXPRESSION_ATLAS_BASE = "https://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/" EXPRESSION_ATLAS_EXPERIMENTS = EXPRESSION_ATLAS_BASE + "atlas-latest-data.tar.gz" # DisGeNet # diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index 11d2c4d..d28002f 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -1,9 +1,10 @@ 
"""IUPHAR RDBMS model definition.""" -from typing import List +from typing import List, Optional -from sqlalchemy import BigInteger, Boolean, Column, ForeignKey, Integer, Numeric, String, Text +from sqlalchemy import (BigInteger, Boolean, Column, ForeignKey, Integer, + Numeric, String, Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict @@ -17,31 +18,31 @@ class IupharLigand(Base): id: Mapped[int] = mapped_column(primary_key=True) name: Mapped[str] = mapped_column(Text) - species: Mapped[str] = mapped_column(Text) + species: Mapped[Optional[str]] = mapped_column(Text) type: Mapped[str] = mapped_column(Text) - approved: Mapped[bool] = mapped_column() - withdrawn: Mapped[bool] = mapped_column() - labelled: Mapped[bool] = mapped_column() - radioactive: Mapped[bool] = mapped_column() - pubchem_sid: Mapped[int] = mapped_column() - pubchem_cid: Mapped[str] = mapped_column( + approved: Mapped[Optional[bool]] = mapped_column() + withdrawn: Mapped[Optional[bool]] = mapped_column() + labelled: Mapped[Optional[bool]] = mapped_column() + radioactive: Mapped[Optional[bool]] = mapped_column() + pubchem_sid: Mapped[Optional[int]] = mapped_column() + pubchem_cid: Mapped[Optional[int]] = mapped_column( Text ) # TODO: This is a integer, but for import reasons this changed to text - uniprot_id: Mapped[str] = mapped_column(Text) - ensembl_id: Mapped[str] = mapped_column(Text) - ligand_subunit_ids: Mapped[str] = mapped_column(Text) - ligand_subunit_name: Mapped[str] = mapped_column(Text) - ligand_subunit_uni_prot_ids: Mapped[str] = mapped_column(Text) - ligand_subunit_ensembl_ids: Mapped[str] = mapped_column(Text) - iupac_name: Mapped[str] = mapped_column(Text) - inn: Mapped[str] = mapped_column(Text) - synonyms: Mapped[str] = mapped_column(Text) - smiles: Mapped[str] = mapped_column(Text) - 
inchi_key: Mapped[str] = mapped_column(Text) - inchi: Mapped[str] = mapped_column(Text) - gto_immu_pdb: Mapped[bool] = mapped_column() - gto_mpdb: Mapped[bool] = mapped_column() - antibacterial: Mapped[bool] = mapped_column() + uniprot_id: Mapped[Optional[str]] = mapped_column(Text) + ensembl_id: Mapped[Optional[str]] = mapped_column(Text) + ligand_subunit_ids: Mapped[Optional[str]] = mapped_column(Text) + ligand_subunit_name: Mapped[Optional[str]] = mapped_column(Text) + ligand_subunit_uni_prot_ids: Mapped[Optional[str]] = mapped_column(Text) + ligand_subunit_ensembl_ids: Mapped[Optional[str]] = mapped_column(Text) + iupac_name: Mapped[Optional[str]] = mapped_column(Text) + inn: Mapped[Optional[str]] = mapped_column(Text) + synonyms: Mapped[Optional[str]] = mapped_column(Text) + smiles: Mapped[Optional[str]] = mapped_column(Text) + inchi_key: Mapped[Optional[str]] = mapped_column(Text) + inchi: Mapped[Optional[str]] = mapped_column(Text) + gto_immu_pdb: Mapped[Optional[bool]] = mapped_column() + gto_mpdb: Mapped[Optional[bool]] = mapped_column() + antibacterial: Mapped[Optional[bool]] = mapped_column() interactions: Mapped[List["IupharInteraction"]] = relationship("IupharInteraction") @@ -56,48 +57,48 @@ class IupharInteraction(Base): __tablename__ = "iuphar_interaction" id = mapped_column(Integer, primary_key=True) - target: Mapped[str] = mapped_column(String(255)) - target_id: Mapped[int] = mapped_column() - target_subunit_ids: Mapped[str] = mapped_column(Text) - target_gene_symbol: Mapped[str] = mapped_column(String(100)) - target_uniprot: Mapped[str] = mapped_column(String(100)) - target_ensembl_gene_id: Mapped[str] = mapped_column(String(200)) - target_ligand: Mapped[str] = mapped_column(String(100)) - target_ligand_id: Mapped[int] = mapped_column() - target_ligand_subunit_ids: Mapped[str] = mapped_column(Text) - target_ligand_gene_symbol: Mapped[str] = mapped_column(String(50)) - target_ligand_uniprot_id: Mapped[str] = mapped_column(String(200)) - 
target_ligand_ensembl_gene_id: Mapped[str] = mapped_column(String(50)) - target_ligand_pubchem_sid: Mapped[int] = mapped_column() - target_species: Mapped[str] = mapped_column(String(100)) + target: Mapped[Optional[str]] = mapped_column(String(255)) + target_id: Mapped[Optional[int]] = mapped_column() + target_subunit_ids: Mapped[Optional[str]] = mapped_column(Text) + target_gene_symbol: Mapped[Optional[str]] = mapped_column(String(100)) + target_uniprot: Mapped[Optional[str]] = mapped_column(String(100)) + target_ensembl_gene_id: Mapped[Optional[str]] = mapped_column(String(200)) + target_ligand: Mapped[Optional[str]] = mapped_column(String(100)) + target_ligand_id: Mapped[Optional[str]] = mapped_column() + target_ligand_subunit_ids: Mapped[Optional[str]] = mapped_column(Text) + target_ligand_gene_symbol: Mapped[Optional[str]] = mapped_column(String(50)) + target_ligand_uniprot_id: Mapped[Optional[str]] = mapped_column(String(200)) + target_ligand_ensembl_gene_id: Mapped[Optional[str]] = mapped_column(String(50)) + target_ligand_pubchem_sid: Mapped[Optional[str]] = mapped_column() + target_species: Mapped[Optional[str]] = mapped_column(String(100)) ligand: Mapped[str] = mapped_column(String(255)) ligand_id: Mapped[int] = mapped_column(ForeignKey("iuphar_ligand.id"), index=True) - ligand_subunit_ids: Mapped[str] = mapped_column(Text) - ligand_gene_symbol: Mapped[str] = mapped_column(String(50)) - ligand_species: Mapped[str] = mapped_column(String(50)) - ligand_pubchem_sid: Mapped[int] = mapped_column() + ligand_subunit_ids: Mapped[Optional[str]] = mapped_column(Text) + ligand_gene_symbol: Mapped[Optional[str]] = mapped_column(String(50)) + ligand_species: Mapped[Optional[str]] = mapped_column(String(50)) + ligand_pubchem_sid: Mapped[Optional[int]] = mapped_column() ligand_type: Mapped[str] = mapped_column(Text) approved: Mapped[bool] = mapped_column() type: Mapped[str] = mapped_column(String(100)) action: Mapped[str] = mapped_column(String(100)) - action_comment: 
Mapped[str] = mapped_column(String(255)) - selectivity: Mapped[str] = mapped_column(String(50)) + action_comment: Mapped[Optional[str]] = mapped_column(String(255)) + selectivity: Mapped[Optional[str]] = mapped_column(String(50)) endogenous: Mapped[bool] = mapped_column() primary_target: Mapped[bool] = mapped_column() - concentration_range: Mapped[str] = mapped_column(String(50)) + concentration_range: Mapped[Optional[str]] = mapped_column(String(50)) affinity_units: Mapped[str] = mapped_column(String(10)) - affinity_high: Mapped[float] = mapped_column(Numeric(6, 2)) - affinity_median: Mapped[float] = mapped_column(Numeric(6, 2)) - affinity_low: Mapped[float] = mapped_column(Numeric(6, 2)) - original_affinity_units: Mapped[str] = mapped_column(String(10)) - original_affinity_low_nm: Mapped[float] = mapped_column(Numeric(12, 3)) - original_affinity_median_nm: Mapped[float] = mapped_column(Numeric(12, 3)) - original_affinity_high_nm: Mapped[float] = mapped_column(Numeric(12, 3)) - original_affinity_relation: Mapped[str] = mapped_column(String(1)) - assay_description: Mapped[str] = mapped_column(Text) - receptor_site: Mapped[str] = mapped_column(String(100)) - ligand_context: Mapped[str] = mapped_column(String(50)) - pubmed_id: Mapped[str] = mapped_column(Text) + affinity_high: Mapped[Optional[float]] = mapped_column(Numeric(6, 2)) + affinity_median: Mapped[Optional[float]] = mapped_column(Numeric(6, 2)) + affinity_low: Mapped[Optional[float]] = mapped_column(Numeric(6, 2)) + original_affinity_units: Mapped[Optional[str]] = mapped_column(String(10)) + original_affinity_low_nm: Mapped[Optional[float]] = mapped_column(Numeric(12, 3)) + original_affinity_median_nm: Mapped[Optional[float]] = mapped_column(Numeric(12, 3)) + original_affinity_high_nm: Mapped[Optional[float]] = mapped_column(Numeric(12, 3)) + original_affinity_relation: Mapped[Optional[str]] = mapped_column(String(1)) + assay_description: Mapped[Optional[str]] = mapped_column(Text) + receptor_site: 
Mapped[Optional[str]] = mapped_column(String(100)) + ligand_context: Mapped[Optional[str]] = mapped_column(String(50)) + pubmed_id: Mapped[Optional[str]] = mapped_column(Text) def as_dict(self): """Convert object values to dictionary.""" From c1856ca410078c9f9dc21fd7fa3bc8675fe256e0 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 09:01:40 +0200 Subject: [PATCH 19/58] style: black and isort --- ebel/manager/models.py | 2 +- ebel/manager/orientdb/biodbs/disgenet.py | 2 +- ebel/manager/orientdb/biodbs/hgnc.py | 26 +++++++------------ ebel/manager/orientdb/biodbs/stringdb.py | 2 +- ebel/manager/rdbms/models/biogrid.py | 2 +- ebel/manager/rdbms/models/chebi.py | 2 +- .../rdbms/models/clinical_trials_gov.py | 4 +-- ebel/manager/rdbms/models/clinvar.py | 4 +-- ebel/manager/rdbms/models/disgenet.py | 2 +- ebel/manager/rdbms/models/drugbank.py | 2 +- ebel/manager/rdbms/models/ensembl.py | 2 +- ebel/manager/rdbms/models/expression_atlas.py | 2 +- ebel/manager/rdbms/models/gwas_catalog.py | 2 +- ebel/manager/rdbms/models/hgnc.py | 2 +- ebel/manager/rdbms/models/human_ortholog.py | 2 +- ebel/manager/rdbms/models/intact.py | 2 +- ebel/manager/rdbms/models/iuphar.py | 3 +-- ebel/manager/rdbms/models/kegg.py | 2 +- ebel/manager/rdbms/models/mirtarbase.py | 2 +- ebel/manager/rdbms/models/ncbi.py | 2 +- ebel/manager/rdbms/models/nsides.py | 2 +- ebel/manager/rdbms/models/pathway_commons.py | 2 +- ebel/manager/rdbms/models/protein_atlas.py | 2 +- ebel/manager/rdbms/models/reactome.py | 2 +- ebel/manager/rdbms/models/stringdb.py | 2 +- ebel/manager/rdbms/models/uniprot.py | 2 +- 26 files changed, 37 insertions(+), 44 deletions(-) diff --git a/ebel/manager/models.py b/ebel/manager/models.py index 010ef16..32c0307 100755 --- a/ebel/manager/models.py +++ b/ebel/manager/models.py @@ -15,7 +15,7 @@ from lark import Lark, Token, Tree from sqlalchemy import Boolean, ForeignKey, Index, Integer, String from sqlalchemy.ext.declarative import declarative_base, declared_attr 
-from sqlalchemy.orm import relationship, mapped_column +from sqlalchemy.orm import mapped_column, relationship from sqlalchemy.sql.expression import func from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index bb3259d..c052cc7 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -4,7 +4,7 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select +from sqlalchemy import select, text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls diff --git a/ebel/manager/orientdb/biodbs/hgnc.py b/ebel/manager/orientdb/biodbs/hgnc.py index 91d1a65..f16e0b7 100644 --- a/ebel/manager/orientdb/biodbs/hgnc.py +++ b/ebel/manager/orientdb/biodbs/hgnc.py @@ -15,25 +15,19 @@ from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import HGNC from ebel.manager.rdbms.models import hgnc +from ebel.manager.rdbms.models.hgnc import AliasName, AliasSymbol, Ccds, Ena, Enzyme, GeneGroupId, GeneGroupName +from ebel.manager.rdbms.models.hgnc import Hgnc as HgncDb from ebel.manager.rdbms.models.hgnc import ( - Hgnc as HgncDb, + Lsdb, + Mgd, + Omim, + PrevName, PrevSymbol, - AliasSymbol, - AliasName, - Ccds, - Ena, - Enzyme, - GeneGroupName, - GeneGroupId, - UniProt, - RnaCentral, - Rgd, - RefSeq, PubMed, - PrevName, - Omim, - Mgd, - Lsdb, + RefSeq, + Rgd, + RnaCentral, + UniProt, ) from ebel.tools import get_file_path diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index b5c6774..19aceb3 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select, or_ +from sqlalchemy import or_, select, text from 
tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index d552d56..b8dea17 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -1,7 +1,7 @@ """BioGRID RDBMS model definition.""" from sqlalchemy import Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/chebi.py b/ebel/manager/rdbms/models/chebi.py index 52d3120..365968e 100644 --- a/ebel/manager/rdbms/models/chebi.py +++ b/ebel/manager/rdbms/models/chebi.py @@ -4,7 +4,7 @@ from sqlalchemy import DateTime, ForeignKey, Index, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship Base = declarative_base() diff --git a/ebel/manager/rdbms/models/clinical_trials_gov.py b/ebel/manager/rdbms/models/clinical_trials_gov.py index c38cf31..c56daef 100644 --- a/ebel/manager/rdbms/models/clinical_trials_gov.py +++ b/ebel/manager/rdbms/models/clinical_trials_gov.py @@ -2,9 +2,9 @@ import re from typing import List, Optional -from sqlalchemy import ForeignKey, Integer, String, Table, Text, Column +from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/clinvar.py b/ebel/manager/rdbms/models/clinvar.py index 8f84af7..23ffd65 100644 --- a/ebel/manager/rdbms/models/clinvar.py +++ 
b/ebel/manager/rdbms/models/clinvar.py @@ -1,9 +1,9 @@ """ClinVar RDBMS model definition.""" from typing import List, Optional -from sqlalchemy import ForeignKey, Index, Integer, String, Table, Text, Column +from sqlalchemy import Column, ForeignKey, Index, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/disgenet.py b/ebel/manager/rdbms/models/disgenet.py index 3ee071c..7959b9a 100644 --- a/ebel/manager/rdbms/models/disgenet.py +++ b/ebel/manager/rdbms/models/disgenet.py @@ -3,7 +3,7 @@ from sqlalchemy import BigInteger, Float, ForeignKey, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/drugbank.py b/ebel/manager/rdbms/models/drugbank.py index 6cd9b52..3f3800f 100644 --- a/ebel/manager/rdbms/models/drugbank.py +++ b/ebel/manager/rdbms/models/drugbank.py @@ -4,7 +4,7 @@ from sqlalchemy import Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship Base = declarative_base() diff --git a/ebel/manager/rdbms/models/ensembl.py b/ebel/manager/rdbms/models/ensembl.py index 6d88a66..5d386ec 100644 --- a/ebel/manager/rdbms/models/ensembl.py +++ b/ebel/manager/rdbms/models/ensembl.py @@ -2,7 +2,7 @@ from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from 
ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/expression_atlas.py b/ebel/manager/rdbms/models/expression_atlas.py index cf7afaf..8e38c5e 100644 --- a/ebel/manager/rdbms/models/expression_atlas.py +++ b/ebel/manager/rdbms/models/expression_atlas.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/gwas_catalog.py b/ebel/manager/rdbms/models/gwas_catalog.py index c846ead..550e128 100644 --- a/ebel/manager/rdbms/models/gwas_catalog.py +++ b/ebel/manager/rdbms/models/gwas_catalog.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index 6f6f80a..e6f5d1e 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ b/ebel/manager/rdbms/models/hgnc.py @@ -4,7 +4,7 @@ from sqlalchemy import BigInteger, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/human_ortholog.py b/ebel/manager/rdbms/models/human_ortholog.py index 565d88c..1b1ab01 100644 --- a/ebel/manager/rdbms/models/human_ortholog.py +++ b/ebel/manager/rdbms/models/human_ortholog.py @@ -1,7 +1,7 @@ """HGNC Human Ortholog RDBMS model definition.""" from sqlalchemy 
import Column, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/intact.py b/ebel/manager/rdbms/models/intact.py index fbbb3d5..62c6062 100644 --- a/ebel/manager/rdbms/models/intact.py +++ b/ebel/manager/rdbms/models/intact.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, Float, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index d28002f..83adc97 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -1,8 +1,7 @@ """IUPHAR RDBMS model definition.""" from typing import List, Optional -from sqlalchemy import (BigInteger, Boolean, Column, ForeignKey, Integer, - Numeric, String, Text) +from sqlalchemy import BigInteger, Boolean, Column, ForeignKey, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Mapped, mapped_column, relationship diff --git a/ebel/manager/rdbms/models/kegg.py b/ebel/manager/rdbms/models/kegg.py index d26d78d..d0975e0 100644 --- a/ebel/manager/rdbms/models/kegg.py +++ b/ebel/manager/rdbms/models/kegg.py @@ -1,7 +1,7 @@ """KEGG RDBMS model definition.""" from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/mirtarbase.py b/ebel/manager/rdbms/models/mirtarbase.py index 15df19a..f44aaaf 100644 --- 
a/ebel/manager/rdbms/models/mirtarbase.py +++ b/ebel/manager/rdbms/models/mirtarbase.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index a26ac3a..caa04f7 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from . import object_as_dict diff --git a/ebel/manager/rdbms/models/nsides.py b/ebel/manager/rdbms/models/nsides.py index aceb587..577c09f 100644 --- a/ebel/manager/rdbms/models/nsides.py +++ b/ebel/manager/rdbms/models/nsides.py @@ -2,7 +2,7 @@ from sqlalchemy import Column, Float, Index, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/pathway_commons.py b/ebel/manager/rdbms/models/pathway_commons.py index a27b49f..172b159 100644 --- a/ebel/manager/rdbms/models/pathway_commons.py +++ b/ebel/manager/rdbms/models/pathway_commons.py @@ -3,7 +3,7 @@ from sqlalchemy import BigInteger, Column, ForeignKey, Integer, String, Table from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/protein_atlas.py 
b/ebel/manager/rdbms/models/protein_atlas.py index 07ef8fc..0857120 100644 --- a/ebel/manager/rdbms/models/protein_atlas.py +++ b/ebel/manager/rdbms/models/protein_atlas.py @@ -3,7 +3,7 @@ from sqlalchemy import Column, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column Base = declarative_base() diff --git a/ebel/manager/rdbms/models/reactome.py b/ebel/manager/rdbms/models/reactome.py index 3852882..42f5b68 100644 --- a/ebel/manager/rdbms/models/reactome.py +++ b/ebel/manager/rdbms/models/reactome.py @@ -1,7 +1,7 @@ """Reactome RDBMS model definition.""" from sqlalchemy import Column, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/stringdb.py b/ebel/manager/rdbms/models/stringdb.py index 47eccac..1d1992f 100644 --- a/ebel/manager/rdbms/models/stringdb.py +++ b/ebel/manager/rdbms/models/stringdb.py @@ -3,7 +3,7 @@ from sqlalchemy import Boolean, Column, Integer, SmallInteger, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column from ebel.manager.rdbms.models import object_as_dict diff --git a/ebel/manager/rdbms/models/uniprot.py b/ebel/manager/rdbms/models/uniprot.py index a63484a..50e3170 100644 --- a/ebel/manager/rdbms/models/uniprot.py +++ b/ebel/manager/rdbms/models/uniprot.py @@ -4,7 +4,7 @@ from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship, mapped_column, Mapped +from sqlalchemy.orm import Mapped, mapped_column, relationship Base = declarative_base() From 
201ef8a19143ece2c10a489923a515384ca8a6e5 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 10:52:07 +0200 Subject: [PATCH 20/58] feat: update intact and iuphar to select stmts --- ebel/manager/orientdb/biodbs/intact.py | 101 +++++++++++++++------ ebel/manager/orientdb/biodbs/iuphar.py | 9 -- ebel/manager/orientdb/biodbs/mirtarbase.py | 51 ++++++++--- 3 files changed, 107 insertions(+), 54 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 0a4d57d..a450fb8 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -5,7 +5,7 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import select, or_ from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -77,18 +77,22 @@ def insert_data(self) -> Dict[str, int]: df = pd.read_csv(zf.open("intact.txt"), sep="\t", usecols=usecols.keys()) df.rename(columns=usecols, inplace=True) + regex_accession = r"uniprotkb:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})" df.int_a_uniprot_id = df.int_a_uniprot_id.str.extract(regex_accession)[0] df.int_b_uniprot_id = df.int_b_uniprot_id.str.extract(regex_accession)[0] df = df[(pd.notnull(df.int_a_uniprot_id) & pd.notnull(df.int_b_uniprot_id))] + regex_detection_method = r"psi-mi:\"MI:0*(?P\d+)\"\((?P[^)]+)\)" df = df.join(df.dm.str.extract(regex_detection_method), how="left") df.drop(columns=["dm"], inplace=True) df.pmid = df.pmid.str.extract(r"pubmed:(\d+)") + regex_interaction_type = r"psi-mi:\"MI:0*(?P\d+)\"\((?P[^)]+)\)" df = df.join(df.it.str.extract(regex_interaction_type), how="left") df.drop(columns=["it"], inplace=True) df.confidence_value = df.confidence_value.str.extract(r"intact-miscore:(\d+(\.\d+)?)")[0] + df.index += 1 df.index.rename("id", inplace=True) @@ -141,9 +145,14 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: 
namespace, value """ return_value = () - sql = f"""Select s.symbol, u.taxid from uniprot u inner join uniprot_gene_symbol s - on (u.id=s.uniprot_id) where u.accession='{uniprot_accession}' limit 1""" - result = self.session.execute(text(sql)).fetchone() + + sql = ( + select(uniprot.GeneSymbol.symbol, uniprot.Uniprot.taxid) + .join(uniprot.Uniprot) + .where(uniprot.Uniprot.accession == uniprot_accession) + ) + + result = self.session.execute(sql).fetchone() taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} if result: name, taxid = result @@ -161,38 +170,64 @@ def update_interactions(self) -> int: uniprot_rid_dict = uniprot.get_pure_uniprot_rid_dict_in_bel_context() - sql_temp = """SELECT - int_a_uniprot_id, - int_b_uniprot_id, - pmid, - interaction_ids, - interaction_type, - interaction_type_psimi_id, - detection_method, - detection_method_psimi_id, - confidence_value - FROM - intact - WHERE - int_a_uniprot_id = '{uniprot_accession}' or int_b_uniprot_id = '{uniprot_accession}' - GROUP BY - int_a_uniprot_id, - int_b_uniprot_id, - pmid, - interaction_ids, - interaction_type, - interaction_type_psimi_id, - detection_method, - detection_method_psimi_id, - confidence_value""" + # sql_temp = """SELECT + # int_a_uniprot_id, + # int_b_uniprot_id, + # pmid, + # interaction_ids, + # interaction_type, + # interaction_type_psimi_id, + # detection_method, + # detection_method_psimi_id, + # confidence_value + # FROM + # intact + # WHERE + # int_a_uniprot_id = '{uniprot_accession}' or int_b_uniprot_id = '{uniprot_accession}' + # GROUP BY + # int_a_uniprot_id, + # int_b_uniprot_id, + # pmid, + # interaction_ids, + # interaction_type, + # interaction_type_psimi_id, + # detection_method, + # detection_method_psimi_id, + # confidence_value""" updated = 0 uniprot_accessions = tuple(uniprot_rid_dict.keys()) + it = intact.Intact for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): - sql = sql_temp.format(uniprot_accession=uniprot_accession) 
- result = self.session.execute(text(sql)) + # sql = sql_temp.format(uniprot_accession=uniprot_accession) + sql = ( + select( + it.int_a_uniprot_id, + it.int_b_uniprot_id, + it.pmid, + it.interaction_ids, + it.interaction_type, + it.interaction_type_psimi_id, + it.detection_method, + it.detection_method_psimi_id, + it.confidence_value, + ) + .where(or_(it.int_a_uniprot_id == uniprot_accession, it.int_b_uniprot_id == uniprot_accession)) + .group_by( + it.int_a_uniprot_id, + it.int_b_uniprot_id, + it.pmid, + it.interaction_ids, + it.interaction_type, + it.interaction_type_psimi_id, + it.detection_method, + it.detection_method_psimi_id, + it.confidence_value, + ) + ) + result = self.session.execute(sql) for ( up_a, @@ -229,3 +264,9 @@ def update_interactions(self) -> int: updated += 1 return updated + + +if __name__ == "__main__": + hgncdb = IntAct() + hgncdb.recreate_tables() + hgncdb.update() diff --git a/ebel/manager/orientdb/biodbs/iuphar.py b/ebel/manager/orientdb/biodbs/iuphar.py index 71e1476..3691a41 100644 --- a/ebel/manager/orientdb/biodbs/iuphar.py +++ b/ebel/manager/orientdb/biodbs/iuphar.py @@ -209,13 +209,4 @@ def update_interactions(self) -> int: edge_class = iuphar_edge_type_mapper.get(data.type, "iuphar_interaction") self.create_edge(edge_class, from_rid=a_rid, to_rid=data.rid, value_dict=i_value_dict) - # not sure if this is really needed - # Hgnc(self.client).update_bel() - return df_join.shape[0] - - -if __name__ == "__main__": - hgncdb = Iuphar() - # hgncdb.recreate_tables() - hgncdb.update() diff --git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index 4085b2a..c66e14e 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -3,7 +3,7 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -37,7 +37,7 
@@ def __contains__(self, item) -> bool: def insert_data(self) -> Dict[str, int]: """Insert mirtarbase data into database.""" - # TODO Fix download error - + # TODO: Fix download error - # ssl.SSLError: [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:997) df = pd.read_excel(self.file_path) df.columns = self._standardize_column_names(df.columns) @@ -58,20 +58,35 @@ def update_interactions(self) -> int: self.clear_edges() df_symbol_rid = self.get_pure_symbol_rid_df_in_bel_context(class_name="rna", namespace="HGNC") - sql = f"""Select - mi_rna, - target_gene as symbol, - support_type, - references_pmid as pmid, - experiments - from - {mirtarbase.Mirtarbase.__tablename__} - where - species_mi_rna='Homo sapiens' and - species_target_gene='Homo sapiens' and - support_type in ('Functional MTI', 'Non-Functional MTI')""" + # sql = f"""Select + # mi_rna, + # target_gene as symbol, + # support_type, + # references_pmid as pmid, + # experiments + # from + # {mirtarbase.Mirtarbase.__tablename__} + # where + # species_mi_rna='Homo sapiens' and + # species_target_gene='Homo sapiens' and + # support_type in ()""" + + mtb = mirtarbase.Mirtarbase + sql = ( + select( + mtb.mi_rna, + mtb.target_gene.label("symbol"), + mtb.support_type, + mtb.references_pmid.label("pmid"), + mtb.experiments, + ) + .where(mtb.species_mi_rna == "Homo sapiens") + .where(mtb.species_target_gene == "Homo sapiens") + .where(mtb.support_type.in_(["Functional MTI", "Non-Functional MTI"])) + ) + cols = ["mi_rna", "symbol", "support_type", "pmid", "experiments"] - df_mirtarbase = pd.DataFrame(self.session.execute(text(sql)).fetchall(), columns=cols) + df_mirtarbase = pd.DataFrame(self.session.execute(sql).fetchall(), columns=cols) df_mirtarbase.experiments = df_mirtarbase.experiments.str.split("//") df_join = df_mirtarbase.set_index("symbol").join(df_symbol_rid.set_index("symbol"), how="inner") @@ -95,3 +110,9 @@ def update_interactions(self) -> int: updated += 1 return updated + + +if 
__name__ == "__main__": + hgncdb = MirTarBase() + # hgncdb.recreate_tables() + hgncdb.update() From ad2b9a1b84cb2a181c9bbf1de2c5c45e4b9b1e06 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 11:01:07 +0200 Subject: [PATCH 21/58] fix: chebi inster replace table instead of append --- ebel/manager/orientdb/biodbs/chebi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/biodbs/chebi.py b/ebel/manager/orientdb/biodbs/chebi.py index 3ace1cb..767f4c6 100644 --- a/ebel/manager/orientdb/biodbs/chebi.py +++ b/ebel/manager/orientdb/biodbs/chebi.py @@ -104,7 +104,7 @@ def insert_data(self) -> Dict[str, int]: .reset_index() ) - df.to_sql(table_name, self.engine, index=False, if_exists="append") + df.to_sql(table_name, self.engine, index=False, if_exists="replace") inserted[table_name] += df.shape[0] self.session.commit() From 3499b8d78548b6018c53295d839be0632dbeb3cc Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 12:29:27 +0200 Subject: [PATCH 22/58] chore: remove test code --- ebel/manager/models.py | 1 + ebel/manager/orientdb/biodbs/intact.py | 6 ------ ebel/manager/orientdb/biodbs/mirtarbase.py | 6 ------ 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/ebel/manager/models.py b/ebel/manager/models.py index 32c0307..98fe552 100755 --- a/ebel/manager/models.py +++ b/ebel/manager/models.py @@ -42,6 +42,7 @@ def reset_tables(engine: sqlalchemy.engine.Engine, force_new_db: bool) -> None: if force_new_db: Base.metadata.drop_all(bind=engine) + Base.metadata.create_all(bind=engine, checkfirst=True) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index a450fb8..c3c4959 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -264,9 +264,3 @@ def update_interactions(self) -> int: updated += 1 return updated - - -if __name__ == "__main__": - hgncdb = IntAct() - hgncdb.recreate_tables() - hgncdb.update() diff 
--git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index c66e14e..68ace6d 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -110,9 +110,3 @@ def update_interactions(self) -> int: updated += 1 return updated - - -if __name__ == "__main__": - hgncdb = MirTarBase() - # hgncdb.recreate_tables() - hgncdb.update() From f87ebfcb640e9676116f604d3405d205764274a9 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 21 Sep 2023 14:00:22 +0200 Subject: [PATCH 23/58] build: update pandas version in reqs --- pyproject.toml | 4 ++-- requirements.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a0b9f80..f6e5a1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ lark-parser = "^0.11.3" click = "^8.1.7" requests = "^2.31.0" tqdm = "^4.66.1" -pandas = "^1.5.3" +pandas = "^2.1.1" sqlalchemy = "^2.0.20" SQLAlchemy-Utils = "^0.37.9" xlwt = "^1.3.0" @@ -117,4 +117,4 @@ source = [ ] [tool.coverage.html] -directory = "coverage_html_report" \ No newline at end of file +directory = "coverage_html_report" diff --git a/requirements.txt b/requirements.txt index d128e4b..dcc3963 100755 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ lark-parser==0.11.3 click>=8.1.7 requests>=2.31.0 tqdm>=4.66.1 -pandas>=1.5.3 +pandas>=2.2.1 sqlalchemy>=2.0.20 SQLAlchemy-Utils==0.37.9 xlwt==1.3.0 @@ -18,4 +18,4 @@ connexion[swagger-ui]==2.14.2 cryptography==3.4.8 openpyxl==3.1.2 graphviz==0.20 -pyorientdb==1.0.0 \ No newline at end of file +pyorientdb==1.0.0 From f778baf93462b363469b78bedeb63c40e6576c52 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 22 Sep 2023 09:40:56 +0200 Subject: [PATCH 24/58] fix: pandas version in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dcc3963..cbde27b 100755 --- 
a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ lark-parser==0.11.3 click>=8.1.7 requests>=2.31.0 tqdm>=4.66.1 -pandas>=2.2.1 +pandas>=2.1.1 sqlalchemy>=2.0.20 SQLAlchemy-Utils==0.37.9 xlwt==1.3.0 From 48418e56c46df7231b38923ee996778a10b3bbb7 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 22 Sep 2023 10:29:42 +0200 Subject: [PATCH 25/58] fix: gwas catalog nullable props --- ebel/manager/orientdb/biodbs/clinvar.py | 6 +++--- ebel/manager/orientdb/biodbs/gwas_catalog.py | 4 ++-- ebel/manager/rdbms/models/gwas_catalog.py | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index 613ce5e..f8b77b6 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -63,9 +63,9 @@ def insert_data(self) -> Dict[str, int]: self._standardize_dataframe(df) df.index += 1 df.index.rename("id", inplace=True) - df.drop(columns=["phenotype_ids", "phenotype_list", "other_ids"]).to_sql( - self.biodb_name, self.engine, if_exists="append", chunksize=10000 - ) + + df_base = df.drop(columns=["phenotype_ids", "phenotype_list", "other_ids"]) + df_base.to_sql(clinvar.Clinvar.__tablename__, con=self.engine, if_exists="append", chunksize=10000) df_clinvar__phenotype = ( df["phenotype_list"] diff --git a/ebel/manager/orientdb/biodbs/gwas_catalog.py b/ebel/manager/orientdb/biodbs/gwas_catalog.py index 414ee8d..ac44b79 100644 --- a/ebel/manager/orientdb/biodbs/gwas_catalog.py +++ b/ebel/manager/orientdb/biodbs/gwas_catalog.py @@ -79,7 +79,7 @@ def insert_data(self) -> Dict[str, int]: table_name = gwas_catalog.GwasCatalog.__tablename__ - df[columns_main_table].to_sql(table_name, self.engine, if_exists="append") + df[columns_main_table].to_sql(table_name, self.engine, if_exists="replace") df.snp_gene_ids = df.snp_gene_ids.str.strip().str.split(", ") df[table_name + "_id"] = df.index @@ -89,7 +89,7 @@ def insert_data(self) -> Dict[str, 
int]: df_snp_gene_ids.index = range(1, df_snp_gene_ids.shape[0] + 1) df_snp_gene_ids.rename(columns={"snp_gene_ids": "ensembl_identifier"}, inplace=True) df_snp_gene_ids.index.rename("id", inplace=True) - df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists="append") + df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists="replace") self.session.commit() diff --git a/ebel/manager/rdbms/models/gwas_catalog.py b/ebel/manager/rdbms/models/gwas_catalog.py index 550e128..bcd336e 100644 --- a/ebel/manager/rdbms/models/gwas_catalog.py +++ b/ebel/manager/rdbms/models/gwas_catalog.py @@ -34,20 +34,20 @@ class GwasCatalog(Base): downstream_gene_id: Mapped[Optional[str]] = mapped_column(String(50)) upstream_gene_distance: Mapped[Optional[int]] = mapped_column() downstream_gene_distance: Mapped[Optional[int]] = mapped_column() - strongest_snp_risk_allele: Mapped[str] = mapped_column(Text) - snp: Mapped[str] = mapped_column(Text) + strongest_snp_risk_allele: Mapped[Optional[int]] = mapped_column(Text) + snp: Mapped[Optional[int]] = mapped_column(Text) merged: Mapped[Optional[int]] = mapped_column() snp_id_current: Mapped[Optional[str]] = mapped_column(Text) context: Mapped[Optional[str]] = mapped_column(Text) intergenic: Mapped[Optional[int]] = mapped_column() risk_allele_frequency: Mapped[Optional[str]] = mapped_column(Text) - p_value: Mapped[float] = mapped_column() - pvalue_mlog: Mapped[float] = mapped_column() + p_value: Mapped[Optional[float]] = mapped_column() + pvalue_mlog: Mapped[Optional[float]] = mapped_column() p_value_text: Mapped[Optional[str]] = mapped_column(Text) or_or_beta: Mapped[Optional[float]] = mapped_column() _95_ci_text: Mapped[Optional[str]] = mapped_column(Text) - platform_snps_passing_qc: Mapped[str] = mapped_column(Text) - cnv: Mapped[str] = mapped_column(Text) + platform_snps_passing_qc: Mapped[Optional[str]] = mapped_column(Text) + cnv: Mapped[Optional[str]] = mapped_column(Text) snp_genes: 
Mapped[List["SnpGene"]] = relationship("SnpGene", back_populates="gwascatalog") From d645b5dcdbb16fcffc86a5a3d4b9a66e60eba7fb Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 22 Sep 2023 13:33:50 +0200 Subject: [PATCH 26/58] fix: add Ensembl update to ClinVar --- ebel/manager/orientdb/biodbs/clinvar.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index f8b77b6..56cca96 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -11,6 +11,7 @@ from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import CLINVAR from ebel.manager.rdbms.models import clinvar +from ebel.manager.orientdb.biodbs.ensembl import Ensembl from ebel.tools import get_disease_trait_keywords_from_config, get_file_path logger = logging.getLogger(__name__) @@ -58,7 +59,12 @@ def insert_data(self) -> Dict[str, int]: """Insert data.""" inserted = {} logger.info("Insert data for ClinVar") + + # Depends on Ensembl + Ensembl().update() + self.recreate_tables() + df = pd.read_csv(self.file_path, sep="\t", low_memory=False) self._standardize_dataframe(df) df.index += 1 From 191ca9128476322563a336fe216bef4238ffe025 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Sat, 23 Sep 2023 09:40:28 +0200 Subject: [PATCH 27/58] fix: add hgnc update to stringdb --- ebel/manager/orientdb/biodbs/stringdb.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 19aceb3..a10c904 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import or_, select, text +from sqlalchemy import or_, select, text, and_ from tqdm import tqdm from ebel.manager.orientdb import odb_meta, 
odb_structure, urls @@ -167,6 +167,7 @@ def get_stringdb_action_hgnc_set(self): def update_interactions(self) -> Dict[str, int]: """Update the edges with StringDB metadata.""" hgnc = Hgnc(self.client) + hgnc.update() # If users haven't run Hgnc yet updated = dict() updated["interactions"] = self.update_stringdb_interactions(hgnc) updated["actions"] = self.update_action_interactions(hgnc) @@ -322,3 +323,9 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: updated += 1 return updated + + +if __name__ == "__main__": + a = StringDb() + # a.recreate_tables() + a.update() From c39de4faf866d14252c4a75d27fcba1dda9d81da Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Sat, 23 Sep 2023 09:40:49 +0200 Subject: [PATCH 28/58] chore: remove test code from stringdb --- ebel/manager/orientdb/biodbs/stringdb.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index a10c904..83380a5 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -323,9 +323,3 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: updated += 1 return updated - - -if __name__ == "__main__": - a = StringDb() - # a.recreate_tables() - a.update() From 6c03eb20da7daaf5b8c1425425fbda0464481697 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 10:32:19 +0200 Subject: [PATCH 29/58] build: update sqlalchemy-utils version in deps --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6e5a1b..187016c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ requests = "^2.31.0" tqdm = "^4.66.1" pandas = "^2.1.1" sqlalchemy = "^2.0.20" -SQLAlchemy-Utils = "^0.37.9" +SQLAlchemy-Utils = "^0.41.1" xlwt = "^1.3.0" xlrd = "^2.0.1" xlsxwriter = "^1.4.5" diff --git a/requirements.txt b/requirements.txt index cbde27b..206a5d6 100755 --- a/requirements.txt 
+++ b/requirements.txt @@ -4,7 +4,7 @@ requests>=2.31.0 tqdm>=4.66.1 pandas>=2.1.1 sqlalchemy>=2.0.20 -SQLAlchemy-Utils==0.37.9 +SQLAlchemy-Utils==0.41.1 xlwt==1.3.0 xlrd==2.0.1 xlsxwriter==1.4.5 From 107676f91bdfb9efdd2bccef04333465818ecd13 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 10:35:57 +0200 Subject: [PATCH 30/58] fix: add check/create database step to abstract graph init --- ebel/manager/orientdb/odb_meta.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 8773bad..88e510f 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -94,13 +94,18 @@ def __init__( self.engine = rdb.engine self.session = rdb.session - if not (get_config_value("DATABASE", "sqlalchemy_connection_string") or database_exists(self.engine.url)): + conn = get_config_value("DATABASE", "sqlalchemy_connection_string") + + if not (conn or database_exists(self.engine.url)): if str(self.engine.url).startswith("mysql"): set_mysql_interactive() else: create_database(self.engine.url) + if not database_exists(self.engine.url): + create_database(self.engine.url) + def __config_params_check(self, overwrite_config: bool = False): """Go through passed/available configuration params.""" # Set the client From 3b9e9b616283de26cf40c2d49a175f05274ea189 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 11:07:04 +0200 Subject: [PATCH 31/58] fix: dafaults dir path --- ebel/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebel/constants.py b/ebel/constants.py index 86be228..bb8a088 100755 --- a/ebel/constants.py +++ b/ebel/constants.py @@ -4,7 +4,7 @@ import os from pathlib import Path -THIS_DIR = Path(__file__) +THIS_DIR = Path(__file__).parent PROJECT_NAME = "ebel" HOME = Path.home() From 0755a47f4207e90db7b45a2666e4482b14379bb6 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 11:10:28 
+0200 Subject: [PATCH 32/58] feat: begin updating biogrid to sql2 --- ebel/manager/orientdb/biodbs/biogrid.py | 47 ++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index 277998f..072342c 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select, func, cast, Integer from tqdm import tqdm from ebel import tools @@ -516,8 +516,9 @@ def update_interactions(self) -> int: ib.uniprot as object_uniprot, ib.taxonomy_id as object_taxonomy_id, es.experimental_system, - group_concat( distinct b.biogrid_id) as biogrid_ids, - group_concat( distinct if(p.source='PUBMED',CAST(p.source_identifier AS UNSIGNED),NULL)) as pmids, + group_concat( distinct b.biogrid_id) as biogrid_ids, group_concat( + distinct if(p.source='PUBMED',CAST(p.source_identifier AS UNSIGNED),NULL) + ) as pmids, count(distinct p.source_identifier) as num_pubs, group_concat( distinct if(p.source='DOI',CAST(p.source_identifier AS UNSIGNED),NULL)) as dois from @@ -540,6 +541,13 @@ def update_interactions(self) -> int: ib.taxonomy_id, es.experimental_system""" + b = biogrid.Biogrid + ia = biogrid.Interactor + ib = biogrid.Interactor + m = biogrid.Modification + p = biogrid.Publication + es = biogrid.ExperimentalSystem + uniprots_in_bel_set = self.get_pure_uniprots_in_bel_context() uniprot_modification_pairs = self.get_uniprot_modification_pairs() @@ -568,7 +576,38 @@ def update_interactions(self) -> int: object_uniprot=e["object_uniprot"], ) - for row in self.session.execute(text(sql)).fetchall(): + sql = ( + select( + ia.symbol.label("subject_symbol"), + ia.uniprot.label("subject_uniprot"), + ia.taxonomy_id.label("subject_taxonomy_id"), + m.modification, + ib.symbol.label("object_symbol"), + 
ib.uniprot.label("object_uniprot"), + ib.taxonomy_id.label("object_taxonomy_id"), + es.experimental_system, + func.group_concat( + b.biogrid_id.distinct().label("biogrid_ids"), + func.group_concat( + func.IF(p.source == "PUBMED", cast(p.source_identifier, Integer), None).distinct() + ).label("pmids"), + ), + p.source_identifier.count().label("num_pubs"), + func.group_concat(func.IF(p.source == "DOI", cast(p.source_identifier, Integer), None)).label( + "dois" + ), + ) + .join(ia) + .join(ib) + .join(m) + .join(p) + .join(es) + .where(ia.uniprot == e["subject_uniprot"]) + .where(ib.uniprot == e["object_uniprot"]) + .where(m.modification != "No Modification") + ) + + for row in self.session.execute(sql).fetchall(): row_dict = row._asdict() be = BioGridEdge(subject_rid=subj_pure_rid, object_rid=obj_pure_rid, **row_dict) edge_value_dict = be.get_edge_value_dict() From ded1170aeb2beb1797db2d368f58f3cb72bfda11 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 11:13:55 +0200 Subject: [PATCH 33/58] fix: add defaults import to odb_meta to ensure logging init --- ebel/manager/orientdb/odb_meta.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 88e510f..6240440 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -29,6 +29,7 @@ from tqdm import tqdm import ebel.database +import ebel.defaults from ebel.cache import set_mysql_interactive from ebel.config import get_config_as_dict, get_config_value, write_to_config from ebel.constants import DEFAULT_ODB, RID From d447e934e506618a4ef85ad2560ea9cdce5d00a0 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 13:33:32 +0200 Subject: [PATCH 34/58] fix: convert biogrid update sql text to sqla2 stmt --- ebel/defaults.py | 6 ++ ebel/manager/orientdb/biodbs/biogrid.py | 83 +++++++------------------ ebel/manager/rdbms/models/biogrid.py | 6 +- 3 files changed, 33 insertions(+), 62 deletions(-) 
diff --git a/ebel/defaults.py b/ebel/defaults.py index 3ff9bb7..8466190 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -56,8 +56,14 @@ logHandler.setFormatter(logh_format) logHandler.setLevel(logging.DEBUG) + # Console Handler ch = logging.StreamHandler() ch_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(ch_format) ch.setLevel(logging.WARNING) + +logging.basicConfig( + handlers=[logHandler, ch], + encoding="utf-8", +) diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index 072342c..b47cc88 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -8,6 +8,7 @@ import pandas as pd from pyorientdb import OrientDB from sqlalchemy import text, select, func, cast, Integer +from sqlalchemy.orm import aliased from tqdm import tqdm from ebel import tools @@ -286,7 +287,7 @@ def insert_data(self) -> Dict[str, int]: } # main table - df = pd.read_csv(self.file_path, usecols=use_columns.keys(), sep="\t", low_memory=False) + df = pd.read_csv(self.file_path, usecols=list(use_columns.keys()), sep="\t", low_memory=False) df.rename(columns=use_columns, inplace=True) df.replace("-", np.nan, inplace=True) @@ -505,45 +506,9 @@ def get_create_pure_protein_rid_by_uniprot(self, taxonomy_id, symbol, uniprot): def update_interactions(self) -> int: """Updates all BioGrid interactions.""" - # TODO: sql_temp as sqlalchemy query - sql_temp = """ - Select - ia.symbol as subject_symbol, - ia.uniprot as subject_uniprot, - ia.taxonomy_id as subject_taxonomy_id, - m.modification, - ib.symbol as object_symbol, - ib.uniprot as object_uniprot, - ib.taxonomy_id as object_taxonomy_id, - es.experimental_system, - group_concat( distinct b.biogrid_id) as biogrid_ids, group_concat( - distinct if(p.source='PUBMED',CAST(p.source_identifier AS UNSIGNED),NULL) - ) as pmids, - count(distinct p.source_identifier) as num_pubs, - group_concat( distinct 
if(p.source='DOI',CAST(p.source_identifier AS UNSIGNED),NULL)) as dois - from - biogrid b - inner join biogrid_interactor ia on (b.biogrid_a_id=ia.biogrid_id) - inner join biogrid_interactor ib on (b.biogrid_b_id=ib.biogrid_id) - inner join biogrid_modification m on (m.id=b.modification_id) - inner join biogrid_publication p on (b.publication_id=p.id) - inner join biogrid_experimental_system es on (b.experimental_system_id=es.id) - where - (ia.uniprot = '{subject_uniprot}' and ib.uniprot = '{object_uniprot}') and - m.modification != 'No Modification' - group by - ia.symbol, - ia.uniprot, - ia.taxonomy_id, - m.modification, - ib.symbol, - ib.uniprot, - ib.taxonomy_id, - es.experimental_system""" - b = biogrid.Biogrid - ia = biogrid.Interactor - ib = biogrid.Interactor + ia = aliased(biogrid.Interactor) + ib = aliased(biogrid.Interactor) m = biogrid.Modification p = biogrid.Publication es = biogrid.ExperimentalSystem @@ -554,6 +519,10 @@ def update_interactions(self) -> int: counter = 0 self.clear_edges() + if_func = func.iif if self.engine.dialect.name == "sqlite" else func.IF + + logging.info("Update BioGRID") + for e in tqdm( uniprot_modification_pairs, desc=f"Update {self.biodb_name.upper()} interactions", @@ -571,10 +540,8 @@ def update_interactions(self) -> int: uniprot=e["object_uniprot"], ) - sql = sql_temp.format( - subject_uniprot=e["subject_uniprot"], - object_uniprot=e["object_uniprot"], - ) + subject_uniprot = e["subject_uniprot"] + object_uniprot = e["object_uniprot"] sql = ( select( @@ -586,24 +553,22 @@ def update_interactions(self) -> int: ib.uniprot.label("object_uniprot"), ib.taxonomy_id.label("object_taxonomy_id"), es.experimental_system, + func.group_concat(b.biogrid_id.distinct()).label("biogrid_ids"), + func.group_concat( + if_func(p.source == "PUBMED", cast(p.source_identifier, Integer), None).distinct() + ).label("pmids"), + func.count(p.source_identifier).label("num_pubs"), func.group_concat( - b.biogrid_id.distinct().label("biogrid_ids"), - 
func.group_concat( - func.IF(p.source == "PUBMED", cast(p.source_identifier, Integer), None).distinct() - ).label("pmids"), - ), - p.source_identifier.count().label("num_pubs"), - func.group_concat(func.IF(p.source == "DOI", cast(p.source_identifier, Integer), None)).label( - "dois" - ), + if_func(p.source == "DOI", cast(p.source_identifier, Integer), None).distinct() + ).label("dois"), ) - .join(ia) - .join(ib) - .join(m) - .join(p) - .join(es) - .where(ia.uniprot == e["subject_uniprot"]) - .where(ib.uniprot == e["object_uniprot"]) + .join(ia, b.biogrid_a_id == ia.biogrid_id) + .join(ib, b.biogrid_b_id == ib.biogrid_id) + .join(m, m.id == b.modification_id) + .join(p, b.publication_id == p.id) + .join(es, b.experimental_system_id == es.id) + .where(ia.uniprot == subject_uniprot) + .where(ib.uniprot == object_uniprot) .where(m.modification != "No Modification") ) diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index b8dea17..f5cec5e 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -72,7 +72,7 @@ class Throughput(Base): __tablename__ = "biogrid_throughput" id: Mapped[int] = mapped_column(primary_key=True) throughput: Mapped[str] = mapped_column(String(255)) - frequency: Mapped[int] = mapped_column() + count: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" @@ -98,7 +98,7 @@ class ExperimentalSystem(Base): id: Mapped[int] = mapped_column(primary_key=True) experimental_system: Mapped[str] = mapped_column(String(255), nullable=True) experimental_system_type: Mapped[str] = mapped_column(String(255), nullable=True) - frequency: Mapped[int] = mapped_column() + count: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" @@ -149,7 +149,7 @@ class Modification(Base): __tablename__ = "biogrid_modification" id: Mapped[int] = mapped_column(primary_key=True) modification: Mapped[str] = mapped_column(String(255), 
nullable=True) - frequency: Mapped[int] = mapped_column() + count: Mapped[int] = mapped_column() def as_dict(self): """Convert object values to dictionary.""" From ae93f1a192369e9e20d600d1555aa493fdb9a609 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 13:48:59 +0200 Subject: [PATCH 35/58] fix: pathway commons super table join in update --- ebel/manager/orientdb/biodbs/pathway_commons.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/pathway_commons.py b/ebel/manager/orientdb/biodbs/pathway_commons.py index 88a23fe..c0ca6e9 100644 --- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -243,8 +243,7 @@ def update_interactions(self) -> Dict[str, int]: pc.PathwayCommons.interaction_type == edge_type ) - with self.engine.connect() as conn: - df_ppi_of = pd.read_sql(sql, conn) + df_ppi_of = pd.read_sql(sql, self.engine) df_join = ( df_ppi_of.set_index("participant_a") @@ -252,13 +251,13 @@ def update_interactions(self) -> Dict[str, int]: .rename(columns={"rid": "rid_a_all"}) .join(df_bel.set_index("symbol")) .reset_index() - .rename(columns={"rid": "rid_a_bel", "index": "a"}) + .rename(columns={"rid": "rid_a_bel", "participant_a": "a"}) .set_index("participant_b") .join(df_all.set_index("symbol")) .rename(columns={"rid": "rid_b_all"}) .join(df_bel.set_index("symbol")) .reset_index() - .rename(columns={"rid": "rid_b_bel", "index": "b"}) + .rename(columns={"rid": "rid_b_bel", "participant_b": "b"}) .set_index("id") ) @@ -305,3 +304,8 @@ def get_pathway_pmids_sources(self, pc_id, pc_pathway_name_rid_dict) -> tuple: pmids = [x.pmid for x in pc_obj.pmids] pathways = [pc_pathway_name_rid_dict[x.name] for x in pc_obj.pathway_names] return pathways, pmids, sources + + +if __name__ == "__main__": + foo = PathwayCommons() + foo.update_interactions() From ca26a1798fdea459287c8ebf38bb7b2d1e9c97f5 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: 
Mon, 25 Sep 2023 13:55:04 +0200 Subject: [PATCH 36/58] fix: remove database init --- ebel/manager/orientdb/odb_meta.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 6240440..0501be6 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -104,9 +104,6 @@ def __init__( else: create_database(self.engine.url) - if not database_exists(self.engine.url): - create_database(self.engine.url) - def __config_params_check(self, overwrite_config: bool = False): """Go through passed/available configuration params.""" # Set the client From 0122f00017040468d6a5a280822785133f61dd82 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 14:52:16 +0200 Subject: [PATCH 37/58] fix: pc pmid model upgraded to big integer --- ebel/manager/rdbms/models/pathway_commons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebel/manager/rdbms/models/pathway_commons.py b/ebel/manager/rdbms/models/pathway_commons.py index 172b159..ef5207c 100644 --- a/ebel/manager/rdbms/models/pathway_commons.py +++ b/ebel/manager/rdbms/models/pathway_commons.py @@ -93,7 +93,7 @@ class Pmid(Base): __tablename__ = "pathway_commons_pmid" id: Mapped[int] = mapped_column(primary_key=True) - pmid: Mapped[int] = mapped_column(index=True) + pmid: Mapped[int] = mapped_column(BigInteger, index=True) pathway_commons_id: Mapped[int] = mapped_column(ForeignKey("pathway_commons.id"), index=True) pathway_commons: Mapped[List[PathwayCommons]] = relationship("PathwayCommons", back_populates="pmids") From 8ddd4927a6659cdede0db05d056eacd61d764786 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Mon, 25 Sep 2023 14:58:49 +0200 Subject: [PATCH 38/58] feat: update biogrid to use sqla2 stmts --- ebel/manager/orientdb/biodbs/biogrid.py | 81 ++++++++----------------- ebel/manager/rdbms/models/biogrid.py | 51 +++++++++++++++- 2 files changed, 75 insertions(+), 57 deletions(-) diff --git 
a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index b47cc88..789ed32 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select, func, cast, Integer +from sqlalchemy import select, func, cast, Integer from sqlalchemy.orm import aliased from tqdm import tqdm @@ -460,29 +460,31 @@ def _create_source_table(self, df: pd.DataFrame) -> pd.DataFrame: def get_uniprot_modification_pairs(self): """Return all UniProt modification pairs.""" - # TODO: sql as sqlalchemy query - sql = """Select - ia.symbol as subject_symbol, - ia.uniprot as subject_uniprot, - ia.taxonomy_id as subject_taxonomy_id, - ib.symbol as object_symbol, - ib.uniprot as object_uniprot, - ib.taxonomy_id as object_taxonomy_id - from - biogrid b - inner join biogrid_interactor ia on (b.biogrid_a_id=ia.biogrid_id) - inner join biogrid_interactor ib on (b.biogrid_b_id=ib.biogrid_id) - inner join biogrid_modification m on (m.id=b.modification_id) - where - m.modification != 'No Modification' and ia.uniprot IS NOT NULL and ib.uniprot IS NOT NULL - group by - subject_symbol, - subject_uniprot, - subject_taxonomy_id, - object_symbol, - object_uniprot, - object_taxonomy_id""" - results = self.session.execute(text(sql)).fetchall() + b = biogrid.Biogrid + ia = aliased(biogrid.Interactor) + ib = aliased(biogrid.Interactor) + m = biogrid.Modification + + sql = ( + ( + select( + ia.symbol.label("subject_symbol"), + ia.uniprot.label("subject_uniprot"), + ia.taxonomy_id.label("subject_taxonomy_id"), + ib.symbol.label("object_symbol"), + ib.uniprot.label("object_uniprot"), + ib.taxonomy_id.label("object_taxonomy_id"), + ) + .join(ia, b.biogrid_a_id == ia.biogrid_id) + .join(ib, b.biogrid_b_id == ib.biogrid_id) + .join(m, b.modification_id == m.id) + ) + .where(m.modification == "No Modification") + 
.where(ia.uniprot.isnot(None)) + .where(ib.uniprot.isnot(None)) + .group_by(ia.symbol, ia.uniprot, ia.taxonomy_id, ib.symbol, ib.uniprot, ib.taxonomy_id) + ) + results = self.session.execute(sql).fetchall() return [x._asdict() for x in results] def get_create_pure_protein_rid_by_uniprot(self, taxonomy_id, symbol, uniprot): @@ -601,34 +603,3 @@ def update_interactions(self) -> int: ) counter += 1 return counter - - def create_view(self): - """Create SQL view of BioGRID data.""" - sql = """create view if not exists biogrid_view as - select - b.biogrid_id, - ia.symbol as symbol_a, - ia.uniprot as uniprot_a, - ta.taxonomy_id as tax_id_a, - ta.organism_name as organism_a, - ib.symbol as symbol_b, - ib.uniprot as uniprot_b, - tb.taxonomy_id as tax_id_b, - tb.organism_name as organism_b, - es.experimental_system, - m.modification, - s.source, - b.qualification, - p.source as publication_source, - p.source_identifier as publication_identifier - from - biogrid b inner join - biogrid_interactor ia on (ia.biogrid_id=b.biogrid_a_id) inner join - biogrid_interactor ib on (ib.biogrid_id=b.biogrid_b_id) inner join - biogrid_taxonomy ta on (ia.taxonomy_id=ta.taxonomy_id) inner join - biogrid_taxonomy tb on (ib.taxonomy_id=tb.taxonomy_id) left join - biogrid_experimental_system es on (b.experimental_system_id=es.id) left join - biogrid_modification m on (m.id=b.modification_id) left join - biogrid_source s on (s.id=b.source_id) left join - biogrid_publication p on (p.id=b.publication_id)""" - self.session.execute(text(sql)) diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index f5cec5e..58def36 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -1,7 +1,8 @@ """BioGRID RDBMS model definition.""" -from sqlalchemy import Float, ForeignKey, Integer, String, Text +from sqlalchemy import Float, ForeignKey, Integer, String, Text, select from sqlalchemy.ext.declarative import declarative_base -from 
sqlalchemy.orm import Mapped, mapped_column, relationship +from sqlalchemy.orm import Mapped, mapped_column, relationship, aliased +from sqlalchemy_utils import create_view from ebel.manager.rdbms.models import object_as_dict @@ -154,3 +155,49 @@ class Modification(Base): def as_dict(self): """Convert object values to dictionary.""" return object_as_dict(self, exclude=["id"]) + + +class BiogridView(Base): + """SQL view for Biogrid.""" + + b = Biogrid + ia = aliased(Interactor) + ib = aliased(Interactor) + m = Modification + p = Publication + es = ExperimentalSystem + ta = aliased(Taxonomy) + tb = aliased(Taxonomy) + s = Source + + stmt = ( + select( + b.biogrid_id, + ia.symbol.label("symbol_a"), + ia.uniprot.label("uniprot_a"), + ta.taxonomy_id.label("tax_id_a"), + ta.organism_name.label("organism_a"), + ib.symbol.label("symbol_b"), + ib.uniprot.label("uniprot_b"), + tb.taxonomy_id.label("tax_id_b"), + tb.organism_name.label("organism_b"), + es.experimental_system, + m.modification, + s.source, + b.qualification, + p.source.label("publication_source"), + p.source_identifier.label("publication_identifier"), + ) + .join(ia, b.biogrid_a_id == ia.biogrid_id) + .join(ib, b.biogrid_b_id == ib.biogrid_id) + .join(ta, ia.taxonomy_id == ta.taxonomy_id) + .join(tb, ib.taxonomy_id == tb.taxonomy_id) + .join(es, b.experimental_system_id == es.id, isouter=True) + .join(m, m.id == b.modification_id, isouter=True) + .join(s, b.source_id == s.id, isouter=True) + .join(p, b.publication_id == p.id, isouter=True) + ) + + view = create_view(name="biogrid_view", selectable=stmt, metadata=Base.metadata) + + __table__ = view From 3b14d3a7c51b1bc2da00cf10d14e0e0bf6847b19 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Tue, 26 Sep 2023 09:47:47 +0200 Subject: [PATCH 39/58] feat: update remaining biodbs to sqla2 select stmts --- ebel/manager/orientdb/biodbs/chebi.py | 2 +- .../orientdb/biodbs/clinical_trials.py | 2 +- ebel/manager/orientdb/biodbs/clinvar.py | 64 ++++++++++++----- 
ebel/manager/orientdb/biodbs/disgenet.py | 64 ++++++++++------- .../orientdb/biodbs/expression_atlas.py | 2 +- ebel/manager/orientdb/biodbs/gwas_catalog.py | 4 +- ebel/manager/orientdb/biodbs/hgnc.py | 17 +++-- ebel/manager/orientdb/biodbs/intact.py | 37 ++-------- ebel/manager/orientdb/biodbs/kegg.py | 69 +++++++++++++------ ebel/manager/orientdb/biodbs/mirtarbase.py | 13 ---- ebel/manager/orientdb/biodbs/ncbi.py | 1 + ebel/manager/orientdb/biodbs/nsides.py | 64 ++++++++++------- ebel/manager/orientdb/biodbs/uniprot.py | 27 +++++--- ebel/tools.py | 3 +- 14 files changed, 215 insertions(+), 154 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/chebi.py b/ebel/manager/orientdb/biodbs/chebi.py index 767f4c6..3ace1cb 100644 --- a/ebel/manager/orientdb/biodbs/chebi.py +++ b/ebel/manager/orientdb/biodbs/chebi.py @@ -104,7 +104,7 @@ def insert_data(self) -> Dict[str, int]: .reset_index() ) - df.to_sql(table_name, self.engine, index=False, if_exists="replace") + df.to_sql(table_name, self.engine, index=False, if_exists="append") inserted[table_name] += df.shape[0] self.session.commit() diff --git a/ebel/manager/orientdb/biodbs/clinical_trials.py b/ebel/manager/orientdb/biodbs/clinical_trials.py index 008c567..148eba5 100644 --- a/ebel/manager/orientdb/biodbs/clinical_trials.py +++ b/ebel/manager/orientdb/biodbs/clinical_trials.py @@ -47,7 +47,7 @@ def __contains__(self, item): def add_link_to_drugbank(self, data_dict: dict, trial_rid: str): """Create LINKSET in drugbank table for associated clinical trials.""" - # Can't check synonyms untils OrientDB 3.0, need to be able to index on collections + # Can't check synonyms until OrientDB 3.0, need to be able to index on collections # update_sql = 'UPDATE drugbank ADD clinical_trials = {} WHERE name = "{}" OR "{}" in synonyms' # TODO index drugbank.synonyms diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index 56cca96..f19b510 100644 --- 
a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -5,7 +5,7 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls @@ -166,25 +166,26 @@ def get_disease_snps_dict(self) -> Dict[str, List[Snp]]: """Get a dictionary {'disease':[snp,snp,... ]} by disease names.""" disease_keywords = get_disease_trait_keywords_from_config() - sql_temp = """Select - '{keyword}', - phenotype, - rs_db_snp as rs_number, - hgnc_id, - chromosome, - start as position, - clinical_significance - from clinvar c inner join - clinvar__phenotype cp on (c.id=cp.clinvar_id) inner JOIN - clinvar_phenotype p on (cp.clinvar_phenotype_id=p.id) - where - p.phenotype like '%%{keyword}%%' - and rs_db_snp != -1""" + cv = clinvar.Clinvar + cp = clinvar.ClinvarPhenotype results = dict() for kwd in disease_keywords: - sql = sql_temp.format(keyword=kwd) - rows = self.session.execute(text(sql)) + sql = ( + select( + cp.phenotype, + cv.rs_db_snp.label("rs_number"), + cv.hgnc_id, + cv.chromosome, + cv.start.label("position"), + cv.clinical_significance, + ) + .join(cv, cp.clinvars) + .where(cv.rs_db_snp != -1) + .where(cp.phenotype.like(f"%{kwd}%")) + ) + print(sql) + rows = self.session.execute(sql) results[kwd] = [Snp(*x) for x in rows.fetchall()] return results @@ -205,6 +206,7 @@ def update_interactions(self) -> int: for snp in tqdm(rows, desc=f"Add has_X_snp_cv edges to BEL for {disease}"): if snp.hgnc_id in hgnc_id_gene_rid_cache: gene_mapped_rid = hgnc_id_gene_rid_cache[snp.hgnc_id] + else: gene_mapped_rid = self._get_set_gene_rid(hgnc_id=snp.hgnc_id) hgnc_id_gene_rid_cache[snp.hgnc_id] = gene_mapped_rid @@ -214,7 +216,7 @@ def update_interactions(self) -> int: value_dict = { "clinical_significance": snp.clinical_significance, "phenotype": snp.phenotype, - "keyword": snp.keyword, + "keyword": disease, } 
self.create_edge( class_name="has_mapped_snp_cv", @@ -262,3 +264,29 @@ def _get_set_gene_rid(self, hgnc_id: str): gene_rid = self.get_create_rid("gene", data, check_for="bel") return gene_rid + + +if __name__ == "__main__": + c = ClinVar() + cv = clinvar.Clinvar + cp = clinvar.ClinvarPhenotype + kwd = "Depression" + + sql = ( + select( + cp.phenotype, + cv.rs_db_snp.label("rs_number"), + cv.hgnc_id, + cv.chromosome, + cv.start.label("position"), + cv.clinical_significance, + ) + .join(cp, cv.phenotypes) + .where(cv.rs_db_snp != -1) + .where(cp.phenotype.like(f"%{kwd}%")) + ) + rows = c.session.execute(sql) + amt = 0 + for x in rows: + amt += 1 + print(amt) diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index c052cc7..a814d25 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -183,35 +183,47 @@ def update_snps(self) -> int: "downstream": "upstream", "upstream": "downstream", } - # TODO: replace SQL with SQL Alchemy statement - sql_temp = """Select - snp_id, - chromosome, - position, - disease_name, - pmid, - score, - source - FROM - disgenet_variant v INNER JOIN - disgenet_source s on (v.source_id=s.id) INNER JOIN - disgenet_disease d on (v.disease_id=d.disease_id) - WHERE - disease_name like '%%{}%%' and - source!='BEFREE' - GROUP BY - snp_id, - chromosome, - position, - disease_name, - pmid, - score, - source""" + # # TODO: replace SQL with SQL Alchemy statement + # sql_temp = """Select + # snp_id, + # chromosome, + # position, + # disease_name, + # pmid, + # score, + # source + # FROM + # disgenet_variant v INNER JOIN + # disgenet_source s on (v.source_id=s.id) INNER JOIN + # disgenet_disease d on (v.disease_id=d.disease_id) + # WHERE + # disease_name like '%%{}%%' and + # source!='BEFREE' + # GROUP BY + # snp_id, + # chromosome, + # position, + # disease_name, + # pmid, + # score, + # source""" + + dv = disgenet.DisgenetVariant + ds = disgenet.DisgenetSource + 
dd = disgenet.DisgenetDisease results = dict() for kwd in self.disease_keywords: - sql = sql_temp.format(kwd) - rows = self.session.execute(text(sql)) + sql = ( + select(dv.snp_id, dv.chromosome, dv.position, dd.disease_name, dv.pmid, dv.score, ds.source) + .join(ds) + .join(dd) + .where(dd.disease_name.like(f"%{kwd}%")) + .where(ds.source != "BEFREE") + .group_by(dv.snp_id, dv.chromosome, dv.position, dd.disease_name, dv.pmid, dv.score, ds.source) + ) + + rows = self.session.execute(sql) results[kwd] = rows inserted = 0 diff --git a/ebel/manager/orientdb/biodbs/expression_atlas.py b/ebel/manager/orientdb/biodbs/expression_atlas.py index cdafe88..8fd8fd9 100644 --- a/ebel/manager/orientdb/biodbs/expression_atlas.py +++ b/ebel/manager/orientdb/biodbs/expression_atlas.py @@ -341,7 +341,7 @@ def get_idf(self, experiment_name: str) -> Optional[pd.DataFrame]: values = [x.strip() for x in line_splitted[1:] if x.strip()] rows.append((key_name, values)) - df = pd.DataFrame(rows, columns=("key_name", "value")).explode("value") + df = pd.DataFrame(rows, columns=["key_name", "value"]).explode("value") return df def get_sdrf_condensed(self, experiment_name: str) -> Optional[pd.DataFrame]: diff --git a/ebel/manager/orientdb/biodbs/gwas_catalog.py b/ebel/manager/orientdb/biodbs/gwas_catalog.py index ac44b79..414ee8d 100644 --- a/ebel/manager/orientdb/biodbs/gwas_catalog.py +++ b/ebel/manager/orientdb/biodbs/gwas_catalog.py @@ -79,7 +79,7 @@ def insert_data(self) -> Dict[str, int]: table_name = gwas_catalog.GwasCatalog.__tablename__ - df[columns_main_table].to_sql(table_name, self.engine, if_exists="replace") + df[columns_main_table].to_sql(table_name, self.engine, if_exists="append") df.snp_gene_ids = df.snp_gene_ids.str.strip().str.split(", ") df[table_name + "_id"] = df.index @@ -89,7 +89,7 @@ def insert_data(self) -> Dict[str, int]: df_snp_gene_ids.index = range(1, df_snp_gene_ids.shape[0] + 1) df_snp_gene_ids.rename(columns={"snp_gene_ids": "ensembl_identifier"}, 
inplace=True) df_snp_gene_ids.index.rename("id", inplace=True) - df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists="replace") + df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists="append") self.session.commit() diff --git a/ebel/manager/orientdb/biodbs/hgnc.py b/ebel/manager/orientdb/biodbs/hgnc.py index f16e0b7..0e505e0 100644 --- a/ebel/manager/orientdb/biodbs/hgnc.py +++ b/ebel/manager/orientdb/biodbs/hgnc.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from pyorientdb import OrientDB +from pyorientdb import OrientDB, OrientRecord from sqlalchemy import select from tqdm import tqdm @@ -267,26 +267,29 @@ def get_location(location: str) -> dict: location_dict = {"unknown_schema": location} return location_dict - def get_bel_symbols_without_hgnc_link(self): + def get_bel_symbols_without_hgnc_link(self) -> set: """Return set of all gene symbols in database without a link to HGNC.""" sql_symbols = "Select distinct(name) as symbol from bio_object where namespace='HGNC' and hgnc IS NULL" return {x.oRecordData["symbol"] for x in self.execute(sql_symbols)} - def get_bel_symbols_all(self): + def get_bel_symbols_all(self) -> set: """Return set of all gene symbols in database.""" sql_symbols = "Select distinct(name) as symbol from bio_object where namespace='HGNC'" return {x.oRecordData["symbol"] for x in self.execute(sql_symbols)} - def get_correct_symbol(self, symbol: str): + def get_correct_symbol(self, symbol: str) -> str: """Checks if symbol is valid otherwise checks previsous symbols.""" symbol_query = select(HgncDb).where(HgncDb.symbol == symbol) result_in_symbol = self.session.execute(symbol_query).first() if not result_in_symbol: result_in_prev_symbol = self.session.query(PrevSymbol).filter(PrevSymbol.prev_symbol == symbol).first() + if result_in_prev_symbol: symbol = result_in_prev_symbol.hgnc.symbol + else: symbol = None + return symbol def correct_wrong_symbol(self, symbol, 
bel_symbols_all: set): @@ -337,7 +340,7 @@ def update_gene( location: str, hgnc_symbol: str, suggested_corrections: str, - ) -> int: + ) -> OrientRecord: """Update genes in OrientDB and returns number of updates.""" suggest = ( ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) @@ -358,7 +361,7 @@ def update_gene( ) return self.execute(sql)[0] - def update_rna(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> int: + def update_rna(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> OrientRecord: """Update RNAs in OrientDB and returns number of updates.""" suggest = ( ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) @@ -375,7 +378,7 @@ def update_rna(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corr ) return self.execute(sql)[0] - def update_protein(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> int: + def update_protein(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> OrientRecord: """Update proteins in OrientDB and returns number of updates.""" suggest = ( ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index c3c4959..d982fc7 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -154,46 +154,24 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: result = self.session.execute(sql).fetchone() taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} + if result: name, taxid = result namespace = taxid_to_namespace.get(taxid, "UNIPROT") return_value = (namespace, name) + else: if self.session.query(uniprot.Uniprot).filter(uniprot.Uniprot.accession == uniprot_accession).first(): return_value = ("UNIPROT", uniprot_accession) + return return_value def update_interactions(self) -> 
int: """Update intact interactions to graph.""" - uniprot = UniProt(self.client) - uniprot.update() - - uniprot_rid_dict = uniprot.get_pure_uniprot_rid_dict_in_bel_context() - - # sql_temp = """SELECT - # int_a_uniprot_id, - # int_b_uniprot_id, - # pmid, - # interaction_ids, - # interaction_type, - # interaction_type_psimi_id, - # detection_method, - # detection_method_psimi_id, - # confidence_value - # FROM - # intact - # WHERE - # int_a_uniprot_id = '{uniprot_accession}' or int_b_uniprot_id = '{uniprot_accession}' - # GROUP BY - # int_a_uniprot_id, - # int_b_uniprot_id, - # pmid, - # interaction_ids, - # interaction_type, - # interaction_type_psimi_id, - # detection_method, - # detection_method_psimi_id, - # confidence_value""" + up = UniProt(self.client) + up.update() + + uniprot_rid_dict = up.get_pure_uniprot_rid_dict_in_bel_context() updated = 0 @@ -201,7 +179,6 @@ def update_interactions(self) -> int: it = intact.Intact for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): - # sql = sql_temp.format(uniprot_accession=uniprot_accession) sql = ( select( it.int_a_uniprot_id, diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index 10ad6f9..7799357 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -9,6 +9,7 @@ import pandas as pd import requests from pyorientdb import OrientDB +from sqlalchemy import select, or_ from tqdm import tqdm from ebel.config import get_config_value @@ -281,29 +282,53 @@ def update_interactions(self) -> int: species_ids = ",".join([f"'{x}'" for x in self.species]) - sql_temp = f"""Select - interaction_type, - pathway_identifier, - pathway_name, - gene_symbol_a, - gene_symbol_b, - kegg_species_id - from - kegg - where - (gene_symbol_a='{{symbol}}' or gene_symbol_a='{{symbol}}') and - kegg_species_id in ({species_ids}) and - interaction_type in ({{interaction_types}}) - group by - interaction_type, - pathway_identifier, - 
pathway_name, - gene_symbol_a, - gene_symbol_b, - kegg_species_id""" - + # sql_temp = f"""Select + # interaction_type, + # pathway_identifier, + # pathway_name, + # gene_symbol_a, + # gene_symbol_b, + # kegg_species_id + # from + # kegg + # where + # (gene_symbol_a='{{symbol}}' or gene_symbol_a='{{symbol}}') and + # kegg_species_id in ({species_ids}) and + # interaction_type in ({{interaction_types}}) + # group by + # interaction_type, + # pathway_identifier, + # pathway_name, + # gene_symbol_a, + # gene_symbol_b, + # kegg_species_id""" + + kg = kegg.Kegg for symbol, rid in tqdm(symbol_rids_dict.items(), desc="Update KEGG posttranslational modifications"): - sql = sql_temp.format(symbol=symbol, interaction_types=post_translational_modifications) + # sql = sql_temp.format(symbol=symbol, interaction_types=post_translational_modifications) + + sql = ( + select( + kg.interaction_type, + kg.pathway_identifier, + kg.pathway_name, + kg.gene_symbol_a, + kg.gene_symbol_b, + kg.kegg_species_id, + ) + .where(or_(kg.gene_symbol_a == symbol, kg.gene_symbol_b == symbol)) + .where(kg.kegg_species_id.in_(species_ids)) + .where(kg.interaction_type.in_(post_translational_modifications)) + .group_by( + kg.interaction_type, + kg.pathway_identifier, + kg.pathway_name, + kg.gene_symbol_a, + kg.gene_symbol_b, + kg.kegg_species_id, + ) + ) + df = pd.read_sql(sql, self.engine) keys = ( "interaction_type", diff --git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index 68ace6d..f3c03bc 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -58,19 +58,6 @@ def update_interactions(self) -> int: self.clear_edges() df_symbol_rid = self.get_pure_symbol_rid_df_in_bel_context(class_name="rna", namespace="HGNC") - # sql = f"""Select - # mi_rna, - # target_gene as symbol, - # support_type, - # references_pmid as pmid, - # experiments - # from - # {mirtarbase.Mirtarbase.__tablename__} - # where - # 
species_mi_rna='Homo sapiens' and - # species_target_gene='Homo sapiens' and - # support_type in ()""" - mtb = mirtarbase.Mirtarbase sql = ( select( diff --git a/ebel/manager/orientdb/biodbs/ncbi.py b/ebel/manager/orientdb/biodbs/ncbi.py index c395fc9..a629731 100644 --- a/ebel/manager/orientdb/biodbs/ncbi.py +++ b/ebel/manager/orientdb/biodbs/ncbi.py @@ -274,6 +274,7 @@ def _insert_info(self, chunksize: int = 1000000) -> int: "type_of_gene", "dbXrefs", } + for df in tqdm( pd.read_csv( file_path, diff --git a/ebel/manager/orientdb/biodbs/nsides.py b/ebel/manager/orientdb/biodbs/nsides.py index f9cb0a7..b9a1f43 100644 --- a/ebel/manager/orientdb/biodbs/nsides.py +++ b/ebel/manager/orientdb/biodbs/nsides.py @@ -7,13 +7,13 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select, or_ from tqdm import tqdm from ebel.constants import RID from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import OFFSIDES, ONSIDES -from ebel.manager.rdbms.models import nsides +from ebel.manager.rdbms.models import nsides, drugbank from ebel.tools import get_file_path logger = logging.getLogger(__name__) @@ -143,25 +143,25 @@ def update_bel(self) -> int: self.delete_nodes_with_no_edges("side_effect") self.delete_nodes_with_no_edges("drug") - # TODO: Translate to sqlalchemy query - sql_temp = """Select - o.condition_meddra_id, - o.condition_concept_name, - o.prr, - o.mean_reporting_frequency - from - drugbank as d inner join - drugbank_external_identifier as dei on (d.id=dei.drugbank_id) inner join - nsides as o on (dei.identifier=o.drug_rxnorn_id) - where - d.drugbank_id='{}' and resource='RxCUI' - and (mean_reporting_frequency>=0.01 OR mean_reporting_frequency is NULL) - group by - o.condition_meddra_id, - o.condition_concept_name, - o.prr, - o.mean_reporting_frequency - """ + # # TODO: Translate to sqlalchemy query + # sql_temp = """Select + # o.condition_meddra_id, + # 
o.condition_concept_name, + # o.prr, + # o.mean_reporting_frequency + # from + # drugbank as d inner join + # drugbank_external_identifier as dei on (d.id=dei.drugbank_id) inner join + # nsides as o on (dei.identifier=o.drug_rxnorn_id) + # where + # d.drugbank_id='{}' and resource='RxCUI' + # and (mean_reporting_frequency>=0.01 OR mean_reporting_frequency is NULL) + # group by + # o.condition_meddra_id, + # o.condition_concept_name, + # o.prr, + # o.mean_reporting_frequency + # """ drugbank_ids = self.query_class("drug", columns=["drugbank_id"], drugbank_id="notnull") drugbank_id_rids = {d["drugbank_id"]: d[RID] for d in drugbank_ids} @@ -171,9 +171,27 @@ def update_bel(self) -> int: updated = 0 + d = drugbank.Drugbank + dei = drugbank.ExternalIdentifier + o = nsides.Nsides + for drugbank_id, drugbank_rid in tqdm(drugbank_id_rids.items(), desc=f"Update {self.biodb_name.upper()}"): - sql = sql_temp.format(drugbank_id) - for r in self.session.execute(text(sql)): + # sql = sql_temp.format(drugbank_id) + sql = ( + ( + select(o.condition_meddra_id, o.condition_concept_name, o.prr, o.mean_reporting_frequency) + .join(dei, dei.identifier == o.drug_rxnorn_id) + .join(d, d.id == dei.drugbank_id) + ) + .where(d.drugbank_id == drugbank_id) + .where(dei.resource == "RxCUI") + .where(or_(o.mean_reporting_frequency >= 0.01, o.mean_reporting_frequency.is_(None))) + .group_by(o.condition_meddra_id, o.condition_concept_name, o.prr, o.mean_reporting_frequency) + ) + + results = self.session.execute(sql) + + for r in results: ( condition_meddra_id, condition_concept_name, diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index 1bdb0be..06dcbcf 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -10,7 +10,7 @@ import pandas as pd from lxml.etree import iterparse from pyorientdb import OrientDB -from sqlalchemy import text +from sqlalchemy import text, select from tqdm import tqdm from 
ebel.defaults import default_tax_ids @@ -309,11 +309,17 @@ def _get_accesssion_recname(self, taxid, gene_symbol) -> Union[Tuple[str, str], """ # TODO: This is in general a dangerous method because it selects the first accession number, but there could # be more than one + # sql = ( + # f"Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs " + # f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' + # ) sql = ( - f"Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs " - f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' + select(up.Uniprot.accession, up.Uniprot.recommended_name) + .join(up.GeneSymbol) + .where(up.Uniprot.taxid == taxid) + .where(up.GeneSymbol.symbol == gene_symbol) ) - results = self.session.execute(text(sql)) + results = self.session.execute(sql) return results.fetchone() if results else None def _update_proteins(self, namespace, taxid) -> int: @@ -339,33 +345,36 @@ def _update_proteins(self, namespace, taxid) -> int: def _get_recname_taxid_by_accession_from_uniprot_api(self, accession) -> Tuple[str, int]: """Fetch uniprot entry by accession and adds to the database. Returns recommended name.""" - sql = f"Select recommended_name,taxid from uniprot where accession='{accession}' limit 1" - result = self.session.execute(text(sql)).fetchone() + # sql = f"Select recommended_name,taxid from uniprot where accession='{accession}' limit 1" + sql = select(up.Uniprot.recommended_name, up.Uniprot.taxid).where(up.Uniprot.accession == accession) + result = self.session.execute(sql).fetchone() if result: return result def _update_uniprot_proteins(self) -> int: """Update all proteins using UNIPROT as namespace. 
Returns number of updated proteins.""" updated = 0 - sql_temp = "Select recommended_name, taxid from uniprot where accession='{}' limit 1" + sql_uniprot = 'SELECT distinct(name) as accession from protein WHERE namespace="UNIPROT"' sql_update = ( 'Update protein set uniprot = name, label = "{}", species = {} ' 'where namespace = "UNIPROT" and name = "{}"' ) + for protein in self.query(sql_uniprot).itertuples(index=False): - sql = sql_temp.format(protein.accession) - found = self.session.execute(text(sql)).fetchone() + found = self._get_recname_taxid_by_accession_from_uniprot_api(protein.accession) if found: recommended_name, taxid = found num_updated = self.execute(sql_update.format(recommended_name, taxid, protein.accession))[0] updated += num_updated + else: recname_taxid = self._get_recname_taxid_by_accession_from_uniprot_api(protein.accession) if recname_taxid: recommended_name, taxid = recname_taxid num_updated = self.execute(sql_update.format(recommended_name, taxid, protein.accession))[0] updated += num_updated + return updated def __read_linked_tables( diff --git a/ebel/tools.py b/ebel/tools.py index 62fba6f..309a57d 100644 --- a/ebel/tools.py +++ b/ebel/tools.py @@ -5,6 +5,7 @@ import os.path import re import shutil +from os import PathLike from types import GeneratorType from typing import Iterable, List, Union @@ -93,7 +94,7 @@ def md5(file_path): return hash_md5.hexdigest() -def get_file_path(url: str, biodb: str): +def get_file_path(url: str, biodb: str) -> str: """Get standard file path by file_name and DATADIR.""" file_name = os.path.basename(url) bio_db_dir = os.path.join(DATA_DIR, biodb) From 770d9a83efbd50fe39c242e9511c675c0e860a11 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Tue, 26 Sep 2023 20:55:41 +0200 Subject: [PATCH 40/58] fix: remaining problems with sqla2 updates --- ebel/manager/orientdb/biodbs/biogrid.py | 63 ++++++++-------- ebel/manager/orientdb/biodbs/clinvar.py | 31 ++------ ebel/manager/orientdb/biodbs/disgenet.py | 5 ++ 
ebel/manager/orientdb/biodbs/kegg.py | 13 ++-- ebel/manager/orientdb/biodbs/stringdb.py | 12 +++- ebel/manager/orientdb/odb_meta.py | 92 +++++++++++++++--------- ebel/manager/orientdb/odb_structure.py | 4 +- ebel/manager/rdbms/models/iuphar.py | 8 +-- 8 files changed, 124 insertions(+), 104 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index 789ed32..2e97b11 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -475,13 +475,13 @@ def get_uniprot_modification_pairs(self): ib.uniprot.label("object_uniprot"), ib.taxonomy_id.label("object_taxonomy_id"), ) + .select_from(b) .join(ia, b.biogrid_a_id == ia.biogrid_id) .join(ib, b.biogrid_b_id == ib.biogrid_id) .join(m, b.modification_id == m.id) ) .where(m.modification == "No Modification") .where(ia.uniprot.isnot(None)) - .where(ib.uniprot.isnot(None)) .group_by(ia.symbol, ia.uniprot, ia.taxonomy_id, ib.symbol, ib.uniprot, ib.taxonomy_id) ) results = self.session.execute(sql).fetchall() @@ -559,7 +559,7 @@ def update_interactions(self) -> int: func.group_concat( if_func(p.source == "PUBMED", cast(p.source_identifier, Integer), None).distinct() ).label("pmids"), - func.count(p.source_identifier).label("num_pubs"), + func.count(p.source_identifier.distinct()).label("num_pubs"), func.group_concat( if_func(p.source == "DOI", cast(p.source_identifier, Integer), None).distinct() ).label("dois"), @@ -574,32 +574,35 @@ def update_interactions(self) -> int: .where(m.modification != "No Modification") ) - for row in self.session.execute(sql).fetchall(): - row_dict = row._asdict() - be = BioGridEdge(subject_rid=subj_pure_rid, object_rid=obj_pure_rid, **row_dict) - edge_value_dict = be.get_edge_value_dict() - - if be.modConfig.bg_mod_name == "Proteolytic Processing": - self.create_edge( - "decreases_bg", - from_rid=subj_pure_rid, - to_rid=obj_pure_rid, - value_dict=edge_value_dict, - ) - counter += 1 - else: - 
obj_pmod_value_dict = be.obj.get_pmod_protein_as_value_dict() - pmod_protein_rid = self.node_exists("protein", obj_pmod_value_dict, check_for="bel") - if not pmod_protein_rid: - pmod_protein_rid = self.get_create_rid("protein", obj_pmod_value_dict, check_for="bel") - self.create_edge("has_modified_protein", obj_pure_rid, pmod_protein_rid) - pmod_rid = self.insert_record("pmod", be.get_pmod_as_value_dict()) - self.create_edge("has__pmod", pmod_protein_rid, pmod_rid) - self.create_edge( - be.edge_name, - subj_pure_rid, - pmod_protein_rid, - edge_value_dict, - ) - counter += 1 + results = self.session.execute(sql).fetchall() + + for row in results: + if row[0] is not None: # No results for uniprot ID combo + row_dict = row._asdict() # If no modification then no results were returned + be = BioGridEdge(subject_rid=subj_pure_rid, object_rid=obj_pure_rid, **row_dict) + edge_value_dict = be.get_edge_value_dict() + + if be.modConfig.bg_mod_name == "Proteolytic Processing": + self.create_edge( + "decreases_bg", + from_rid=subj_pure_rid, + to_rid=obj_pure_rid, + value_dict=edge_value_dict, + ) + counter += 1 + else: + obj_pmod_value_dict = be.obj.get_pmod_protein_as_value_dict() + pmod_protein_rid = self.node_exists("protein", obj_pmod_value_dict, check_for="bel") + if not pmod_protein_rid: + pmod_protein_rid = self.get_create_rid("protein", obj_pmod_value_dict, check_for="bel") + self.create_edge("has_modified_protein", obj_pure_rid, pmod_protein_rid) + pmod_rid = self.insert_record("pmod", be.get_pmod_as_value_dict()) + self.create_edge("has__pmod", pmod_protein_rid, pmod_rid) + self.create_edge( + be.edge_name, + subj_pure_rid, + pmod_protein_rid, + edge_value_dict, + ) + counter += 1 return counter diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index f19b510..2024124 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -19,7 +19,6 @@ Snp = namedtuple( "Snp", ( - "keyword", 
"phenotype", "rs_number", "hgnc_id", @@ -180,13 +179,12 @@ def get_disease_snps_dict(self) -> Dict[str, List[Snp]]: cv.start.label("position"), cv.clinical_significance, ) - .join(cv, cp.clinvars) + .join(cp, cv.phenotypes) .where(cv.rs_db_snp != -1) .where(cp.phenotype.like(f"%{kwd}%")) ) - print(sql) - rows = self.session.execute(sql) - results[kwd] = [Snp(*x) for x in rows.fetchall()] + rows = self.session.execute(sql).fetchall() + results[kwd] = [Snp(*x) for x in rows] return results @@ -268,25 +266,4 @@ def _get_set_gene_rid(self, hgnc_id: str): if __name__ == "__main__": c = ClinVar() - cv = clinvar.Clinvar - cp = clinvar.ClinvarPhenotype - kwd = "Depression" - - sql = ( - select( - cp.phenotype, - cv.rs_db_snp.label("rs_number"), - cv.hgnc_id, - cv.chromosome, - cv.start.label("position"), - cv.clinical_significance, - ) - .join(cp, cv.phenotypes) - .where(cv.rs_db_snp != -1) - .where(cp.phenotype.like(f"%{kwd}%")) - ) - rows = c.session.execute(sql) - amt = 0 - for x in rows: - amt += 1 - print(amt) + c.update() diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index a814d25..c903013 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -265,3 +265,8 @@ def update_snps(self) -> int: inserted += 1 return inserted + + +if __name__ == "__main__": + d = DisGeNet() + d.update() diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index 7799357..f75c91f 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -278,9 +278,9 @@ def update_interactions(self) -> int: "phosphorylation": ("pho", "increases", BelPmod.PHO), "ubiquitination": ("ubi", "increases", BelPmod.UBI), } - post_translational_modifications = ",".join([f"'{x}'" for x in pmods.keys()]) + # post_translational_modifications = ",".join([f"'{x}'" for x in pmods.keys()]) - species_ids = ",".join([f"'{x}'" for x in self.species]) + # 
species_ids = ",".join([f"'{x}'" for x in self.species]) # sql_temp = f"""Select # interaction_type, @@ -317,8 +317,8 @@ def update_interactions(self) -> int: kg.kegg_species_id, ) .where(or_(kg.gene_symbol_a == symbol, kg.gene_symbol_b == symbol)) - .where(kg.kegg_species_id.in_(species_ids)) - .where(kg.interaction_type.in_(post_translational_modifications)) + .where(kg.kegg_species_id.in_(self.species)) + .where(kg.interaction_type.in_(list(pmods.keys()))) .group_by( kg.interaction_type, kg.pathway_identifier, @@ -387,3 +387,8 @@ def update_interactions(self) -> int: self.hgnc.update_bel() return inserted + + +if __name__ == "__main__": + k = Kegg() + k.update() diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 83380a5..7adccd0 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -160,9 +160,15 @@ def insert_action_data(self) -> int: def get_stringdb_action_hgnc_set(self): """Get unique HGNC symbols from stringdb_actions table.""" - sql = f"""(Select distinct( symbol1 ) from {self.table_action}) - union (Select distinct( symbol2 ) from {self.table_action})""" - return set([x[0] for x in self.session.execute(text(sql)).fetchall()]) + # sql = f"""(Select distinct( symbol1 ) from {self.table_action}) + # union (Select distinct( symbol2 ) from {self.table_action})""" + + stmt1 = select(stringdb.StringDbAction.symbol1).distinct() + stmt2 = select(stringdb.StringDbAction.symbol2).distinct() + sql = stmt1.union(stmt2).alias("combined") + print(sql) + + return set([x[0] for x in self.session.execute(sql).fetchall()]) def update_interactions(self) -> Dict[str, int]: """Update the edges with StringDB metadata.""" diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 0501be6..e6f12e4 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -23,7 +23,7 @@ from pyorientdb import OrientDB, orient from 
pyorientdb.exceptions import PyOrientCommandException, PyOrientIndexException, PyOrientSecurityAccessException from pyorientdb.otypes import OrientRecord -from sqlalchemy import text +from sqlalchemy import text, select, func from sqlalchemy.sql.schema import Table from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm @@ -34,6 +34,7 @@ from ebel.config import get_config_as_dict, get_config_value, write_to_config from ebel.constants import DEFAULT_ODB, RID from ebel.manager.orientdb import urls as default_urls +from ebel.manager.rdbms.models.ensembl import Ensembl as ens from ebel.manager.orientdb.odb_structure import Edge, Generic, Node, OClass, OIndex, OProperty from ebel.tools import BelRdb, chunks, get_file_path, get_standard_name @@ -97,12 +98,13 @@ def __init__( conn = get_config_value("DATABASE", "sqlalchemy_connection_string") - if not (conn or database_exists(self.engine.url)): - if str(self.engine.url).startswith("mysql"): + if not conn: + dialect = self.session.bind.dialect.name + if dialect == "mysql": set_mysql_interactive() - else: - create_database(self.engine.url) + if not database_exists(self.engine.url): + create_database(self.engine.url) def __config_params_check(self, overwrite_config: bool = False): """Go through passed/available configuration params.""" @@ -819,8 +821,9 @@ def number_of_generics(self) -> Dict[str, int]: if self.tables_base: for table_name, table in self.tables_base.metadata.tables.items(): if self.table_exists(table_name): - sql = f"Select count(*) from `{table_name}`" - numbers[table_name] = self.session.execute(text(sql)).fetchone()[0] + # sql = f"Select count(*) from `{table_name}`" + sql = select(func.count(table_name)) + numbers[table_name] = self.session.execute(sql).fetchone()[0] else: numbers[table_name] = 0 elif self.generic_classes: @@ -1320,36 +1323,57 @@ def get_set_gene_rids_by_position( gene_rids = defaultdict(list) sqls = dict() - sqls[ - "mapped" - ] = f"""Select symbol - from 
ensembl - where - start < {position} and - stop > {position} and - chromosome='{chromosome}' group by symbol""" - - sqls[ - "downstream" - ] = f"""Select symbol - from ensembl - where - start > {position} and - chromosome='{chromosome}' - order by start limit 1""" - - sqls[ - "upstream" - ] = f"""Select symbol - from ensembl - where - stop < {position} and - chromosome='{chromosome}' - order by stop desc limit 1""" + # sqls[ + # "mapped" + # ] = f"""Select symbol + # from ensembl + # where + # start < {position} and + # stop > {position} and + # chromosome='{chromosome}' group by symbol""" + sqls["mapped"] = ( + select(ens.symbol) + .where(ens.start < position) + .where(ens.stop > position) + .where(ens.chromosome == chromosome) + .group_by(ens.symbol) + ) + + # sqls[ + # "downstream" + # ] = f"""Select symbol + # from ensembl + # where + # start > {position} and + # chromosome='{chromosome}' + # order by start limit 1""" + sqls["downstream"] = ( + select(ens.symbol) + .where(ens.start > position) + .where(ens.chromosome == chromosome) + .limit(1) + .order_by(ens.start.asc()) + ) + + # sqls[ + # "upstream" + # ] = f"""Select symbol + # from ensembl + # where + # stop < {position} and + # chromosome='{chromosome}' + # order by stop desc limit 1""" + sqls["upstream"] = ( + select(ens.symbol) + .where(ens.stop < position) + .where(ens.chromosome == chromosome) + .limit(1) + .order_by(ens.stop.desc()) + ) for gene_type, sql in sqls.items(): if gene_type in gene_types: - results = self.session.execute(text(sql)) + results = self.session.execute(sql) for (symbol,) in results.fetchall(): bel = f'g(HGNC:"{symbol}")' data = { diff --git a/ebel/manager/orientdb/odb_structure.py b/ebel/manager/orientdb/odb_structure.py index e4e830b..8e09ec0 100755 --- a/ebel/manager/orientdb/odb_structure.py +++ b/ebel/manager/orientdb/odb_structure.py @@ -462,8 +462,8 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd bel_indices = ( OIndex(bel, ("bel",), 
OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel, ("involved_genes",), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel, ("involved_other",), OIndexType.NOTUNIQUE_HASH_INDEX), + # OIndex(bel, ("involved_genes",), OIndexType.NOTUNIQUE_HASH_INDEX), + # OIndex(bel, ("involved_other",), OIndexType.NOTUNIQUE_HASH_INDEX), OIndex(bel_relation, ("evidence",), OIndexType.NOTUNIQUE_HASH_INDEX), OIndex(protein, ("uniprot",), OIndexType.NOTUNIQUE_HASH_INDEX), OIndex(bel_relation, ("annotation",), OIndexType.DICTIONARY), diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index 83adc97..57a8b36 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -63,12 +63,12 @@ class IupharInteraction(Base): target_uniprot: Mapped[Optional[str]] = mapped_column(String(100)) target_ensembl_gene_id: Mapped[Optional[str]] = mapped_column(String(200)) target_ligand: Mapped[Optional[str]] = mapped_column(String(100)) - target_ligand_id: Mapped[Optional[str]] = mapped_column() + target_ligand_id: Mapped[Optional[str]] = mapped_column(String(100)) target_ligand_subunit_ids: Mapped[Optional[str]] = mapped_column(Text) target_ligand_gene_symbol: Mapped[Optional[str]] = mapped_column(String(50)) target_ligand_uniprot_id: Mapped[Optional[str]] = mapped_column(String(200)) target_ligand_ensembl_gene_id: Mapped[Optional[str]] = mapped_column(String(50)) - target_ligand_pubchem_sid: Mapped[Optional[str]] = mapped_column() + target_ligand_pubchem_sid: Mapped[Optional[str]] = mapped_column(String(100)) target_species: Mapped[Optional[str]] = mapped_column(String(100)) ligand: Mapped[str] = mapped_column(String(255)) ligand_id: Mapped[int] = mapped_column(ForeignKey("iuphar_ligand.id"), index=True) @@ -78,8 +78,8 @@ class IupharInteraction(Base): ligand_pubchem_sid: Mapped[Optional[int]] = mapped_column() ligand_type: Mapped[str] = mapped_column(Text) approved: Mapped[bool] = mapped_column() - type: Mapped[str] = 
mapped_column(String(100)) - action: Mapped[str] = mapped_column(String(100)) + type: Mapped[Optional[str]] = mapped_column(String(100)) + action: Mapped[Optional[str]] = mapped_column(String(100)) action_comment: Mapped[Optional[str]] = mapped_column(String(255)) selectivity: Mapped[Optional[str]] = mapped_column(String(50)) endogenous: Mapped[bool] = mapped_column() From fec21f5c7d7af15bb8adaa030ccc114c74c843f2 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 07:48:20 +0200 Subject: [PATCH 41/58] chore: remove run blocks --- ebel/manager/orientdb/biodbs/clinvar.py | 5 ----- ebel/manager/orientdb/biodbs/disgenet.py | 5 ----- ebel/manager/orientdb/biodbs/kegg.py | 5 ----- ebel/manager/orientdb/biodbs/pathway_commons.py | 5 ----- 4 files changed, 20 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index 2024124..a85c5fc 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -262,8 +262,3 @@ def _get_set_gene_rid(self, hgnc_id: str): gene_rid = self.get_create_rid("gene", data, check_for="bel") return gene_rid - - -if __name__ == "__main__": - c = ClinVar() - c.update() diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index c903013..a814d25 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -265,8 +265,3 @@ def update_snps(self) -> int: inserted += 1 return inserted - - -if __name__ == "__main__": - d = DisGeNet() - d.update() diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index f75c91f..2ef9e3a 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -387,8 +387,3 @@ def update_interactions(self) -> int: self.hgnc.update_bel() return inserted - - -if __name__ == "__main__": - k = Kegg() - k.update() diff --git a/ebel/manager/orientdb/biodbs/pathway_commons.py 
b/ebel/manager/orientdb/biodbs/pathway_commons.py index c0ca6e9..1ff0649 100644 --- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -304,8 +304,3 @@ def get_pathway_pmids_sources(self, pc_id, pc_pathway_name_rid_dict) -> tuple: pmids = [x.pmid for x in pc_obj.pmids] pathways = [pc_pathway_name_rid_dict[x.name] for x in pc_obj.pathway_names] return pathways, pmids, sources - - -if __name__ == "__main__": - foo = PathwayCommons() - foo.update_interactions() From 128376ffcf8a53bc4f90c83ea541c1d693bb2e3b Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 08:26:47 +0200 Subject: [PATCH 42/58] fix: ncbi nullable props --- ebel/manager/orientdb/biodbs/disgenet.py | 4 ++-- ebel/manager/orientdb/biodbs/ncbi.py | 5 +++++ ebel/manager/rdbms/models/ncbi.py | 8 ++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index a814d25..a4843cc 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -223,7 +223,7 @@ def update_snps(self) -> int: .group_by(dv.snp_id, dv.chromosome, dv.position, dd.disease_name, dv.pmid, dv.score, ds.source) ) - rows = self.session.execute(sql) + rows = self.session.execute(sql).fetchall() results[kwd] = rows inserted = 0 @@ -234,7 +234,7 @@ def update_snps(self) -> int: for r in tqdm( kwd_disease_results, desc=f"Update DisGeNET variant interactions for {trait}", - total=kwd_disease_results.rowcount, + total=len(kwd_disease_results), ): snp_id, chromosome, position, disease_name, pmid, score, source = r diff --git a/ebel/manager/orientdb/biodbs/ncbi.py b/ebel/manager/orientdb/biodbs/ncbi.py index a629731..eb50d44 100644 --- a/ebel/manager/orientdb/biodbs/ncbi.py +++ b/ebel/manager/orientdb/biodbs/ncbi.py @@ -299,3 +299,8 @@ def _insert_info(self, chunksize: int = 1000000) -> int: def update_interactions(self) -> int: """Abstract 
method.""" pass + + +if __name__ == "__main__": + n = Ncbi() + n._insert_medgen() diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index caa04f7..fb0231c 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -180,10 +180,10 @@ class NcbiGeneEnsembl(Base): tax_id: Mapped[int] = mapped_column(index=True) gene_id: Mapped[int] = mapped_column(ForeignKey("ncbi_gene_info.gene_id")) ensembl_gene_identifier: Mapped[str] = mapped_column(String(100)) - rna_nucleotide_accession_version: Mapped[str] = mapped_column(String(100)) - ensembl_rna_identifier: Mapped[str] = mapped_column(String(100)) - protein_accession_version: Mapped[str] = mapped_column(String(100)) - ensembl_protein_identifier: Mapped[str] = mapped_column(String(100)) + rna_nucleotide_accession_version: Mapped[Optional[str]] = mapped_column(String(100)) + ensembl_rna_identifier: Mapped[Optional[str]] = mapped_column(String(100)) + protein_accession_version: Mapped[Optional[str]] = mapped_column(String(100)) + ensembl_protein_identifier: Mapped[Optional[str]] = mapped_column(String(100)) genes: Mapped[NcbiGeneInfo] = relationship("NcbiGeneInfo", back_populates="ensembl_ids") From c5b00ae0d31cc96522e3b0233709a1257ddd3b46 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 15:32:14 +0200 Subject: [PATCH 43/58] build: update logging and copyright year --- ebel/__init__.py | 6 +++--- ebel/defaults.py | 28 +++------------------------- ebel/logging.conf | 30 ++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 ebel/logging.conf diff --git a/ebel/__init__.py b/ebel/__init__.py index 5cae1ea..e2ea2fc 100755 --- a/ebel/__init__.py +++ b/ebel/__init__.py @@ -1,6 +1,6 @@ """Root init for eBEL.""" -from . 
import cache, constants, errors, parser, transformers -from .manager.orientdb.biodbs.bel import Bel +from ebel import cache, constants, errors, parser, transformers +from ebel.manager.orientdb.biodbs.bel import Bel __version__ = "1.0.37" @@ -12,7 +12,7 @@ __email__ = "christian.ebeling@scai.fraunhofer.de" __license__ = "?" -__copyright__ = """Copyright (c) 2021 Christian Ebeling, Fraunhofer Institute for Algorithms and Scientific +__copyright__ = """Copyright (c) 2023 Christian Ebeling, Fraunhofer Institute for Algorithms and Scientific Computing SCAI, Schloss Birlinghoven, 53754 Sankt Augustin, Germany""" project_name = __title__ diff --git a/ebel/defaults.py b/ebel/defaults.py index 8466190..638c1bd 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -2,11 +2,9 @@ """This file contains default values for configurations and parameters.""" -import logging -import logging.handlers as handlers -import os +import logging.config -from .constants import DATA_DIR, LOG_DIR, PROJECT_DIR +from ebel.constants import DATA_DIR, PROJECT_DIR, THIS_DIR, LOG_DIR ############################################################################### # UNIPROT taxonomy IDs to import @@ -46,24 +44,4 @@ ############################################################################### # Log Handling -logHandler = handlers.RotatingFileHandler( - filename=LOG_DIR.joinpath("ebel.log"), - mode="a", - maxBytes=4098 * 10, # 4MB file max - backupCount=0, -) -logh_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -logHandler.setFormatter(logh_format) -logHandler.setLevel(logging.DEBUG) - - -# Console Handler -ch = logging.StreamHandler() -ch_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") -ch.setFormatter(ch_format) -ch.setLevel(logging.WARNING) - -logging.basicConfig( - handlers=[logHandler, ch], - encoding="utf-8", -) +logging.config.fileConfig(THIS_DIR.joinpath("logging.conf"), defaults={"logfilename": LOG_DIR.joinpath("ebel.log")}) diff 
--git a/ebel/logging.conf b/ebel/logging.conf new file mode 100644 index 0000000..aa6b8b8 --- /dev/null +++ b/ebel/logging.conf @@ -0,0 +1,30 @@ +[loggers] +keys=root + +[handlers] +keys=consoleHandler,fileHandler + +[formatters] +keys=full,simple + +[logger_root] +level=DEBUG +handlers=fileHandler,consoleHandler + +[handler_consoleHandler] +class=StreamHandler +level=WARNING +formatter=simple +args=(sys.stdout,) + +[handler_fileHandler] +class=logging.handlers.RotatingFileHandler +level=INFO +formatter=full +args=("logfilename.log", 'a') + +[formatter_full] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s + +[formatter_simple] +format=%(asctime)s - %(message)s \ No newline at end of file From d9e213fdf638f3cf6f5448eb4938cc305db8eb53 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 15:33:03 +0200 Subject: [PATCH 44/58] fix: optimize unirpto rid call --- ebel/manager/orientdb/biodbs/biogrid.py | 2 +- ebel/manager/orientdb/biodbs/intact.py | 32 +++++++++++++------------ ebel/manager/orientdb/odb_meta.py | 21 +++++++++------- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index 2e97b11..bd33d57 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -523,7 +523,7 @@ def update_interactions(self) -> int: if_func = func.iif if self.engine.dialect.name == "sqlite" else func.IF - logging.info("Update BioGRID") + logger.info("Update BioGRID") for e in tqdm( uniprot_modification_pairs, diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index d982fc7..a57b5e4 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -1,5 +1,6 @@ """IntAct module.""" import logging +import time import zipfile from typing import Dict @@ -35,6 +36,11 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): 
biodb_name=self.biodb_name, ) + up = UniProt() + up.update() + + self.uniprot_rid_dict = self.get_pure_uniprot_rid_dict_in_bel_context() + def __len__(self): return self.number_of_generics @@ -102,22 +108,20 @@ def insert_data(self) -> Dict[str, int]: return {self.biodb_name: df.shape[0]} - def get_create_rid_by_uniprot(self, uniprot_accession: str, uniprot_rid_dict: dict) -> str: + def get_create_rid_by_uniprot(self, uniprot_accession: str) -> str: """Create or get rID entry for a given UniProt ID. Parameters ---------- uniprot_accession: str UniProt accession number. - uniprot_rid_dict: dict - Entry parameters matching those of the desired rID entry. Returns ------- str UniProt accession ID. """ - if uniprot_accession not in uniprot_rid_dict: + if uniprot_accession not in self.uniprot_rid_dict: nn = self.get_namespace_name_by_uniprot(uniprot_accession) if nn: namespace, name = nn @@ -128,8 +132,9 @@ def get_create_rid_by_uniprot(self, uniprot_accession: str, uniprot_rid_dict: di "bel": f'p({namespace}:"{name}")', "uniprot": uniprot_accession, } - uniprot_rid_dict[uniprot_accession] = self.get_create_rid("protein", value_dict, check_for="bel") - return uniprot_rid_dict.get(uniprot_accession) + self.uniprot_rid_dict[uniprot_accession] = self.get_create_rid("protein", value_dict, check_for="bel") + + return self.uniprot_rid_dict.get(uniprot_accession) def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: """Get the namespace of a given UniProt ID. 
@@ -168,14 +173,11 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: def update_interactions(self) -> int: """Update intact interactions to graph.""" - up = UniProt(self.client) - up.update() - - uniprot_rid_dict = up.get_pure_uniprot_rid_dict_in_bel_context() + logger.info("Update IntAct interactions") updated = 0 - uniprot_accessions = tuple(uniprot_rid_dict.keys()) + uniprot_accessions = tuple(self.uniprot_rid_dict.keys()) it = intact.Intact for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): @@ -204,7 +206,7 @@ def update_interactions(self) -> int: it.confidence_value, ) ) - result = self.session.execute(sql) + results = self.session.execute(sql).fetchall() for ( up_a, @@ -216,9 +218,9 @@ def update_interactions(self) -> int: d_method, d_method_id, c_value, - ) in result.fetchall(): - from_rid = self.get_create_rid_by_uniprot(up_a, uniprot_rid_dict) - to_rid = self.get_create_rid_by_uniprot(up_b, uniprot_rid_dict) + ) in results: + from_rid = self.get_create_rid_by_uniprot(up_a) + to_rid = self.get_create_rid_by_uniprot(up_b) if from_rid and to_rid: value_dict = { diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index e6f12e4..1b2d180 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -21,7 +21,12 @@ import sqlalchemy as sqla import xmltodict from pyorientdb import OrientDB, orient -from pyorientdb.exceptions import PyOrientCommandException, PyOrientIndexException, PyOrientSecurityAccessException +from pyorientdb.exceptions import ( + PyOrientCommandException, + PyOrientIndexException, + PyOrientSecurityAccessException, + PyOrientBadMethodCallException, +) from pyorientdb.otypes import OrientRecord from sqlalchemy import text, select, func from sqlalchemy.sql.schema import Table @@ -160,13 +165,11 @@ def execute(self, command_str: str) -> List[OrientRecord]: try: return self.client.command(command_str) - # TODO: following 
exceptions seems not to cover connection error - # except (PyOrientCommandException, PyOrientSecurityAccessException): - except: + except (PyOrientCommandException, PyOrientSecurityAccessException, PyOrientBadMethodCallException) as e: + logger.error(e) # Try to reconnect self.client.close() self.client = self.get_client() - # self.client.db_open(self.odb_name, self.odb_user, self.odb_password) # print(command_str) return self.client.command(command_str) @@ -848,7 +851,7 @@ def __get_sql_where_part(params, where_list: Tuple[str] = ()): where_list.append("`{}` IS NULL".format(column)) where = "" if where_list: - where = " WHERE " + " AND ".join(where_list) + where = "WHERE " + " AND ".join(where_list) return where def get_number_of_class(self, class_name, distinct_column_name: str = None, **params): @@ -1463,6 +1466,7 @@ def delete_nodes_with_no_edges(self, class_name=None) -> int: ) logger.warning(wtext) return 0 + else: class_name = class_name if class_name is not None else "V" return self.execute(f"Delete VERTEX {class_name} where both().size() = 0")[0] @@ -1524,9 +1528,8 @@ def get_pure_uniprot_rid_dict_in_bel_context(self) -> Dict[str, str]: # only include proteins which are also part of a BEL statement to avoid explosion of graph sql = """Select uniprot, @rid.asString() as rid from protein where pure=true and uniprot in ( - Select unionall(uniprot_list).asSet() as all_uniprots from (select unionall(in.uniprot, out.uniprot).asSet() as - uniprot_list from bel_relation where document IS NOT NULL - and (in.uniprot IS NOT NULL or out.uniprot IS NOT NULL)))""" + select set(unionall(in.uniprot, out.uniprot)) as all_uniprots from bel_relation where document IS NOT NULL)""" + # sql = "select uniprot, @rid.asString() as rid from protein where pure = true and uniprot is not null" return {r["uniprot"]: r["rid"] for r in self.query_get_dict(sql)} From c51943f887e5717d332938226ec95dfdbcb89be8 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 15:59:34 +0200 
Subject: [PATCH 45/58] perf: disable Uniprot update step in intact --- ebel/manager/orientdb/biodbs/intact.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index a57b5e4..3a56bce 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -36,8 +36,8 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): biodb_name=self.biodb_name, ) - up = UniProt() - up.update() + # up = UniProt() + # up.update() self.uniprot_rid_dict = self.get_pure_uniprot_rid_dict_in_bel_context() @@ -81,7 +81,7 @@ def insert_data(self) -> Dict[str, int]: "Interaction detection method(s)": "dm", } - df = pd.read_csv(zf.open("intact.txt"), sep="\t", usecols=usecols.keys()) + df = pd.read_csv(zf.open("intact.txt"), sep="\t", usecols=list(usecols.keys())) df.rename(columns=usecols, inplace=True) regex_accession = r"uniprotkb:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})" From cf10492a89202d9f662fdcd5b99fe108decbcb94 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 15:59:59 +0200 Subject: [PATCH 46/58] fix: set query_class method with empty cols to avoid * in cmd --- ebel/manager/orientdb/odb_meta.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 1b2d180..c5daa1f 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -171,7 +171,7 @@ def execute(self, command_str: str) -> List[OrientRecord]: self.client.close() self.client = self.get_client() # print(command_str) - return self.client.command(command_str) + return self.execute(command_str) def set_configuration_parameters(self): """Set configuration for OrientDB database client instance using configuration file or passed params.""" @@ -359,7 +359,7 @@ def query_class( class_name: str, 
limit: int = 0, skip: int = 0, - columns: Iterable[str] = None, + columns: Iterable[str] = [], with_rid=True, with_class=False, print_sql: bool = False, @@ -406,7 +406,7 @@ def query_class( if distinct and len(cols) == 1: sql_cols = "distinct({})".format(sql_cols) - sql_temp = "SELECT {sql_cols} FROM `{class_name}` {where} {group_by} {sql_limit} {sql_skip}" + sql_temp = "SELECT {sql_cols} FROM {class_name} {where} {group_by} {sql_limit} {sql_skip}" sql = sql_temp.format( sql_cols=sql_cols, @@ -842,16 +842,21 @@ def __get_sql_where_part(params, where_list: Tuple[str] = ()): for column, value in params.items(): if isinstance(value, (str, list, dict)): if value == "notnull": - where_list.append("`{}` IS NOT NULL".format(column)) + where_list.append("{} IS NOT NULL".format(column)) + else: - where_list.append("`{}` = {}".format(column, json.dumps(value))) + where_list.append("{} = {}".format(column, json.dumps(value))) + elif isinstance(value, (int, float)): - where_list.append("`{}` = {}".format(column, value)) + where_list.append("{} = {}".format(column, value)) + elif value is None: - where_list.append("`{}` IS NULL".format(column)) + where_list.append("{` IS NULL".format(column)) + where = "" if where_list: where = "WHERE " + " AND ".join(where_list) + return where def get_number_of_class(self, class_name, distinct_column_name: str = None, **params): @@ -947,7 +952,9 @@ def node_exists( if check_for: check_for = [check_for] if isinstance(check_for, str) else check_for check_for_dict = {k: v for k, v in check_for_dict.items() if k in check_for} - result = self.query_class(class_name=class_name, limit=1, print_sql=print_sql, **check_for_dict) + result = self.query_class( + class_name=class_name, columns=[], limit=1, with_rid=True, print_sql=print_sql, **check_for_dict + ) if result: return result[0][RID] @@ -987,8 +994,10 @@ def get_create_rid(self, class_name: str, value_dict: dict, check_for=None, prin check_for=check_for, print_sql=print_sql, ) + if not rid: rid = 
self.insert_record(class_name=class_name, value_dict=value_dict, print_sql=print_sql) + return rid def update_correlative_edges(self) -> List[str]: From d6201929021f01657e20fd8d9e6c2328844faf73 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 16:25:21 +0200 Subject: [PATCH 47/58] revert: logging setup --- ebel/defaults.py | 23 ++++++++++++++++++++--- ebel/logging.conf | 30 ------------------------------ 2 files changed, 20 insertions(+), 33 deletions(-) delete mode 100644 ebel/logging.conf diff --git a/ebel/defaults.py b/ebel/defaults.py index 638c1bd..205a3be 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -2,9 +2,10 @@ """This file contains default values for configurations and parameters.""" -import logging.config +import logging +import logging.handlers as handlers -from ebel.constants import DATA_DIR, PROJECT_DIR, THIS_DIR, LOG_DIR +from ebel.constants import DATA_DIR, PROJECT_DIR, LOG_DIR ############################################################################### # UNIPROT taxonomy IDs to import @@ -44,4 +45,20 @@ ############################################################################### # Log Handling -logging.config.fileConfig(THIS_DIR.joinpath("logging.conf"), defaults={"logfilename": LOG_DIR.joinpath("ebel.log")}) +logHandler = handlers.RotatingFileHandler( + filename=LOG_DIR.joinpath("ebel.log"), + mode="a", + maxBytes=4098 * 10, # 4MB file max + backupCount=3, +) +logh_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logHandler.setFormatter(logh_format) +logHandler.setLevel(logging.DEBUG) + +# Console Handler +streamHandler = logging.StreamHandler() +stream_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") +streamHandler.setFormatter(stream_format) +streamHandler.setLevel(logging.WARNING) + +logging.basicConfig(level=logging.INFO, handlers=[logHandler, streamHandler]) diff --git a/ebel/logging.conf b/ebel/logging.conf deleted file mode 100644 index 
aa6b8b8..0000000 --- a/ebel/logging.conf +++ /dev/null @@ -1,30 +0,0 @@ -[loggers] -keys=root - -[handlers] -keys=consoleHandler,fileHandler - -[formatters] -keys=full,simple - -[logger_root] -level=DEBUG -handlers=fileHandler,consoleHandler - -[handler_consoleHandler] -class=StreamHandler -level=WARNING -formatter=simple -args=(sys.stdout,) - -[handler_fileHandler] -class=logging.handlers.RotatingFileHandler -level=INFO -formatter=full -args=("logfilename.log", 'a') - -[formatter_full] -format=%(asctime)s - %(name)s - %(levelname)s - %(message)s - -[formatter_simple] -format=%(asctime)s - %(message)s \ No newline at end of file From d00277b6beec93eb23ad10fe4b185c4522cc2108 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Wed, 27 Sep 2023 16:35:51 +0200 Subject: [PATCH 48/58] revert: query_class default columns to None --- ebel/manager/orientdb/odb_meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index c5daa1f..8a31df2 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -359,7 +359,7 @@ def query_class( class_name: str, limit: int = 0, skip: int = 0, - columns: Iterable[str] = [], + columns: Iterable[str] = None, with_rid=True, with_class=False, print_sql: bool = False, From 77de7c4b53aba0926e0fd345e79a94d7913099a1 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 09:11:37 +0200 Subject: [PATCH 49/58] chore: remove previous sql text after check --- ebel/manager/orientdb/biodbs/disgenet.py | 24 ------------------------ ebel/manager/orientdb/biodbs/kegg.py | 24 ------------------------ ebel/manager/orientdb/biodbs/nsides.py | 21 --------------------- ebel/manager/orientdb/biodbs/uniprot.py | 5 ----- 4 files changed, 74 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index a4843cc..4d86467 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ 
b/ebel/manager/orientdb/biodbs/disgenet.py @@ -183,30 +183,6 @@ def update_snps(self) -> int: "downstream": "upstream", "upstream": "downstream", } - # # TODO: replace SQL with SQL Alchemy statement - # sql_temp = """Select - # snp_id, - # chromosome, - # position, - # disease_name, - # pmid, - # score, - # source - # FROM - # disgenet_variant v INNER JOIN - # disgenet_source s on (v.source_id=s.id) INNER JOIN - # disgenet_disease d on (v.disease_id=d.disease_id) - # WHERE - # disease_name like '%%{}%%' and - # source!='BEFREE' - # GROUP BY - # snp_id, - # chromosome, - # position, - # disease_name, - # pmid, - # score, - # source""" dv = disgenet.DisgenetVariant ds = disgenet.DisgenetSource diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index 2ef9e3a..bcfa712 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -278,30 +278,6 @@ def update_interactions(self) -> int: "phosphorylation": ("pho", "increases", BelPmod.PHO), "ubiquitination": ("ubi", "increases", BelPmod.UBI), } - # post_translational_modifications = ",".join([f"'{x}'" for x in pmods.keys()]) - - # species_ids = ",".join([f"'{x}'" for x in self.species]) - - # sql_temp = f"""Select - # interaction_type, - # pathway_identifier, - # pathway_name, - # gene_symbol_a, - # gene_symbol_b, - # kegg_species_id - # from - # kegg - # where - # (gene_symbol_a='{{symbol}}' or gene_symbol_a='{{symbol}}') and - # kegg_species_id in ({species_ids}) and - # interaction_type in ({{interaction_types}}) - # group by - # interaction_type, - # pathway_identifier, - # pathway_name, - # gene_symbol_a, - # gene_symbol_b, - # kegg_species_id""" kg = kegg.Kegg for symbol, rid in tqdm(symbol_rids_dict.items(), desc="Update KEGG posttranslational modifications"): diff --git a/ebel/manager/orientdb/biodbs/nsides.py b/ebel/manager/orientdb/biodbs/nsides.py index b9a1f43..0d4d441 100644 --- a/ebel/manager/orientdb/biodbs/nsides.py +++ 
b/ebel/manager/orientdb/biodbs/nsides.py @@ -143,26 +143,6 @@ def update_bel(self) -> int: self.delete_nodes_with_no_edges("side_effect") self.delete_nodes_with_no_edges("drug") - # # TODO: Translate to sqlalchemy query - # sql_temp = """Select - # o.condition_meddra_id, - # o.condition_concept_name, - # o.prr, - # o.mean_reporting_frequency - # from - # drugbank as d inner join - # drugbank_external_identifier as dei on (d.id=dei.drugbank_id) inner join - # nsides as o on (dei.identifier=o.drug_rxnorn_id) - # where - # d.drugbank_id='{}' and resource='RxCUI' - # and (mean_reporting_frequency>=0.01 OR mean_reporting_frequency is NULL) - # group by - # o.condition_meddra_id, - # o.condition_concept_name, - # o.prr, - # o.mean_reporting_frequency - # """ - drugbank_ids = self.query_class("drug", columns=["drugbank_id"], drugbank_id="notnull") drugbank_id_rids = {d["drugbank_id"]: d[RID] for d in drugbank_ids} @@ -176,7 +156,6 @@ def update_bel(self) -> int: o = nsides.Nsides for drugbank_id, drugbank_rid in tqdm(drugbank_id_rids.items(), desc=f"Update {self.biodb_name.upper()}"): - # sql = sql_temp.format(drugbank_id) sql = ( ( select(o.condition_meddra_id, o.condition_concept_name, o.prr, o.mean_reporting_frequency) diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index 06dcbcf..95ae065 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -308,11 +308,6 @@ def _get_accesssion_recname(self, taxid, gene_symbol) -> Union[Tuple[str, str], If this has no result it tries uniprot by gene symbol and NCBI taxonomy ID. 
""" # TODO: This is in general a dangerous method because it selects the first accession number, but there could - # be more than one - # sql = ( - # f"Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs " - # f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' - # ) sql = ( select(up.Uniprot.accession, up.Uniprot.recommended_name) .join(up.GeneSymbol) From 889035e3f1c209a50e08fe9d4cf8c716662fe4ee Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 09:16:03 +0200 Subject: [PATCH 50/58] perf: improve intact update --- ebel/manager/orientdb/biodbs/intact.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 3a56bce..40e1484 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -180,34 +180,34 @@ def update_interactions(self) -> int: uniprot_accessions = tuple(self.uniprot_rid_dict.keys()) it = intact.Intact - for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): - sql = ( - select( - it.int_a_uniprot_id, - it.int_b_uniprot_id, - it.pmid, - it.interaction_ids, - it.interaction_type, - it.interaction_type_psimi_id, - it.detection_method, - it.detection_method_psimi_id, - it.confidence_value, - ) - .where(or_(it.int_a_uniprot_id == uniprot_accession, it.int_b_uniprot_id == uniprot_accession)) - .group_by( - it.int_a_uniprot_id, - it.int_b_uniprot_id, - it.pmid, - it.interaction_ids, - it.interaction_type, - it.interaction_type_psimi_id, - it.detection_method, - it.detection_method_psimi_id, - it.confidence_value, - ) - ) - results = self.session.execute(sql).fetchall() + sql = select( + it.int_a_uniprot_id, + it.int_b_uniprot_id, + it.pmid, + it.interaction_ids, + it.interaction_type, + it.interaction_type_psimi_id, + it.detection_method, + it.detection_method_psimi_id, + 
it.confidence_value, + ).group_by( + it.int_a_uniprot_id, + it.int_b_uniprot_id, + it.pmid, + it.interaction_ids, + it.interaction_type, + it.interaction_type_psimi_id, + it.detection_method, + it.detection_method_psimi_id, + it.confidence_value, + ) + intact_df = pd.read_sql(sql, self.engine) + + for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): + filtered_df = intact_df[ + (intact_df.int_a_uniprot_id == uniprot_accession) | (intact_df.int_b_uniprot_id == uniprot_accession) + ] for ( up_a, up_b, @@ -218,7 +218,7 @@ def update_interactions(self) -> int: d_method, d_method_id, c_value, - ) in results: + ) in filtered_df.iterrows(index=False): from_rid = self.get_create_rid_by_uniprot(up_a) to_rid = self.get_create_rid_by_uniprot(up_b) From b610d93d178418bd5c0cbf6a0d97e92af19fe81d Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 09:17:46 +0200 Subject: [PATCH 51/58] fix: change intact update to use itertuples --- ebel/manager/orientdb/biodbs/intact.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 40e1484..0625efc 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -218,7 +218,7 @@ def update_interactions(self) -> int: d_method, d_method_id, c_value, - ) in filtered_df.iterrows(index=False): + ) in filtered_df.itertuples(index=False): from_rid = self.get_create_rid_by_uniprot(up_a) to_rid = self.get_create_rid_by_uniprot(up_b) From 83addcc8e4d8944e462b1f86e866d170c56247bd Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 09:20:42 +0200 Subject: [PATCH 52/58] fix: replace nans with None in intact update --- ebel/manager/orientdb/biodbs/intact.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 0625efc..25f44ba 100644 --- 
a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -4,6 +4,7 @@ import zipfile from typing import Dict +import numpy as np import pandas as pd from pyorientdb import OrientDB from sqlalchemy import select, or_ @@ -202,7 +203,7 @@ def update_interactions(self) -> int: it.confidence_value, ) - intact_df = pd.read_sql(sql, self.engine) + intact_df = pd.read_sql(sql, self.engine).replace({np.nan: None}) for uniprot_accession in tqdm(uniprot_accessions, desc="Update IntAct interactions"): filtered_df = intact_df[ From 697da197367200ddd9093f4bbf5dfc5833a02281 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 09:34:43 +0200 Subject: [PATCH 53/58] =?UTF-8?q?Bump=20version:=201.0.37=20=E2=86=92=201.?= =?UTF-8?q?1.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ebel/__init__.py | 2 +- mkdocs.yml | 2 +- pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 82983d2..d7f2e2f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.37 +current_version = 1.1.0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))? 
diff --git a/ebel/__init__.py b/ebel/__init__.py index e2ea2fc..97d20d5 100755 --- a/ebel/__init__.py +++ b/ebel/__init__.py @@ -2,7 +2,7 @@ from ebel import cache, constants, errors, parser, transformers from ebel.manager.orientdb.biodbs.bel import Bel -__version__ = "1.0.37" +__version__ = "1.1.0" __title__ = "e(BE:L)" __description__ = "Validation and extension of biomedical knowledge graphs" diff --git a/mkdocs.yml b/mkdocs.yml index 8be399d..bd87227 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,7 +8,7 @@ repo_url: https://github.com/e-bel/ebel theme: readthedocs extra: - version: 1.0.37 + version: 1.1.0 nav: # - Home: index.md diff --git a/pyproject.toml b/pyproject.toml index 187016c..56014ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "ebel" -version = "1.0.37" +version = "1.1.0" description = "e(BE:L) - validation and extension of BEL networks." authors = [ "Bruce Schultz ", From 71fb54bb59fef1ec845b94eb17a6b50daa0c3397 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 10:40:00 +0200 Subject: [PATCH 54/58] perf: improve intact update by caching bel string --- ebel/manager/orientdb/biodbs/intact.py | 25 +++++++++++++++++-------- ebel/manager/orientdb/odb_meta.py | 6 ++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 25f44ba..36caab3 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -41,6 +41,7 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): # up.update() self.uniprot_rid_dict = self.get_pure_uniprot_rid_dict_in_bel_context() + self.bel_rid_dict = self.get_pure_bel_rid_dict() def __len__(self): return self.number_of_generics @@ -126,14 +127,22 @@ def get_create_rid_by_uniprot(self, uniprot_accession: str) -> str: nn = self.get_namespace_name_by_uniprot(uniprot_accession) if 
nn: namespace, name = nn - value_dict = { - "name": name, - "namespace": namespace, - "pure": True, - "bel": f'p({namespace}:"{name}")', - "uniprot": uniprot_accession, - } - self.uniprot_rid_dict[uniprot_accession] = self.get_create_rid("protein", value_dict, check_for="bel") + bel = f'p({namespace}:"{name}")' + + if bel in self.bel_rid_dict: + self.uniprot_rid_dict[uniprot_accession] = self.bel_rid_dict[bel] + + else: + value_dict = { + "name": name, + "namespace": namespace, + "pure": True, + "bel": bel, + "uniprot": uniprot_accession, + } + new_rid = self.insert_record("protein", value_dict=value_dict) + self.bel_rid_dict[bel] = new_rid + self.uniprot_rid_dict[uniprot_accession] = new_rid return self.uniprot_rid_dict.get(uniprot_accession) diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 8a31df2..cd2bfbd 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -1542,6 +1542,12 @@ def get_pure_uniprot_rid_dict_in_bel_context(self) -> Dict[str, str]: return {r["uniprot"]: r["rid"] for r in self.query_get_dict(sql)} + def get_pure_bel_rid_dict(self) -> Dict[str, str]: + """Return a dictionary of pure bel representation and it's rid.""" + sql = "SELECT bel, @rid.asString() as rid from protein where pure=true" + results = self.query_get_dict(sql) + return {r["bel"]: r["rid"] for r in results} + def get_pure_uniprot_rids_dict(self): """Return dictionary with UniProt IDs as keys and node rIDs as values.""" sql = "Select uniprot, @rid.asString() as rid from protein where uniprot IS NOT NULL and pure=true" From fe8991a3ac037077ad8de61d32d8ad529d421036 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 10:53:28 +0200 Subject: [PATCH 55/58] perf: improve intact update by caching uniprot nn --- ebel/manager/orientdb/biodbs/intact.py | 27 +++++++++----------------- ebel/manager/orientdb/odb_meta.py | 18 +++++++++++++++++ 2 files changed, 27 insertions(+), 18 deletions(-) diff --git 
a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 36caab3..abd89db 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -42,6 +42,7 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): self.uniprot_rid_dict = self.get_pure_uniprot_rid_dict_in_bel_context() self.bel_rid_dict = self.get_pure_bel_rid_dict() + self.acc_nn = self.get_uniprot_accession_namespaces() def __len__(self): return self.number_of_generics @@ -159,27 +160,17 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: tuple namespace, value """ - return_value = () - - sql = ( - select(uniprot.GeneSymbol.symbol, uniprot.Uniprot.taxid) - .join(uniprot.Uniprot) - .where(uniprot.Uniprot.accession == uniprot_accession) - ) - - result = self.session.execute(sql).fetchone() - taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} - - if result: - name, taxid = result - namespace = taxid_to_namespace.get(taxid, "UNIPROT") - return_value = (namespace, name) + if uniprot_accession in self.acc_nn: + return self.acc_nn[uniprot_accession] else: - if self.session.query(uniprot.Uniprot).filter(uniprot.Uniprot.accession == uniprot_accession).first(): - return_value = ("UNIPROT", uniprot_accession) + up_r = self.session.query(uniprot.Uniprot).filter(uniprot.Uniprot.accession == uniprot_accession).first() + + if up_r: + return "UNIPROT", uniprot_accession - return return_value + else: + return () def update_interactions(self) -> int: """Update intact interactions to graph.""" diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index cd2bfbd..f18734e 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -39,6 +39,7 @@ from ebel.config import get_config_as_dict, get_config_value, write_to_config from ebel.constants import DEFAULT_ODB, RID from ebel.manager.orientdb import urls as default_urls +from 
ebel.manager.rdbms.models import uniprot from ebel.manager.rdbms.models.ensembl import Ensembl as ens from ebel.manager.orientdb.odb_structure import Edge, Generic, Node, OClass, OIndex, OProperty from ebel.tools import BelRdb, chunks, get_file_path, get_standard_name @@ -1553,3 +1554,20 @@ def get_pure_uniprot_rids_dict(self): sql = "Select uniprot, @rid.asString() as rid from protein where uniprot IS NOT NULL and pure=true" results = self.query_get_dict(sql) return {r["uniprot"]: r["rid"] for r in results} + + def get_uniprot_accession_namespaces(self) -> Dict[str, Tuple[str, str]]: + """Return a dictionary of uniprot accession keys and namespace and values.""" + sql = ( + select(uniprot.Uniprot.accession, uniprot.GeneSymbol.symbol, uniprot.Uniprot.taxid) + .join(uniprot.Uniprot) + ) + results = self.session.execute(sql).fetchall() + + acc_dict = dict() + taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} + for r in results: + accession, name, taxid = r + namespace = taxid_to_namespace.get(taxid, "UNIPROT") + acc_dict[accession] = (namespace, name) + + return acc_dict From a2cdd5c8138b488e2e2d12e00b4c83b7d66437d7 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 11:03:35 +0200 Subject: [PATCH 56/58] perf: apply caching techniques to stringdb --- ebel/manager/orientdb/biodbs/intact.py | 7 +++- ebel/manager/orientdb/biodbs/stringdb.py | 50 ++++++++++++++---------- ebel/manager/orientdb/odb_meta.py | 5 +-- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index abd89db..f9625c4 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -167,10 +167,13 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: up_r = self.session.query(uniprot.Uniprot).filter(uniprot.Uniprot.accession == uniprot_accession).first() if up_r: - return "UNIPROT", uniprot_accession + return_value = 
"UNIPROT", uniprot_accession else: - return () + return_value = () + + self.acc_nn[uniprot_accession] = return_value + return return_value def update_interactions(self) -> int: """Update intact interactions to graph.""" diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 7adccd0..3ac3935 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -40,6 +40,9 @@ def __init__(self, client: OrientDB = None): biodb_name=self.biodb_name, ) + self.symbol_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") + self.bel_rid_dict = self.get_pure_bel_rid_dict() + def __len__(self) -> dict: """Get number of 'biogrid_interaction' graph edges.""" pass @@ -202,10 +205,9 @@ def update_stringdb_interactions(self, hgnc: Hgnc) -> int: "combined_score", ) - bel_hgnc_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") - bel_hgncs = set(bel_hgnc_rid_dict.keys()) + symbols = set(self.symbol_rid_dict.keys()) strdb_hgncs = self.get_stringdb_symbols() - shared_hgncs = bel_hgncs & strdb_hgncs + shared_hgncs = symbols & strdb_hgncs updated = 0 already_inserted = set() @@ -222,8 +224,8 @@ def update_stringdb_interactions(self, hgnc: Hgnc) -> int: if sorted_combi not in already_inserted: value_dict = {k: v for k, v in row.__dict__.items() if k in columns} - from_rid = self.get_create_rid_by_symbol(row.symbol1, bel_hgnc_rid_dict, hgnc) - to_rid = self.get_create_rid_by_symbol(row.symbol2, bel_hgnc_rid_dict, hgnc) + from_rid = self.get_create_rid_by_symbol(row.symbol1, hgnc) + to_rid = self.get_create_rid_by_symbol(row.symbol2, hgnc) if from_rid and to_rid: self.create_edge( @@ -237,15 +239,13 @@ def update_stringdb_interactions(self, hgnc: Hgnc) -> int: return updated - def get_create_rid_by_symbol(self, symbol: str, symbol_rid_dict: dict, hgnc: Hgnc) -> str: + def get_create_rid_by_symbol(self, symbol: str, hgnc: Hgnc) -> str: """Create or get rID entry for 
a given gene symbol. Parameters ---------- symbol: str Gene symbol. - symbol_rid_dict: dict - Entry parameters matching those of the desired rID entry. hgnc: Hgnc Hgnc model definition. @@ -254,17 +254,26 @@ def get_create_rid_by_symbol(self, symbol: str, symbol_rid_dict: dict, hgnc: Hgn str rID. """ - if symbol not in symbol_rid_dict: + if symbol not in self.symbol_rid_dict: symbol = hgnc.get_correct_symbol(symbol) if symbol: - value_dict = { - "name": symbol, - "namespace": "HGNC", - "pure": True, - "bel": f'p(HGNC:"{symbol}")', - } - symbol_rid_dict[symbol] = self.get_create_rid("protein", value_dict, check_for="bel") - return symbol_rid_dict.get(symbol) + bel = f'p(HGNC:"{symbol}")' + + if bel in self.bel_rid_dict: + self.symbol_rid_dict[symbol] = self.bel_rid_dict[bel] + + else: + value_dict = { + "name": symbol, + "namespace": "HGNC", + "pure": True, + "bel": bel, + } + new_rid = self.insert_record("protein", value_dict) + self.symbol_rid_dict[symbol] = new_rid + self.bel_rid_dict[bel] = new_rid + + return self.symbol_rid_dict.get(symbol) def update_action_interactions(self, hgnc: Hgnc) -> int: """Iterate through BEL proteins and add stringdb_action edges to existing proteins in KG. 
@@ -292,8 +301,7 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: modes = ("activation", "inhibition", "ptmod", "expression") - symbols_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") - symbols = tuple(symbols_rid_dict.keys()) + symbols = tuple(self.symbol_rid_dict.keys()) already_inserted = set() @@ -313,8 +321,8 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: sorted_combi = tuple(sorted([action.symbol1, action.symbol2])) if sorted_combi not in already_inserted: - from_rid = self.get_create_rid_by_symbol(action.symbol1, symbols_rid_dict, hgnc) - to_rid = self.get_create_rid_by_symbol(action.symbol2, symbols_rid_dict, hgnc) + from_rid = self.get_create_rid_by_symbol(action.symbol1, hgnc) + to_rid = self.get_create_rid_by_symbol(action.symbol2, hgnc) if from_rid and to_rid: class_name = translator[(action.mode, action.action)] diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index f18734e..7fd687b 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -1557,10 +1557,7 @@ def get_pure_uniprot_rids_dict(self): def get_uniprot_accession_namespaces(self) -> Dict[str, Tuple[str, str]]: """Return a dictionary of uniprot accession keys and namespace and values.""" - sql = ( - select(uniprot.Uniprot.accession, uniprot.GeneSymbol.symbol, uniprot.Uniprot.taxid) - .join(uniprot.Uniprot) - ) + sql = select(uniprot.Uniprot.accession, uniprot.GeneSymbol.symbol, uniprot.Uniprot.taxid).join(uniprot.Uniprot) results = self.session.execute(sql).fetchall() acc_dict = dict() From 36337f41186e786d4054672d2aaaa6fe235debca Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Thu, 28 Sep 2023 12:51:58 +0200 Subject: [PATCH 57/58] style: black and isort --- ebel/defaults.py | 2 +- ebel/manager/orientdb/biodbs/biogrid.py | 2 +- ebel/manager/orientdb/biodbs/clinvar.py | 4 ++-- ebel/manager/orientdb/biodbs/intact.py | 2 +- ebel/manager/orientdb/biodbs/kegg.py | 2 +- 
ebel/manager/orientdb/biodbs/mirtarbase.py | 2 +- ebel/manager/orientdb/biodbs/nsides.py | 4 ++-- ebel/manager/orientdb/biodbs/stringdb.py | 2 +- ebel/manager/orientdb/biodbs/uniprot.py | 2 +- ebel/manager/orientdb/odb_meta.py | 14 ++++++++++---- ebel/manager/rdbms/models/biogrid.py | 2 +- 11 files changed, 22 insertions(+), 16 deletions(-) diff --git a/ebel/defaults.py b/ebel/defaults.py index 205a3be..aa8260e 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -5,7 +5,7 @@ import logging import logging.handlers as handlers -from ebel.constants import DATA_DIR, PROJECT_DIR, LOG_DIR +from ebel.constants import DATA_DIR, LOG_DIR, PROJECT_DIR ############################################################################### # UNIPROT taxonomy IDs to import diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index bd33d57..850016c 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import select, func, cast, Integer +from sqlalchemy import Integer, cast, func, select from sqlalchemy.orm import aliased from tqdm import tqdm diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index a85c5fc..ca9dc7c 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -5,13 +5,13 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select +from sqlalchemy import select, text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls +from ebel.manager.orientdb.biodbs.ensembl import Ensembl from ebel.manager.orientdb.constants import CLINVAR from ebel.manager.rdbms.models import clinvar -from ebel.manager.orientdb.biodbs.ensembl import Ensembl from ebel.tools import get_disease_trait_keywords_from_config, get_file_path logger = 
logging.getLogger(__name__) diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index f9625c4..e9fde67 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import select, or_ +from sqlalchemy import or_, select from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index bcfa712..585b16d 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -9,7 +9,7 @@ import pandas as pd import requests from pyorientdb import OrientDB -from sqlalchemy import select, or_ +from sqlalchemy import or_, select from tqdm import tqdm from ebel.config import get_config_value diff --git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index f3c03bc..2703586 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -3,7 +3,7 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select +from sqlalchemy import select, text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls diff --git a/ebel/manager/orientdb/biodbs/nsides.py b/ebel/manager/orientdb/biodbs/nsides.py index 0d4d441..ef16c77 100644 --- a/ebel/manager/orientdb/biodbs/nsides.py +++ b/ebel/manager/orientdb/biodbs/nsides.py @@ -7,13 +7,13 @@ import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import text, select, or_ +from sqlalchemy import or_, select, text from tqdm import tqdm from ebel.constants import RID from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import OFFSIDES, ONSIDES -from ebel.manager.rdbms.models import nsides, drugbank +from 
ebel.manager.rdbms.models import drugbank, nsides from ebel.tools import get_file_path logger = logging.getLogger(__name__) diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 3ac3935..68219b6 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd from pyorientdb import OrientDB -from sqlalchemy import or_, select, text, and_ +from sqlalchemy import and_, or_, select, text from tqdm import tqdm from ebel.manager.orientdb import odb_meta, odb_structure, urls diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index 95ae065..9fbc0c1 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -10,7 +10,7 @@ import pandas as pd from lxml.etree import iterparse from pyorientdb import OrientDB -from sqlalchemy import text, select +from sqlalchemy import select, text from tqdm import tqdm from ebel.defaults import default_tax_ids diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 7fd687b..96d3f3d 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -22,13 +22,14 @@ import xmltodict from pyorientdb import OrientDB, orient from pyorientdb.exceptions import ( + PyOrientBadMethodCallException, PyOrientCommandException, PyOrientIndexException, PyOrientSecurityAccessException, - PyOrientBadMethodCallException, + PyOrientSecurityException, ) from pyorientdb.otypes import OrientRecord -from sqlalchemy import text, select, func +from sqlalchemy import func, select, text from sqlalchemy.sql.schema import Table from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm @@ -39,9 +40,9 @@ from ebel.config import get_config_as_dict, get_config_value, write_to_config from ebel.constants import DEFAULT_ODB, RID from ebel.manager.orientdb import urls as 
default_urls +from ebel.manager.orientdb.odb_structure import Edge, Generic, Node, OClass, OIndex, OProperty from ebel.manager.rdbms.models import uniprot from ebel.manager.rdbms.models.ensembl import Ensembl as ens -from ebel.manager.orientdb.odb_structure import Edge, Generic, Node, OClass, OIndex, OProperty from ebel.tools import BelRdb, chunks, get_file_path, get_standard_name type_map_inverse = {v: k for k, v in orient.type_map.items()} @@ -166,7 +167,12 @@ def execute(self, command_str: str) -> List[OrientRecord]: try: return self.client.command(command_str) - except (PyOrientCommandException, PyOrientSecurityAccessException, PyOrientBadMethodCallException) as e: + except ( + PyOrientCommandException, + PyOrientSecurityAccessException, + PyOrientBadMethodCallException, + PyOrientSecurityException, + ) as e: logger.error(e) # Try to reconnect self.client.close() diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index 58def36..dd0ab2a 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -1,7 +1,7 @@ """BioGRID RDBMS model definition.""" from sqlalchemy import Float, ForeignKey, Integer, String, Text, select from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import Mapped, mapped_column, relationship, aliased +from sqlalchemy.orm import Mapped, aliased, mapped_column, relationship from sqlalchemy_utils import create_view from ebel.manager.rdbms.models import object_as_dict From 7ef4fa5a8739b00869fca45850bbe0a1701d5ed0 Mon Sep 17 00:00:00 2001 From: Bruce Schultz Date: Fri, 29 Sep 2023 09:29:07 +0200 Subject: [PATCH 58/58] fix: specify columns in pc dict method --- ebel/manager/orientdb/biodbs/pathway_commons.py | 12 ++++++++++-- ebel/manager/orientdb/odb_meta.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ebel/manager/orientdb/biodbs/pathway_commons.py b/ebel/manager/orientdb/biodbs/pathway_commons.py index 1ff0649..9a44aa2 100644 
--- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -221,10 +221,12 @@ def update_interactions(self) -> Dict[str, int]: self.hgnc.update() valid_hgnc_symbols = {x[0] for x in self.session.query(hgnc.Hgnc).with_entities(hgnc.Hgnc.symbol).all()} + pure_symbol_rids_dict = self.get_pure_symbol_rids_dict() + symbol_rids_bel_context_dict = self.get_pure_symbol_rids_dict_in_bel_context() + cols = ["symbol", "rid"] - pure_symbol_rids_dict = self.hgnc.get_pure_symbol_rids_dict() df_all = pd.DataFrame(pure_symbol_rids_dict.items(), columns=cols) - df_bel = pd.DataFrame(self.hgnc.get_pure_symbol_rids_dict_in_bel_context().items(), columns=cols) + df_bel = pd.DataFrame(symbol_rids_bel_context_dict.items(), columns=cols) # skip here if there is no pure symbols with or without BEL context if any([df_all.empty, df_bel.empty]): @@ -304,3 +306,9 @@ def get_pathway_pmids_sources(self, pc_id, pc_pathway_name_rid_dict) -> tuple: pmids = [x.pmid for x in pc_obj.pmids] pathways = [pc_pathway_name_rid_dict[x.name] for x in pc_obj.pathway_names] return pathways, pmids, sources + + +if __name__ == "__main__": + p = PathwayCommons() + foo = p.get_pure_symbol_rids_dict() + a = 2 diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index 96d3f3d..15fa146 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -1525,7 +1525,7 @@ def get_pure_symbol_rid_df_in_bel_context(self, class_name="protein", namespace= def get_pure_symbol_rids_dict(self, class_name="protein", namespace="HGNC") -> Dict[str, str]: """Return dictionary with protein name as keys and node rIDs as values.""" - results = self.query_class(class_name, pure=True, namespace=namespace) + results = self.query_class(class_name, pure=True, namespace=namespace, columns=["name"], with_rid=True) return {r["name"]: r["rid"] for r in results} def get_pure_rid_by_uniprot(self, uniprot: str):