diff --git a/bionty/__init__.py b/bionty/__init__.py index f8ff8b08..db6a5938 100644 --- a/bionty/__init__.py +++ b/bionty/__init__.py @@ -51,7 +51,7 @@ __version__ = "0.25.2" # denote release candidate for 0.1.0 with 0.1rc1 # prints warning of python versions -from lamin_logger import py_version_warning +from lamin_utils import py_version_warning py_version_warning("3.8", "3.10") diff --git a/bionty/_bionty.py b/bionty/_bionty.py index 7274ef8f..1aa261f3 100644 --- a/bionty/_bionty.py +++ b/bionty/_bionty.py @@ -1,13 +1,15 @@ from __future__ import annotations +import logging import os from functools import cached_property from pathlib import Path from typing import Dict, Iterable, List, Literal, Optional, Set, Tuple, Union +import numpy as np import pandas as pd -from lamin_logger import logger -from lamin_logger._lookup import Lookup +from lamin_utils import logger +from lamin_utils._lookup import Lookup from bionty._md5 import verify_md5 @@ -103,6 +105,7 @@ def __repr__(self) -> str: f"🎯 {self.__class__.__name__}.search(): free text search of terms\n" f"🧐 {self.__class__.__name__}.inspect(): check if identifiers are mappable\n" f"👽 {self.__class__.__name__}.map_synonyms(): map synonyms to standardized names\n" + f"⚖ {self.__class__.__name__}.diff(): difference between two versions\n" f"🔗 {self.__class__.__name__}.ontology: Pronto.Ontology object" ) # fmt: on @@ -374,7 +377,7 @@ def inspect( >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] >>> gene_bt.inspect(gene_symbols, field=gene_bt.symbol) """ - from lamin_logger._inspect import inspect + from lamin_utils._inspect import inspect return inspect( df=self._df, @@ -426,7 +429,7 @@ def map_synonyms( >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] >>> standardized_symbols = gene_bt.map_synonyms(gene_symbols, gene_bt.symbol) """ - from lamin_logger._map_synonyms import map_synonyms + from lamin_utils._map_synonyms import map_synonyms return map_synonyms( df=self._df, @@ -489,7 +492,7 @@ def 
def diff(self, compare_to: "Bionty", **kwargs) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Determine the differences between two versions of the same ontology.

    Neither `self` nor `compare_to` is modified: the comparison runs on
    copies of the frames returned by `df()`.

    Args:
        compare_to: Bionty object to compare against. It must be of the same
            class and use the same source as the calling object, but a
            different version.
        kwargs: Are passed to pd.DataFrame.compare()

    Returns:
        A tuple of two DataFrames:
        1. Rows that are not duplicated when both versions are concatenated,
           i.e. entries present in only one version and both variants of
           entries whose values changed.
        2. A pd.DataFrame.compare result, restricted to the index entries
           shared by both versions, which denotes all changes in `self`
           and `other`.

    Raises:
        ValueError: If the two objects differ in class or source, or if they
            have the same version.

    Examples:
        >>> import bionty as bt
        >>> disease_bt_1 = bt.Disease(source="mondo", version="2023-04-04")
        >>> disease_bt_2 = bt.Disease(source="mondo", version="2023-02-06")
        >>> new_entries, modified_entries = disease_bt_1.diff(disease_bt_2)
        >>> print(new_entries.head())
        >>> print(modified_entries.head())
    """
    if type(self) is not type(compare_to):
        raise ValueError("Both Bionty objects must be of the same class.")

    if self.source != compare_to.source:
        raise ValueError("Both Bionty objects must use the same source.")

    if self.version == compare_to.version:
        raise ValueError("The versions of the Bionty objects must differ.")

    def _arrays_to_tuples(frame: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `frame` whose NumPy array cells are tuples."""
        frame = frame.copy()
        for column in frame.columns:
            values = frame[column]
            if any(isinstance(val, np.ndarray) for val in values):
                frame[column] = values.apply(
                    lambda val: tuple(val) if isinstance(val, np.ndarray) else val
                )
        return frame

    # The 'parents' column (among potentially others) contains NumPy array
    # values, which are neither hashable nor comparable element-wise by pandas.
    # They are converted to tuples on private copies, so that the caller's
    # objects stay untouched and nothing has to be written back into `_df`.
    self_df = _arrays_to_tuples(self.df())
    compare_to_df = _arrays_to_tuples(compare_to.df())

    # keep=False drops every row that occurs more than once, which leaves only
    # the rows that are not identical in both versions.
    new_entries = pd.concat([self_df, compare_to_df]).drop_duplicates(keep=False)

    # Changes in existing entries: compare() needs identically labelled frames,
    # hence both sides are restricted to the shared index first.
    common_index = self_df.index.intersection(compare_to_df.index)
    modified_entries = self_df.loc[common_index].compare(
        compare_to_df.loc[common_index], **kwargs
    )

    # NOTE(review): these messages go through the stdlib root logger, while the
    # module also imports `logger` from lamin_utils — confirm which is intended.
    logging.info("%s new entries were added.", len(new_entries))
    logging.info("%s entries were modified.", len(modified_entries))

    return new_entries, modified_entries
versions yaml file. - return_df: Whether to return a Pandas DataFrame Returns: - entity diff --git a/bionty/entities/_experimentalfactor.py b/bionty/entities/_experimentalfactor.py index e397238a..6b5c828b 100644 --- a/bionty/entities/_experimentalfactor.py +++ b/bionty/entities/_experimentalfactor.py @@ -2,7 +2,7 @@ from typing import Dict, Literal, Optional import pandas as pd -from lamin_logger import logger +from lamin_utils import logger from bionty.entities._shared_docstrings import _doc_params, species_removed diff --git a/docs/guide/ontology.ipynb b/docs/guide/ontology.ipynb index ef96bd9e..e6a69127 100644 --- a/docs/guide/ontology.ipynb +++ b/docs/guide/ontology.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "053a464e-df2f-4386-b537-4f0a4fb46408", "metadata": {}, @@ -23,7 +22,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1a61c7de", "metadata": {}, @@ -88,7 +86,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "4cb4718b", "metadata": {}, @@ -107,7 +104,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "58cd007f", "metadata": {}, @@ -126,7 +122,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "66ba4cba", "metadata": {}, @@ -179,7 +174,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "be4cab83", "metadata": {}, @@ -188,7 +182,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "879230a6", "metadata": {}, @@ -230,7 +223,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "689b1622", "metadata": {}, @@ -279,7 +271,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "9e501882", "metadata": {}, @@ -296,6 +287,69 @@ "source": [ "lookup.tumor_size" ] + }, + { + "cell_type": "markdown", + "id": "76e2e2f9-19c0-46e2-97f6-4487428ed1c3", + "metadata": {}, + "source": [ + "## Comparing ontology versions" + ] + }, + { + "cell_type": "markdown", + "id": "afa16569-5f98-41ec-981c-db1b81fbbe91", + "metadata": {}, + 
def test_diff_successful():
    """Diff two MONDO releases and check the size of both result frames."""
    disease_bt_1 = bt.Disease(source="mondo", version="2023-04-04")
    disease_bt_2 = bt.Disease(source="mondo", version="2023-02-06")

    new_entries, modified_entries = disease_bt_1.diff(disease_bt_2)
    # The expected counts are pinned to exactly these two MONDO releases.
    assert len(new_entries) == 819
    assert len(modified_entries) == 249


def test_diff_value_errors():
    """Each invalid pairing must be rejected by its own validation check.

    `match` pins the error message, so a ValueError raised for a different
    reason cannot make a case pass by accident.
    """
    # Two different Bionty object types
    disease_bt = bt.Disease()
    phenotype_bt = bt.Phenotype()
    with pytest.raises(ValueError, match="must be of the same class"):
        disease_bt.diff(phenotype_bt)

    # Different sources
    disease_bt_1 = bt.Disease(source="mondo")
    disease_bt_2 = bt.Disease(source="doid")
    with pytest.raises(ValueError, match="must use the same source"):
        disease_bt_1.diff(disease_bt_2)

    # Same version
    disease_bt_3 = bt.Disease(source="mondo", version="2023-04-04")
    disease_bt_4 = bt.Disease(source="mondo", version="2023-04-04")
    with pytest.raises(ValueError, match="versions of the Bionty objects must differ"):
        disease_bt_3.diff(disease_bt_4)