From 5b1c4eef3d9c03865f2a7640578af459d6d96377 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Thu, 5 Sep 2024 10:23:00 +0200 Subject: [PATCH] improved table creation --- src/pyphetools/visualization/__init__.py | 2 +- .../disease_specific_hpo_counter.py | 54 +++++++++++++++---- .../visualization/phenopacket_ingestor.py | 25 +++++++++ test/test_disease_specific_hpo_counter.py | 38 +++++++++++++ 4 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 test/test_disease_specific_hpo_counter.py diff --git a/src/pyphetools/visualization/__init__.py b/src/pyphetools/visualization/__init__.py index 12afa64..4dda561 100644 --- a/src/pyphetools/visualization/__init__.py +++ b/src/pyphetools/visualization/__init__.py @@ -1,5 +1,5 @@ from .detailed_suppl_table import DetailedSupplTable -from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter +from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter, HpoCohortCount from .focus_count_table import FocusCountTable from .hpoa_table_creator import HpoaTableCreator, HpoaTableBuilder from .individual_table import IndividualTable diff --git a/src/pyphetools/visualization/disease_specific_hpo_counter.py b/src/pyphetools/visualization/disease_specific_hpo_counter.py index 4e20560..18ffe46 100644 --- a/src/pyphetools/visualization/disease_specific_hpo_counter.py +++ b/src/pyphetools/visualization/disease_specific_hpo_counter.py @@ -7,6 +7,7 @@ from ..pp.v202 import Phenopacket as Phenopacket202 from ..pp.v202 import OntologyClass as OntologyClass202 +TARGET_DISEASE_ID = "MONDO:0000001" class HpoCohortCount: """ @@ -56,7 +57,19 @@ def frequency_for_disease(self, disease: OntologyClass202) -> str: return "n/a" # no information available for this else: percentage = 100 * obs / total - return f"{obs}/{total} ({percentage}%)" + return f"{obs}/{total} ({int(percentage)}%)" + + + def frequency_for_target_disease(self, disease: OntologyClass202) -> str: + exc = self._d_to_excluded[disease] + obs = self._d_to_observed[disease] + total = exc + obs + if total == 0: + return "n/a" # no information available for this + elif obs == 1: + return f"observed" + else: + return "excluded" def __str__(self): items = list() @@ -69,7 +82,6 @@ def get_maximum_frequency(self): We sort with a heuristic that rewards at least one disease with a high frequency AND an overall high number of observed counts. """ frequencies = list() ## here we do not care about the actual order - total = self.get_total() ## total number of observations for k, v in self._d_to_observed.items(): if v == 0: frequencies.append(0) @@ -80,7 +92,15 @@ def get_maximum_frequency(self): frequencies.append(obs/total) if len(frequencies) == 0: return 0 - return total * max(frequencies) + return max(frequencies) + + def get_weighted_maximum_frequency(self): + """ + Heuristic for sorting - maximum frequency times toit + """ + total = self.get_total() ## total number of observations + max_f = self.get_maximum_frequency() + return total * max_f @@ -88,10 +108,13 @@ class DiseaseSpecificHpoCounter: def __init__(self, ppkt_list: typing.List[PPKt.Phenopacket], + target_ppkt: PPKt.Phenopacket = None, hpo: hpotk.MinimalOntology = None) -> None: """ :param ppkt_list: List of Phenopackets we wish to display as a table of HPO term counts :type ppkt_list: typing.list[PPKt.Phenopacket] + :target_ppkt: Phenopacket of the individual that we wish to compare with the cohort that is represented in ppkt_list. Optional + :type target_ppkt: typing.Optional[PPKt.Phenopacket] :param hpo: Reference to HPO ontology object (if nulll, will be created in constructor) :type hpo: hpotk.MinimalOntology """ @@ -107,16 +130,24 @@ def __init__(self, raise ValueError(f"This class does not support visualization of phenopackets with more than one disease diagnosis") disease_term = ppkt.diseases[0] disease_dict[disease_term.term].append(ppkt) + if target_ppkt is not None: + ## We want to show the target as a separate column. + ## use this term as a marker, it will not be displayed + oclzz = OntologyClass202(id=TARGET_DISEASE_ID, label=target_ppkt.id) + disease_dict[oclzz].append(target_ppkt) hpo_to_counter = dict() self._hpo_term_ids_for_display = set() warn_terms = set() ## to avoid making the same error message multiple times + # The following for loop extracts data for each disease one at a time. for disease_term, ppkt_list in disease_dict.items(): for ppkt in ppkt_list: + # We count not only explicitly annotated terms but also the ancestor of observed terms + # and descendents of excluded terms. + # Note that we keep track of explicitly annotated terms and these are the only ones we show in the output table observed_with_ancestors = set() excluded_with_descendants = set() for pf in ppkt.phenotypic_features: oclzz = pf.type - hpo_id = oclzz.id hpo_term = self._hpo.get_term(oclzz.id) if hpo_term.identifier.value != oclzz.id : if hpo_term.identifier.value not in warn_terms: @@ -125,8 +156,8 @@ def __init__(self, print(f"Use of outdated id {oclzz.id} ({oclzz.label}). Replacing with {hpo_term.identifier.value}.") print("###################################") oclzz = OntologyClass202(id=hpo_term.identifier.value, label=hpo_term.name) - hpo_id = oclzz.id - self._hpo_term_ids_for_display.add(oclzz.id) + hpo_id = oclzz.id + self._hpo_term_ids_for_display.add(hpo_id) if pf.excluded: desc_set = self._hpo.graph.get_descendants(hpo_id, include_source=True) excluded_with_descendants.update(desc_set) @@ -139,7 +170,7 @@ def __init__(self, if oclzz not in hpo_to_counter: hpo_to_counter[oclzz] = HpoCohortCount(hpo=oclzz) hpo_to_counter.get(oclzz).increment_observed(disease_term) - for hpo_term in excluded_with_descendants: + for hpo_id in excluded_with_descendants: hpo_label = self._hpo.get_term_name(hpo_id) oclzz = OntologyClass202(id=hpo_id, label=hpo_label) if oclzz not in hpo_to_counter: @@ -163,9 +194,14 @@ def to_data_frame(self) -> pd.DataFrame: d = dict() d["HPO"] = hpo2c.hpo_display for disease in self._disease_list: - d[disease.label] = hpo2c.frequency_for_disease(disease) + if disease.id == TARGET_DISEASE_ID: + d[disease.label] = hpo2c.frequency_for_target_disease(disease) + else: + d[disease.label] = hpo2c.frequency_for_disease(disease) items.append(d) - return pd.DataFrame(items) + df = pd.DataFrame(items) + df_reset = df.reset_index(drop=True) # the index is irrelevant + return df_reset diff --git a/src/pyphetools/visualization/phenopacket_ingestor.py b/src/pyphetools/visualization/phenopacket_ingestor.py index 156d417..ba0c8d8 100644 --- a/src/pyphetools/visualization/phenopacket_ingestor.py +++ b/src/pyphetools/visualization/phenopacket_ingestor.py @@ -68,4 +68,29 @@ def get_phenopacket_dictionary(self) -> typing.Dict: def get_phenopacket_list(self) -> typing.List: ppktd = self.get_phenopacket_dictionary() return list(ppktd.values()) + + + def _ingest(self, indir="phenopackets", recursive:bool=False, disease_id:str=None): + for file in os.listdir(indir): + fname = os.path.join(indir, file) + if fname.endswith(".json") and os.path.isfile(fname): + with open(fname) as f: + data = f.read() + jsondata = json.loads(data) + ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket()) + if disease_id is not None: + if not PhenopacketIngestor.has_disease_id(ppkt=ppack, disease_id=disease_id): + continue + self._phenopackets.append(ppack) + + + def ingest_from_directory(self, indir:str): + return self._ingest(indir=indir) + + def ingest_from_file(self, json_file:str) -> PPKt.Phenopacket: + with open(json_file) as f: + data = f.read() + jsondata = json.loads(data) + ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket()) + return ppack diff --git a/test/test_disease_specific_hpo_counter.py b/test/test_disease_specific_hpo_counter.py new file mode 100644 index 0000000..4188a36 --- /dev/null +++ b/test/test_disease_specific_hpo_counter.py @@ -0,0 +1,38 @@ +import pytest + +from pyphetools.visualization import DiseaseSpecificHpoCounter, HpoCohortCount +from pyphetools.pp.v202 import OntologyClass as OntologyClass202 + +class TestDiseaseSpecificHpoCounter: + + def test_HpoCohortCount(self): + oclzz = OntologyClass202(id="HP:0001288", label="Gait disturbance") + eri1 = OntologyClass202(id='OMIM:608739', label='ERI1-related disease') + eri2 = OntologyClass202(id='OMIM:608742', label='ERI2-related disease') # fake + eri3 = OntologyClass202(id='OMIM:608744', label='ERI3-related disease') # fake + hpo_counter = HpoCohortCount(hpo=oclzz) + assert hpo_counter is not None + hpo_counter.increment_observed(eri1) + assert hpo_counter.get_observed(eri1) == 1 + hpo_counter.increment_observed(eri1) + hpo_counter.increment_observed(eri1) + assert hpo_counter.get_observed(eri1) == 3 + hpo_counter.increment_excluded(eri1) + assert hpo_counter.get_observed(eri1) == 3 + assert hpo_counter.get_excluded(eri1) == 1 + hpo_counter.increment_observed(eri2) + hpo_counter.increment_observed(eri2) + hpo_counter.increment_excluded(eri2) + hpo_counter.increment_excluded(eri2) + hpo_counter.increment_excluded(eri2) + assert hpo_counter.get_observed(eri2) == 2 + assert hpo_counter.get_excluded(eri2) == 3 + assert hpo_counter.get_observed(eri1) == 3 + assert hpo_counter.get_excluded(eri1) == 1 + assert hpo_counter.get_observed(eri3) == 0 ## we did not add frequencies for this disease + assert hpo_counter.frequency_for_disease(eri1) == "3/4 (75%)" + assert hpo_counter.frequency_for_disease(eri2) == "2/5 (40%)" + assert hpo_counter.frequency_for_disease(eri3) == "n/a" # no information available for eri3 + assert hpo_counter.get_maximum_frequency() == 0.75 + +