Skip to content

Commit

Permalink
improved table creation
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Sep 5, 2024
1 parent 866ec4f commit 5b1c4ee
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/pyphetools/visualization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .detailed_suppl_table import DetailedSupplTable
from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter
from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter, HpoCohortCount
from .focus_count_table import FocusCountTable
from .hpoa_table_creator import HpoaTableCreator, HpoaTableBuilder
from .individual_table import IndividualTable
Expand Down
54 changes: 45 additions & 9 deletions src/pyphetools/visualization/disease_specific_hpo_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..pp.v202 import Phenopacket as Phenopacket202
from ..pp.v202 import OntologyClass as OntologyClass202

TARGET_DISEASE_ID = "MONDO:0000001"

class HpoCohortCount:
"""
Expand Down Expand Up @@ -56,7 +57,19 @@ def frequency_for_disease(self, disease: OntologyClass202) -> str:
return "n/a" # no information available for this
else:
percentage = 100 * obs / total
return f"{obs}/{total} ({percentage}%)"
return f"{obs}/{total} ({int(percentage)}%)"


def frequency_for_target_disease(self, disease: OntologyClass202) -> str:
exc = self._d_to_excluded[disease]
obs = self._d_to_observed[disease]
total = exc + obs
if total == 0:
return "n/a" # no information available for this
elif obs == 1:
return f"observed"
else:
return "excluded"

def __str__(self):
items = list()
Expand All @@ -69,7 +82,6 @@ def get_maximum_frequency(self):
We sort with a heuristic that rewards at least one disease with a high frequency AND an overall high number of observed counts.
"""
frequencies = list() ## here we do not care about the actual order
total = self.get_total() ## total number of observations
for k, v in self._d_to_observed.items():
if v == 0:
frequencies.append(0)
Expand All @@ -80,18 +92,29 @@ def get_maximum_frequency(self):
frequencies.append(obs/total)
if len(frequencies) == 0:
return 0
return total * max(frequencies)
return max(frequencies)

def get_weighted_maximum_frequency(self):
"""
Heuristic for sorting - maximum frequency times toit
"""
total = self.get_total() ## total number of observations
max_f = self.get_maximum_frequency()
return total * max_f



class DiseaseSpecificHpoCounter:

def __init__(self,
ppkt_list: typing.List[PPKt.Phenopacket],
target_ppkt: PPKt.Phenopacket = None,
hpo: hpotk.MinimalOntology = None) -> None:
"""
:param ppkt_list: List of Phenopackets we wish to display as a table of HPO term counts
:type ppkt_list: typing.list[PPKt.Phenopacket]
:target_ppkt: Phenopacket of the individual that we wish to compare with the cohort that is represented in ppkt_list. Optional
:type target_ppkt: typing.Optional[PPKt.Phenopacket]
:param hpo: Reference to HPO ontology object (if nulll, will be created in constructor)
:type hpo: hpotk.MinimalOntology
"""
Expand All @@ -107,16 +130,24 @@ def __init__(self,
raise ValueError(f"This class does not support visualization of phenopackets with more than one disease diagnosis")
disease_term = ppkt.diseases[0]
disease_dict[disease_term.term].append(ppkt)
if target_ppkt is not None:
## We want to show the target as a separate column.
## use this term as a marker, it will not be displayed
oclzz = OntologyClass202(id=TARGET_DISEASE_ID, label=target_ppkt.id)
disease_dict[oclzz].append(target_ppkt)
hpo_to_counter = dict()
self._hpo_term_ids_for_display = set()
warn_terms = set() ## to avoid making the same error message multiple times
# The following for loop extracts data for each disease one at a time.
for disease_term, ppkt_list in disease_dict.items():
for ppkt in ppkt_list:
# We count not only explicitly annotated terms but also the ancestor of observed terms
# and descendents of excluded terms.
# Note that we keep track of explicitly annotated terms and these are the only ones we show in the output table
observed_with_ancestors = set()
excluded_with_descendants = set()
for pf in ppkt.phenotypic_features:
oclzz = pf.type
hpo_id = oclzz.id
hpo_term = self._hpo.get_term(oclzz.id)
if hpo_term.identifier.value != oclzz.id :
if hpo_term.identifier.value not in warn_terms:
Expand All @@ -125,8 +156,8 @@ def __init__(self,
print(f"Use of outdated id {oclzz.id} ({oclzz.label}). Replacing with {hpo_term.identifier.value}.")
print("###################################")
oclzz = OntologyClass202(id=hpo_term.identifier.value, label=hpo_term.name)
hpo_id = oclzz.id
self._hpo_term_ids_for_display.add(oclzz.id)
hpo_id = oclzz.id
self._hpo_term_ids_for_display.add(hpo_id)
if pf.excluded:
desc_set = self._hpo.graph.get_descendants(hpo_id, include_source=True)
excluded_with_descendants.update(desc_set)
Expand All @@ -139,7 +170,7 @@ def __init__(self,
if oclzz not in hpo_to_counter:
hpo_to_counter[oclzz] = HpoCohortCount(hpo=oclzz)
hpo_to_counter.get(oclzz).increment_observed(disease_term)
for hpo_term in excluded_with_descendants:
for hpo_id in excluded_with_descendants:
hpo_label = self._hpo.get_term_name(hpo_id)
oclzz = OntologyClass202(id=hpo_id, label=hpo_label)
if oclzz not in hpo_to_counter:
Expand All @@ -163,9 +194,14 @@ def to_data_frame(self) -> pd.DataFrame:
d = dict()
d["HPO"] = hpo2c.hpo_display
for disease in self._disease_list:
d[disease.label] = hpo2c.frequency_for_disease(disease)
if disease.id == TARGET_DISEASE_ID:
d[disease.label] = hpo2c.frequency_for_target_disease(disease)
else:
d[disease.label] = hpo2c.frequency_for_disease(disease)
items.append(d)
return pd.DataFrame(items)
df = pd.DataFrame(items)
df_reset = df.reset_index(drop=True) # the index is irrelevant
return df_reset



Expand Down
25 changes: 25 additions & 0 deletions src/pyphetools/visualization/phenopacket_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,29 @@ def get_phenopacket_dictionary(self) -> typing.Dict:
def get_phenopacket_list(self) -> typing.List:
ppktd = self.get_phenopacket_dictionary()
return list(ppktd.values())


def _ingest(self, indir="phenopackets", recursive:bool=False, disease_id:str=None):
for file in os.listdir(indir):
fname = os.path.join(indir, file)
if fname.endswith(".json") and os.path.isfile(fname):
with open(fname) as f:
data = f.read()
jsondata = json.loads(data)
ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket())
if disease_id is not None:
if not PhenopacketIngestor.has_disease_id(ppkt=ppack, disease_id=disease_id):
continue
self._phenopackets.append(ppack)


def ingest_from_directory(self, indir:str):
return self._ingest(indir=indir)

def ingest_from_file(self, json_file:str) -> PPKt.Phenopacket:
with open(json_file) as f:
data = f.read()
jsondata = json.loads(data)
ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket())
return ppack

38 changes: 38 additions & 0 deletions test/test_disease_specific_hpo_counter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest

from pyphetools.visualization import DiseaseSpecificHpoCounter, HpoCohortCount
from pyphetools.pp.v202 import OntologyClass as OntologyClass202

class TestDiseaseSpecificHpoCounter:

def test_HpoCohortCount(self):
oclzz = OntologyClass202(id="HP:0001288", label="Gait disturbance")
eri1 = OntologyClass202(id='OMIM:608739', label='ERI1-related disease')
eri2 = OntologyClass202(id='OMIM:608742', label='ERI2-related disease') # fake
eri3 = OntologyClass202(id='OMIM:608744', label='ERI3-related disease') # fake
hpo_counter = HpoCohortCount(hpo=oclzz)
assert hpo_counter is not None
hpo_counter.increment_observed(eri1)
assert hpo_counter.get_observed(eri1) == 1
hpo_counter.increment_observed(eri1)
hpo_counter.increment_observed(eri1)
assert hpo_counter.get_observed(eri1) == 3
hpo_counter.increment_excluded(eri1)
assert hpo_counter.get_observed(eri1) == 3
assert hpo_counter.get_excluded(eri1) == 1
hpo_counter.increment_observed(eri2)
hpo_counter.increment_observed(eri2)
hpo_counter.increment_excluded(eri2)
hpo_counter.increment_excluded(eri2)
hpo_counter.increment_excluded(eri2)
assert hpo_counter.get_observed(eri2) == 2
assert hpo_counter.get_excluded(eri2) == 3
assert hpo_counter.get_observed(eri1) == 3
assert hpo_counter.get_excluded(eri1) == 1
assert hpo_counter.get_observed(eri3) == 0 ## we did not add frequencies for this disease
assert hpo_counter.frequency_for_disease(eri1) == "3/4 (75%)"
assert hpo_counter.frequency_for_disease(eri2) == "2/5 (40%)"
assert hpo_counter.frequency_for_disease(eri3) == "n/a" # no information available for eri3
assert hpo_counter.get_maximum_frequency() == 0.75


0 comments on commit 5b1c4ee

Please sign in to comment.