From 5b1c4eef3d9c03865f2a7640578af459d6d96377 Mon Sep 17 00:00:00 2001
From: Peter Robinson <peter.robinson@jax.org>
Date: Thu, 5 Sep 2024 10:23:00 +0200
Subject: [PATCH] improved table creation

---
 src/pyphetools/visualization/__init__.py      |  2 +-
 .../disease_specific_hpo_counter.py           | 54 +++++++++++++++----
 .../visualization/phenopacket_ingestor.py     | 25 +++++++++
 test/test_disease_specific_hpo_counter.py     | 38 +++++++++++++
 4 files changed, 109 insertions(+), 10 deletions(-)
 create mode 100644 test/test_disease_specific_hpo_counter.py

diff --git a/src/pyphetools/visualization/__init__.py b/src/pyphetools/visualization/__init__.py
index 12afa64..4dda561 100644
--- a/src/pyphetools/visualization/__init__.py
+++ b/src/pyphetools/visualization/__init__.py
@@ -1,5 +1,5 @@
 from .detailed_suppl_table import DetailedSupplTable
-from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter
+from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter, HpoCohortCount
 from .focus_count_table import FocusCountTable
 from .hpoa_table_creator import HpoaTableCreator, HpoaTableBuilder
 from .individual_table import IndividualTable
diff --git a/src/pyphetools/visualization/disease_specific_hpo_counter.py b/src/pyphetools/visualization/disease_specific_hpo_counter.py
index 4e20560..18ffe46 100644
--- a/src/pyphetools/visualization/disease_specific_hpo_counter.py
+++ b/src/pyphetools/visualization/disease_specific_hpo_counter.py
@@ -7,6 +7,7 @@
 from ..pp.v202 import Phenopacket as Phenopacket202
 from ..pp.v202 import OntologyClass as OntologyClass202
 
+TARGET_DISEASE_ID = "MONDO:0000001"
 
 class HpoCohortCount:
     """
@@ -56,7 +57,19 @@ def frequency_for_disease(self, disease: OntologyClass202) -> str:
             return "n/a" # no information available for this
         else:
             percentage = 100 * obs / total
-            return f"{obs}/{total} ({percentage}%)"
+            return f"{obs}/{total} ({int(percentage)}%)"
+        
+
+    def frequency_for_target_disease(self, disease: OntologyClass202) -> str:
+        exc = self._d_to_excluded[disease]
+        obs = self._d_to_observed[disease]
+        total = exc + obs
+        if total == 0:
+            return "n/a" # no information available for this
+        elif obs == 1:
+            return f"observed"
+        else:
+            return "excluded"
         
     def __str__(self):
         items = list()
@@ -69,7 +82,6 @@ def get_maximum_frequency(self):
         We sort with a heuristic that rewards at least one disease with a high frequency AND an overall high number of observed counts.
         """
         frequencies = list() ## here we do not care about the actual order
-        total = self.get_total() ## total number of observations
         for k, v in self._d_to_observed.items():
             if v == 0:
                 frequencies.append(0)
@@ -80,7 +92,15 @@ def get_maximum_frequency(self):
                 frequencies.append(obs/total)
         if len(frequencies) == 0:
             return 0
-        return total * max(frequencies)
+        return max(frequencies)
+    
+    def get_weighted_maximum_frequency(self):
+        """
+        Heuristic for sorting - maximum frequency times the total number of observations
+        """
+        total = self.get_total() ## total number of observations
+        max_f = self.get_maximum_frequency()
+        return total * max_f
 
 
 
@@ -88,10 +108,13 @@ class DiseaseSpecificHpoCounter:
 
     def __init__(self, 
                  ppkt_list: typing.List[PPKt.Phenopacket],
+                 target_ppkt: PPKt.Phenopacket = None,
                  hpo: hpotk.MinimalOntology = None) -> None:
         """
         :param ppkt_list: List of Phenopackets we wish to display as a table of HPO term counts
         :type ppkt_list: typing.list[PPKt.Phenopacket]
+        :param target_ppkt: Phenopacket of the individual that we wish to compare with the cohort that is represented in ppkt_list (optional).
+        :type target_ppkt: typing.Optional[PPKt.Phenopacket]
         :param hpo: Reference to HPO ontology object (if nulll, will be created in constructor)
         :type hpo: hpotk.MinimalOntology
         """
@@ -107,16 +130,24 @@ def __init__(self,
                 raise ValueError(f"This class does not support visualization of phenopackets with more than one disease diagnosis")
             disease_term = ppkt.diseases[0]
             disease_dict[disease_term.term].append(ppkt)
+        if target_ppkt is not None:
+            ## We want to show the target as a separate column.
+            ## TARGET_DISEASE_ID serves only as a marker id for that column; the id itself is not displayed.
+            oclzz = OntologyClass202(id=TARGET_DISEASE_ID, label=target_ppkt.id) 
+            disease_dict[oclzz].append(target_ppkt)
         hpo_to_counter = dict()
         self._hpo_term_ids_for_display = set()
         warn_terms = set() ## to avoid making the same error message multiple times
+        # The following for loop extracts data for each disease one at a time.
         for disease_term, ppkt_list in disease_dict.items():
             for ppkt in ppkt_list:
+                # We count not only explicitly annotated terms but also the ancestors of observed terms
+                # and the descendants of excluded terms.
+                # Note that we keep track of the explicitly annotated terms; these are the only ones shown in the output table.
                 observed_with_ancestors = set()
                 excluded_with_descendants = set()
                 for pf in ppkt.phenotypic_features:
                     oclzz = pf.type
-                    hpo_id = oclzz.id
                     hpo_term = self._hpo.get_term(oclzz.id)
                     if hpo_term.identifier.value != oclzz.id :
                         if hpo_term.identifier.value not in warn_terms:
@@ -125,8 +156,8 @@ def __init__(self,
                             print(f"Use of outdated id {oclzz.id} ({oclzz.label}). Replacing with {hpo_term.identifier.value}.")
                             print("###################################") 
                         oclzz = OntologyClass202(id=hpo_term.identifier.value, label=hpo_term.name)
-                        hpo_id = oclzz.id
-                    self._hpo_term_ids_for_display.add(oclzz.id)
+                    hpo_id = oclzz.id
+                    self._hpo_term_ids_for_display.add(hpo_id)
                     if pf.excluded:
                         desc_set = self._hpo.graph.get_descendants(hpo_id, include_source=True)
                         excluded_with_descendants.update(desc_set)
@@ -139,7 +170,7 @@ def __init__(self,
                     if oclzz not in hpo_to_counter:
                         hpo_to_counter[oclzz] = HpoCohortCount(hpo=oclzz)
                     hpo_to_counter.get(oclzz).increment_observed(disease_term)
-                for hpo_term in excluded_with_descendants:
+                for hpo_id in excluded_with_descendants:
                     hpo_label = self._hpo.get_term_name(hpo_id)
                     oclzz = OntologyClass202(id=hpo_id, label=hpo_label)
                     if oclzz not in hpo_to_counter:
@@ -163,9 +194,14 @@ def to_data_frame(self) -> pd.DataFrame:
             d = dict()
             d["HPO"] = hpo2c.hpo_display
             for disease in self._disease_list:
-                d[disease.label] = hpo2c.frequency_for_disease(disease)
+                if disease.id == TARGET_DISEASE_ID:
+                    d[disease.label] = hpo2c.frequency_for_target_disease(disease)
+                else:
+                    d[disease.label] = hpo2c.frequency_for_disease(disease)
             items.append(d)
-        return pd.DataFrame(items)
+        df = pd.DataFrame(items)
+        df_reset = df.reset_index(drop=True) # the index is irrelevant
+        return df_reset
 
         
         
diff --git a/src/pyphetools/visualization/phenopacket_ingestor.py b/src/pyphetools/visualization/phenopacket_ingestor.py
index 156d417..ba0c8d8 100644
--- a/src/pyphetools/visualization/phenopacket_ingestor.py
+++ b/src/pyphetools/visualization/phenopacket_ingestor.py
@@ -68,4 +68,29 @@ def get_phenopacket_dictionary(self) -> typing.Dict:
     def get_phenopacket_list(self) -> typing.List:
         ppktd = self.get_phenopacket_dictionary()
         return list(ppktd.values())
+    
+
+    def _ingest(self, indir="phenopackets", recursive:bool=False, disease_id:str=None):
+        for file in os.listdir(indir):
+            fname = os.path.join(indir, file)
+            if fname.endswith(".json") and os.path.isfile(fname):
+                with open(fname) as f:
+                    data = f.read()
+                    jsondata = json.loads(data)
+                    ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket())
+                    if disease_id is not None:
+                        if not PhenopacketIngestor.has_disease_id(ppkt=ppack, disease_id=disease_id):
+                            continue
+                    self._phenopackets.append(ppack)
+
+
+    def ingest_from_directory(self, indir:str):
+        return self._ingest(indir=indir)
+    
+    def ingest_from_file(self, json_file:str) -> PPKt.Phenopacket:
+         with open(json_file) as f:
+            data = f.read()
+            jsondata = json.loads(data)
+            ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket())
+            return ppack
 
diff --git a/test/test_disease_specific_hpo_counter.py b/test/test_disease_specific_hpo_counter.py
new file mode 100644
index 0000000..4188a36
--- /dev/null
+++ b/test/test_disease_specific_hpo_counter.py
@@ -0,0 +1,38 @@
+import pytest
+
+from pyphetools.visualization import DiseaseSpecificHpoCounter, HpoCohortCount
+from pyphetools.pp.v202 import OntologyClass as OntologyClass202
+
+class TestDiseaseSpecificHpoCounter:
+
+    def test_HpoCohortCount(self):
+        oclzz = OntologyClass202(id="HP:0001288", label="Gait disturbance")
+        eri1 = OntologyClass202(id='OMIM:608739', label='ERI1-related disease')
+        eri2 = OntologyClass202(id='OMIM:608742', label='ERI2-related disease') # fake
+        eri3 = OntologyClass202(id='OMIM:608744', label='ERI3-related disease') # fake
+        hpo_counter = HpoCohortCount(hpo=oclzz)
+        assert hpo_counter is not None
+        hpo_counter.increment_observed(eri1)
+        assert hpo_counter.get_observed(eri1) == 1
+        hpo_counter.increment_observed(eri1)
+        hpo_counter.increment_observed(eri1)
+        assert hpo_counter.get_observed(eri1) == 3
+        hpo_counter.increment_excluded(eri1)
+        assert hpo_counter.get_observed(eri1) == 3
+        assert hpo_counter.get_excluded(eri1) == 1
+        hpo_counter.increment_observed(eri2)
+        hpo_counter.increment_observed(eri2)
+        hpo_counter.increment_excluded(eri2)
+        hpo_counter.increment_excluded(eri2)
+        hpo_counter.increment_excluded(eri2)
+        assert hpo_counter.get_observed(eri2) == 2
+        assert hpo_counter.get_excluded(eri2) == 3
+        assert hpo_counter.get_observed(eri1) == 3
+        assert hpo_counter.get_excluded(eri1) == 1
+        assert hpo_counter.get_observed(eri3) == 0 ## we did not add frequencies for this disease
+        assert hpo_counter.frequency_for_disease(eri1) == "3/4 (75%)"
+        assert hpo_counter.frequency_for_disease(eri2) == "2/5 (40%)"
+        assert hpo_counter.frequency_for_disease(eri3) == "n/a" # no information available for eri3
+        assert hpo_counter.get_maximum_frequency() == 0.75
+
+