diff --git a/docs/index.md b/docs/index.md index a007e5c..ea3f4d1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ from tabular data such as databases or supplemental files found in the medical l This documentation contains information about - How to use the [Excel template](user-guide/excel.md) to code clinical data -- How to use [pyphetools classes](user-guide/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets +- How to use [pyphetools classes](tabular/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets - Information for [developers](developers/developers.md) - A description of the pyphetools [API](api/overview.md) diff --git a/docs/user-guide/excel.md b/docs/user-guide/excel.md index cd597b2..1fe3d61 100644 --- a/docs/user-guide/excel.md +++ b/docs/user-guide/excel.md @@ -2,7 +2,7 @@ We have designed a format for Excel templates that can be used to quickly and efficiently generate collections of Phenopackets. This is currently the prefered way for clinicians and translational researchers to contribute to this project. The [pyphetools](https://github.com/monarch-initiative/pyphetools){:target="_blank"} library provides other means for bioinformaticians (please ask us). -The template can be downloaded [here](../_static/template.xlsx){:target="_blank"}. +The template file is generated for each disease as described in [template](template.md). # A format for cohort descriptions in excel diff --git a/docs/user-guide/variant_notation.md b/docs/user-guide/variant_notation.md index 85a5e62..8c5f314 100644 --- a/docs/user-guide/variant_notation.md +++ b/docs/user-guide/variant_notation.md @@ -1,6 +1,6 @@ # Variant Notation -We recommend that users choose one transcript for all HGVS variant descriptions in a project. In general, the most clinicallz relevant transcript should be chosen. +We recommend that users choose one transcript for all HGVS variant descriptions in a project. In general, the most clinically relevant transcript should be chosen. ### Choosing the reference transcript for a project diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py index 33da553..65bd31d 100644 --- a/src/pyphetools/__init__.py +++ b/src/pyphetools/__init__.py @@ -5,7 +5,7 @@ from . import validation -__version__ = "0.9.105" +__version__ = "0.9.108" __all__ = [ diff --git a/src/pyphetools/creation/create_template.py b/src/pyphetools/creation/create_template.py index 2a1de40..25a9aed 100644 --- a/src/pyphetools/creation/create_template.py +++ b/src/pyphetools/creation/create_template.py @@ -80,6 +80,7 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s :param gene_symbol: corresponding gene symbol, e.g., FBN1 :param transcript: transcript to be used for the HVGC nomenclature. Must be refseq with version number """ + self._qc_disease_information(disease_id=disease_id, disease_label=disease_label) H1_Headers = REQUIRED_H1_FIELDS H2_Headers = REQUIRED_H2_FIELDS if len(H1_Headers) != len(H2_Headers): @@ -123,6 +124,23 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s df.to_excel(fname, index=False) print(f"Wrote Excel pyphetools template file to {fname}") + def _qc_disease_information(self, + disease_id:str, + disease_label:str) -> None: + """ + Check some common errors in data entry and raise an exception if the disease ID and label are not correct + """ + if ":" not in disease_id: + raise ValueError(f"Malformed disease id-not CURIE: \"{disease_id}\"") + fields = disease_id.split(":") + if len(fields) != 2: + raise ValueError(f"Malformed disease id-only one colon allowed: \"{disease_id}\"") + disease_db = fields[0] + if disease_db != "OMIM" and disease_db != "MONDO": + raise ValueError(f"Malformed disease id-did not recognize disease database: \"{disease_id}\"") + if "\t" in disease_label: + raise ValueError(f"Malformed disease label: \”{disease_label}\"") + def create_from_phenopacket(self, ppkt): """ create pyphetools templates from an individual phenopacket. diff --git a/src/pyphetools/creation/disease.py b/src/pyphetools/creation/disease.py index ee260a5..6228d79 100644 --- a/src/pyphetools/creation/disease.py +++ b/src/pyphetools/creation/disease.py @@ -16,6 +16,9 @@ def __init__(self, disease_id:str, disease_label): raise ValueError(f"Malformed disease identifier with white space: \"{disease_id}\"") if disease_label.startswith(" ") or disease_label.endswith(" "): raise ValueError(f"Malformed disease label (starts/ends with whitespace): \"{disease_label}\"") + # occasionally, copy-paste error leads to this kind of malformed label: "Developmental and epileptic encephalopathy 50\t616457\tAR\t3\t" + if "\t" in disease_label: + raise ValueError(f"Malformed disease label (contains tabs): \"{disease_label}\"") self._id = disease_id self._label = disease_label diff --git a/src/pyphetools/creation/measurements.py b/src/pyphetools/creation/measurements.py index 7836109..f044067 100644 --- a/src/pyphetools/creation/measurements.py +++ b/src/pyphetools/creation/measurements.py @@ -11,16 +11,19 @@ pg_per_l = OntologyClass202(id="UCUM:pg/L", label="picogram per liter") pg_per_ml = OntologyClass202(id="UCUM:pg/mL", label="picogram per milliliter") nmol_per_l = OntologyClass202(id="UCUM:nmol/L", label="nanomole per liter") +mmol_per_l= OntologyClass202(id="UCUM:mmol/L", label="millimole per liter") +percent = OntologyClass202(id="UCUM:%", label="percent") class Measurements: - - + """ + Convenience class with static methods to create Measurement objects for common units. + """ @staticmethod def _with_reference_range(assay: OntologyClass202, - unit: OntologyClass202, + unit: OntologyClass202, value: float, low: float, high: float) -> Measurement202: @@ -93,6 +96,27 @@ def nanomole_per_liter(code: str, high: float = None) -> Measurement202: assay = OntologyClass202(id=code, label=label) return Measurements._from_assay_and_values(assay=assay, unit=nmol_per_l, value=concentration, low=low, high=high) + + @staticmethod + def millimole_per_liter(code: str, + label: str, + concentration: float, + low: float = None, + high: float = None) -> Measurement202: + assay = OntologyClass202(id=code, label=label) + return Measurements._from_assay_and_values(assay=assay, unit=mmol_per_l, value=concentration, low=low, high=high) + + @staticmethod + def percent(code: str, + label: str, + concentration: float, + low: float = None, + high: float = None) -> Measurement202: + assay = OntologyClass202(id=code, label=label) + return Measurements._from_assay_and_values(assay=assay, unit=percent, value=concentration, low=low, high=high) + + + diff --git a/src/pyphetools/creation/promoter_variant.py b/src/pyphetools/creation/promoter_variant.py index 7fe27b8..6ca224d 100644 --- a/src/pyphetools/creation/promoter_variant.py +++ b/src/pyphetools/creation/promoter_variant.py @@ -47,6 +47,7 @@ def to_variant_interpretation(self, acmg=None) -> VariantInterpretation202: gene_descriptor = GeneDescriptor202(value_id=self._hgnc_id, symbol=self._gene_symbol) vdescriptor = VariationDescriptor202(id=self._variant_id, molecule_context=MoleculeContext202.genomic, + description=self._description, gene_context=gene_descriptor, label=self._description, structural_type=self._sequence_ontology_term) diff --git a/src/pyphetools/creation/variant_manager.py b/src/pyphetools/creation/variant_manager.py index 7ac7558..7c27843 100644 --- a/src/pyphetools/creation/variant_manager.py +++ b/src/pyphetools/creation/variant_manager.py @@ -1,7 +1,7 @@ import os import pickle import pandas as pd -from typing import List, Dict +from typing import List from collections import defaultdict from .individual import Individual from .variant_validator import VariantValidator @@ -48,6 +48,21 @@ class VariantManager: shows the other variants. These can be use to create chromosomal deletions, duplications, and inversions. Finally, the class can be used to add variants to a list of Individual objects. + If the Excel template is used, this class will be called internally and users do not need to use the code. If the + data is ingested manually, the class can be used as follows. + + gnas_symbol = "GNAS" + gnas_id = "HGNC:4392" + gnas_MANE_transcript = "NM_000516.7" + vmanager = VariantManager(df=df, + individual_column_name="individual", + transcript=gnas_MANE_transcript, + gene_id=gnas_id, + gene_symbol=gnas_symbol, + allele_1_column_name="allele_1") + + See [variant_manager](https://monarch-initiative.github.io/pyphetools/api/creation/variant_manager/) for more information. + :param df: DataFrame representing the input data :type df: pd.DataFrame :param individual_column_name: Name of the individual (patient) column @@ -99,13 +114,13 @@ def __init__(self, self._create_variant_d(overwrite) - def _format_pmid_id(self, identifier, pmid): + def _format_pmid_id(self, identifier, pmid) -> str: if pmid is not None: return f"{pmid}_{identifier}" else: return identifier - def _get_identifier_with_pmid(self, row:pd.Series): + def _get_identifier_with_pmid(self, row:pd.Series) -> str: """Get an identifier such as PMID_33087723_A2 for a daa row with PMID:33087723 and identifier within that publication A2 Identifiers such as P1 are commonly used and there is a risk of a clash with collections of phenopackets from various papers. @@ -118,7 +133,7 @@ def _get_identifier_with_pmid(self, row:pd.Series): else: return individual_id - def _create_variant_d(self, overwrite): + def _create_variant_d(self, overwrite) -> None: """ Creates a dictionary with all HGVS variants, and as a side effect creates a set with variants that are not HGVS and need to be mapped manually. This method has the following effects @@ -182,9 +197,9 @@ def _create_variant_d(self, overwrite): self._unmapped_alleles.add(v) # This allows us to use the chromosomal mappers. write_variant_pickle(name=self._gene_symbol, my_object=self._var_d) - def code_as_chromosomal_deletion(self, allele_set): + def code_as_chromosomal_deletion(self, allele_set) -> None: """ - Code as Structural variants - chromosomal deletion (to be added to self._var_d) + Code variants with the identifiers in "allele_set" as Structural variants (chromosomal deletion) :param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal deletion) """ # first check that all of the alleles are in self._unmapped_alleles @@ -200,9 +215,9 @@ def code_as_chromosomal_deletion(self, allele_set): self._unmapped_alleles.remove(allele) self._var_d[allele] = var - def code_as_chromosomal_duplication(self, allele_set): + def code_as_chromosomal_duplication(self, allele_set) -> None: """ - Code as Structural variants - chromosomal duplication (to be added to self._var_d) + Code variants with the identifiers in "allele_set" as Structural variants (chromosomal duplication) :param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal duplication) """ # first check that all of the alleles are in self._unmapped_alleles @@ -217,7 +232,7 @@ def code_as_chromosomal_duplication(self, allele_set): def code_as_chromosomal_inversion(self, allele_set) -> None: """ - Code as Structural variants - chromosomal inversion (to be added to self._var_d) + Code variants with the identifiers in "allele_set" as Structural variants (chromosomal inversion) :param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal inversion) """ # first check that all of the alleles are in self._unmapped_alleles @@ -232,8 +247,8 @@ def code_as_chromosomal_inversion(self, allele_set) -> None: def code_as_chromosomal_translocation(self, allele_set) -> None: """ - Code as Structural variants - chromosomal translocation (to be added to self._var_d) - :param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal inversion) + Code variants with the identifiers in "allele_set" as Structural variants (chromosomal translocation) + :param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal translocation) """ # first check that all of the alleles are in self._unmapped_alleles if not allele_set.issubset(self._unmapped_alleles): diff --git a/src/pyphetools/pp/v202/_base.py b/src/pyphetools/pp/v202/_base.py index ca97889..efae2f9 100644 --- a/src/pyphetools/pp/v202/_base.py +++ b/src/pyphetools/pp/v202/_base.py @@ -66,10 +66,16 @@ def iso_to_days(iso_age:str) -> int: y = age.find("Y") if y != -1: days = days + int(365.25*int(age[:y])) + if age.endswith("Y"): + return days age = age[y+1:] m = age.find("M") if m != -1: days = days + int(30.436875*int(age[:m])) + # if the string ends with M, e.g., P3Y2M, then do not look for days + if age.endswith("M"): + return days + # if not, advance the string pointer so we can parse the days age = age[m+1:] d = age.find("D") if d != -1: diff --git a/src/pyphetools/visualization/__init__.py b/src/pyphetools/visualization/__init__.py index 62f6c7e..4dda561 100644 --- a/src/pyphetools/visualization/__init__.py +++ b/src/pyphetools/visualization/__init__.py @@ -1,4 +1,5 @@ from .detailed_suppl_table import DetailedSupplTable +from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter, HpoCohortCount from .focus_count_table import FocusCountTable from .hpoa_table_creator import HpoaTableCreator, HpoaTableBuilder from .individual_table import IndividualTable diff --git a/src/pyphetools/visualization/counted_hpo_term.py b/src/pyphetools/visualization/counted_hpo_term.py index ab0aec8..44c877a 100644 --- a/src/pyphetools/visualization/counted_hpo_term.py +++ b/src/pyphetools/visualization/counted_hpo_term.py @@ -3,25 +3,26 @@ class CountedHpoTerm: """ - This class intends to keep track of the frequzency of an HPO term in a cohort + This class intends to keep track of the frequency of an HPO term in a cohort + The denominator refers to the total number of observed counts and the numerator is equal to observed+exluded, i.., the total count for which some information was available. """ def __init__(self, hpo_term, numerator, denominator): if not isinstance(numerator, int): raise ValueError(f"Malformed numerator (must be integer but was {numerator})") if not isinstance(denominator, int): raise ValueError(f"Malformed denominator (must be integer but was {denominator})") - self._onset_term_id = hpo_term.id - self._onset_term_label = hpo_term.label + self._hpo_term_id = hpo_term.id + self._hpo_term_label = hpo_term.label self._num = numerator self._denom = denominator @property def id(self): - return self._onset_term_id + return self._hpo_term_id @property def label(self): - return self._onset_term_label + return self._hpo_term_label def has_frequency(self): return self._num is not None and self._denom is not None @@ -34,9 +35,6 @@ def numerator(self): def denominator(self): return self._denom - def increment_numerator(self): - self._num += 1 - class CohortTermCounter: diff --git a/src/pyphetools/visualization/detailed_suppl_table.py b/src/pyphetools/visualization/detailed_suppl_table.py index a35abda..2b3d61e 100644 --- a/src/pyphetools/visualization/detailed_suppl_table.py +++ b/src/pyphetools/visualization/detailed_suppl_table.py @@ -6,10 +6,44 @@ from .hpo_category import HpoCategorySet from hpotk.model import TermId from collections import defaultdict +import hpotk +from enum import Enum + ALL_ROOT = TermId.from_curie("HP:0000001") PHENOTYPIC_ABNORMALITY_ROOT = TermId.from_curie("HP:0000118") +class HpoStatus(Enum): + OBSERVED = 1, "observed" + EXCLUDED = 2, "excluded" + NOT_AVAILABLE = 3, "na" + + +class HpoTableCell: + """ + Represents one cell of the detailed table + """ + def __init__(self, + hpo_term_id:hpotk.TermId, + status: HpoStatus + ) -> None: + self._hpo_id = hpo_term_id + self._status = status + + def to_cell(self): + return self._status.value + + def to_cell_pm(self): + """ + Alternative plus-minus (pm) notation + """ + if self._status == HpoStatus.OBSERVED: + return "+" + elif self._status == HpoStatus.EXCLUDED: + return "-" + else: + return "na" + class DetailedSupplTable: @@ -19,7 +53,9 @@ class DetailedSupplTable: are shown as columns. """ - def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None) -> None: + def __init__(self, + patient_list: typing.List[PPKt.Phenopacket], + hp_json:str=None) -> None: """ :param patient_d: dictionary of patients to display :type patient_d: map with key string and value SimplePatient @@ -47,6 +83,8 @@ def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None # key is a string such as HP:0001234, value is an HpTerm object # we need to convert it to an object from hpo-toolkit because get_ancestors returns HpTerm objects hp_termid = TermId.from_curie(hp_id) + if not self._hp_ontology.graph.is_descendant_of(hp_termid, PHENOTYPIC_ABNORMALITY_ROOT): + continue # do not count terms that are not phenotypes ancs = self._hp_ontology.graph.get_ancestors(hp_termid) anc_set.add(hp_termid) anc_set.update(ancs) @@ -59,10 +97,14 @@ def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None variants = pat.get_variant_list() for var in variants: var_d[var] += 1 - - # TODO figure out what to do with biallelic self._hpo_category_set = HpoCategorySet(ontology=hp_ontology) + def _calculate_table(self, patient_list: typing.List[PPKt.Phenopacket], ) -> typing.List[typing.List[str]]: + for pat in self._simple_patient_list: + hpo_terms = pat.get_observed_hpo_d() + anc_set = set() # graph with ancestors induced by all terms of the patient + + def _get_table(self, counts_d): """ diff --git a/src/pyphetools/visualization/disease_specific_hpo_counter.py b/src/pyphetools/visualization/disease_specific_hpo_counter.py new file mode 100644 index 0000000..18ffe46 --- /dev/null +++ b/src/pyphetools/visualization/disease_specific_hpo_counter.py @@ -0,0 +1,209 @@ +import typing +from collections import defaultdict +import pandas as pd +import hpotk +import phenopackets as PPKt +from ..creation.hpo_parser import HpoParser +from ..pp.v202 import Phenopacket as Phenopacket202 +from ..pp.v202 import OntologyClass as OntologyClass202 + +TARGET_DISEASE_ID = "MONDO:0000001" + +class HpoCohortCount: + """ + Keep track of the observed and excluded counts for some HPO term within a cohort defined by a Disease + """ + + def __init__(self, hpo: OntologyClass202) -> None: + self._hpo = hpo + self._d_to_observed = defaultdict(int) + self._d_to_excluded = defaultdict(int) + self._disease_set = set() + + + @property + def hpo_display(self) -> str: + return f"{self._hpo.label} ({self._hpo.id})" + + @property + def hpo(self) -> OntologyClass202: + return self._hpo + + + def get_observed(self, disease: OntologyClass202) -> int: + return self._d_to_observed[disease] + + + def get_excluded(self, disease: OntologyClass202) -> int: + return self._d_to_excluded[disease] + + def get_total(self) -> int: + x = [x for x in self._d_to_observed.values()] + return sum(x) + + def increment_excluded(self, disease: OntologyClass202) -> None: + self._disease_set.add(disease) + self._d_to_excluded[disease] += 1 + + def increment_observed(self, disease: OntologyClass202) -> None: + self._disease_set.add(disease) + self._d_to_observed[disease] += 1 + + def frequency_for_disease(self, disease: OntologyClass202) -> str: + exc = self._d_to_excluded[disease] + obs = self._d_to_observed[disease] + total = exc + obs + if total == 0: + return "n/a" # no information available for this + else: + percentage = 100 * obs / total + return f"{obs}/{total} ({int(percentage)}%)" + + + def frequency_for_target_disease(self, disease: OntologyClass202) -> str: + exc = self._d_to_excluded[disease] + obs = self._d_to_observed[disease] + total = exc + obs + if total == 0: + return "n/a" # no information available for this + elif obs == 1: + return f"observed" + else: + return "excluded" + + def __str__(self): + items = list() + for d in self._disease_set: + items.append(f"{d.id}-{self.frequency_for_disease(d)}") + return ";".join(items) + + def get_maximum_frequency(self): + """ + We sort with a heuristic that rewards at least one disease with a high frequency AND an overall high number of observed counts. + """ + frequencies = list() ## here we do not care about the actual order + for k, v in self._d_to_observed.items(): + if v == 0: + frequencies.append(0) + else: + exc = self._d_to_excluded.get(k, 0) + obs = self._d_to_observed.get(k, 0) + total = exc + obs + frequencies.append(obs/total) + if len(frequencies) == 0: + return 0 + return max(frequencies) + + def get_weighted_maximum_frequency(self): + """ + Heuristic for sorting - maximum frequency times toit + """ + total = self.get_total() ## total number of observations + max_f = self.get_maximum_frequency() + return total * max_f + + + +class DiseaseSpecificHpoCounter: + + def __init__(self, + ppkt_list: typing.List[PPKt.Phenopacket], + target_ppkt: PPKt.Phenopacket = None, + hpo: hpotk.MinimalOntology = None) -> None: + """ + :param ppkt_list: List of Phenopackets we wish to display as a table of HPO term counts + :type ppkt_list: typing.list[PPKt.Phenopacket] + :target_ppkt: Phenopacket of the individual that we wish to compare with the cohort that is represented in ppkt_list. Optional + :type target_ppkt: typing.Optional[PPKt.Phenopacket] + :param hpo: Reference to HPO ontology object (if nulll, will be created in constructor) + :type hpo: hpotk.MinimalOntology + """ + v202_ppkt = [Phenopacket202.from_message(ppkt) for ppkt in ppkt_list] + if hpo is None: + parser = HpoParser() + self._hpo = parser.get_ontology() + else: + self._hpo = hpo + disease_dict = defaultdict(list) + for ppkt in v202_ppkt: + if len(ppkt.diseases) != 1: + raise ValueError(f"This class does not support visualization of phenopackets with more than one disease diagnosis") + disease_term = ppkt.diseases[0] + disease_dict[disease_term.term].append(ppkt) + if target_ppkt is not None: + ## We want to show the target as a separate column. + ## use this term as a marker, it will not be displayed + oclzz = OntologyClass202(id=TARGET_DISEASE_ID, label=target_ppkt.id) + disease_dict[oclzz].append(target_ppkt) + hpo_to_counter = dict() + self._hpo_term_ids_for_display = set() + warn_terms = set() ## to avoid making the same error message multiple times + # The following for loop extracts data for each disease one at a time. + for disease_term, ppkt_list in disease_dict.items(): + for ppkt in ppkt_list: + # We count not only explicitly annotated terms but also the ancestor of observed terms + # and descendents of excluded terms. + # Note that we keep track of explicitly annotated terms and these are the only ones we show in the output table + observed_with_ancestors = set() + excluded_with_descendants = set() + for pf in ppkt.phenotypic_features: + oclzz = pf.type + hpo_term = self._hpo.get_term(oclzz.id) + if hpo_term.identifier.value != oclzz.id : + if hpo_term.identifier.value not in warn_terms: + warn_terms.add(hpo_term.identifier.value) + print("############# WARNING #############") + print(f"Use of outdated id {oclzz.id} ({oclzz.label}). Replacing with {hpo_term.identifier.value}.") + print("###################################") + oclzz = OntologyClass202(id=hpo_term.identifier.value, label=hpo_term.name) + hpo_id = oclzz.id + self._hpo_term_ids_for_display.add(hpo_id) + if pf.excluded: + desc_set = self._hpo.graph.get_descendants(hpo_id, include_source=True) + excluded_with_descendants.update(desc_set) + else: + ancs_set = self._hpo.graph.get_ancestors(hpo_id, include_source=True) + observed_with_ancestors.update(ancs_set) + for hpo_id in observed_with_ancestors: + hpo_label = self._hpo.get_term_name(hpo_id) + oclzz = OntologyClass202(id=hpo_id, label=hpo_label) + if oclzz not in hpo_to_counter: + hpo_to_counter[oclzz] = HpoCohortCount(hpo=oclzz) + hpo_to_counter.get(oclzz).increment_observed(disease_term) + for hpo_id in excluded_with_descendants: + hpo_label = self._hpo.get_term_name(hpo_id) + oclzz = OntologyClass202(id=hpo_id, label=hpo_label) + if oclzz not in hpo_to_counter: + hpo_to_counter[oclzz] = HpoCohortCount(hpo=oclzz) + hpo_to_counter.get(oclzz).increment_excluded(disease_term) + ## arrange the diseases according to number of annotated phenopackets + disease_tuple_list = [(k,v) for k, v in sorted(disease_dict.items(), key=lambda item: len(item[1]), reverse=True)] + ## sort the HPO terms according to the overall frequency. + self._disease_list = [x[0] for x in disease_tuple_list] + hpo_to_counter_list = list(hpo_to_counter.values()) + self._hpo_to_counter_list = sorted(hpo_to_counter_list, key=lambda x: x.get_maximum_frequency(), reverse=True) + + + def to_data_frame(self) -> pd.DataFrame: + disease_labels = [d.label for d in self._disease_list] + items = list() + for hpo2c in self._hpo_to_counter_list: + hpo_term = hpo2c.hpo + if hpo_term.id.value not in self._hpo_term_ids_for_display: + continue ## these will be the inferrence ancestors classes that are not used for explicitly for annotation, we want to skip them + d = dict() + d["HPO"] = hpo2c.hpo_display + for disease in self._disease_list: + if disease.id == TARGET_DISEASE_ID: + d[disease.label] = hpo2c.frequency_for_target_disease(disease) + else: + d[disease.label] = hpo2c.frequency_for_disease(disease) + items.append(d) + df = pd.DataFrame(items) + df_reset = df.reset_index(drop=True) # the index is irrelevant + return df_reset + + + + + diff --git a/src/pyphetools/visualization/phenopacket_ingestor.py b/src/pyphetools/visualization/phenopacket_ingestor.py index 156d417..ba0c8d8 100644 --- a/src/pyphetools/visualization/phenopacket_ingestor.py +++ b/src/pyphetools/visualization/phenopacket_ingestor.py @@ -68,4 +68,29 @@ def get_phenopacket_dictionary(self) -> typing.Dict: def get_phenopacket_list(self) -> typing.List: ppktd = self.get_phenopacket_dictionary() return list(ppktd.values()) + + + def _ingest(self, indir="phenopackets", recursive:bool=False, disease_id:str=None): + for file in os.listdir(indir): + fname = os.path.join(indir, file) + if fname.endswith(".json") and os.path.isfile(fname): + with open(fname) as f: + data = f.read() + jsondata = json.loads(data) + ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket()) + if disease_id is not None: + if not PhenopacketIngestor.has_disease_id(ppkt=ppack, disease_id=disease_id): + continue + self._phenopackets.append(ppack) + + + def ingest_from_directory(self, indir:str): + return self._ingest(indir=indir) + + def ingest_from_file(self, json_file:str) -> PPKt.Phenopacket: + with open(json_file) as f: + data = f.read() + jsondata = json.loads(data) + ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket()) + return ppack diff --git a/src/pyphetools/visualization/phenopacket_table.py b/src/pyphetools/visualization/phenopacket_table.py index d1bfce7..01f5166 100644 --- a/src/pyphetools/visualization/phenopacket_table.py +++ b/src/pyphetools/visualization/phenopacket_table.py @@ -7,6 +7,7 @@ from ..creation import Individual, HpTerm, MetaData from .simple_patient import SimplePatient from .html_table_generator import HtmlTableGenerator +from ..pp.v202._base import TimeElement as TimeElement202 # @@ -24,6 +25,15 @@ def __init__(self, age, days) -> None: self.key = age self.days = days + def __rep__(self): + """ + self.key can be either a TimeElement or a simple string. + """ + if isinstance(self.key, TimeElement202): + return self.key.display_time_element() + else: + return self.key + #@DeprecationWarning("This class will be replaced by IndividualTable and will be deleted in a future version") class PhenopacketTable: @@ -208,10 +218,14 @@ def iso_to_days(iso_age:str) -> int: y = age.find("Y") if y != -1: days = days + int(365.25*int(age[:y])) + if age.endswith("Y"): + return days age = age[y+1:] m = age.find("M") - if m != -1: + if m is not None and m != -1: days = days + int(30.436875*int(age[:m])) + if age.endswith("M"): + return days age = age[m+1:] d = age.find("D") if d != -1: diff --git a/test/test_disease.py b/test/test_disease.py index fcf5044..1f202df 100644 --- a/test/test_disease.py +++ b/test/test_disease.py @@ -20,3 +20,11 @@ def test_raise_exception_label(self): with pytest.raises(Exception) as e_info: eri1 = Disease(disease_id='OMIM:608739', disease_label=' ERI1-related disease') assert str(e_info.value) == 'Malformed disease label (starts/ends with whitespace): " ERI1-related disease"' + + def test_exception_because_of_stray_tab(self): + disease_id = "OMIM:616457" + disease_label = "Developmental and epileptic encephalopathy 50\t616457\tAR\t3\t" + with pytest.raises(ValueError) as val_error: + disease = Disease(disease_id=disease_id, disease_label=disease_label) + assert str(val_error.value) == 'Malformed disease label (contains tabs): "Developmental and epileptic encephalopathy 50 616457 AR 3 "' + diff --git a/test/test_disease_specific_hpo_counter.py b/test/test_disease_specific_hpo_counter.py new file mode 100644 index 0000000..4188a36 --- /dev/null +++ b/test/test_disease_specific_hpo_counter.py @@ -0,0 +1,38 @@ +import pytest + +from pyphetools.visualization import DiseaseSpecificHpoCounter, HpoCohortCount +from pyphetools.pp.v202 import OntologyClass as OntologyClass202 + +class TestDiseaseSpecificHpoCounter: + + def test_HpoCohortCount(self): + oclzz = OntologyClass202(id="HP:0001288", label="Gait disturbance") + eri1 = OntologyClass202(id='OMIM:608739', label='ERI1-related disease') + eri2 = OntologyClass202(id='OMIM:608742', label='ERI2-related disease') # fake + eri3 = OntologyClass202(id='OMIM:608744', label='ERI3-related disease') # fake + hpo_counter = HpoCohortCount(hpo=oclzz) + assert hpo_counter is not None + hpo_counter.increment_observed(eri1) + assert hpo_counter.get_observed(eri1) == 1 + hpo_counter.increment_observed(eri1) + hpo_counter.increment_observed(eri1) + assert hpo_counter.get_observed(eri1) == 3 + hpo_counter.increment_excluded(eri1) + assert hpo_counter.get_observed(eri1) == 3 + assert hpo_counter.get_excluded(eri1) == 1 + hpo_counter.increment_observed(eri2) + hpo_counter.increment_observed(eri2) + hpo_counter.increment_excluded(eri2) + hpo_counter.increment_excluded(eri2) + hpo_counter.increment_excluded(eri2) + assert hpo_counter.get_observed(eri2) == 2 + assert hpo_counter.get_excluded(eri2) == 3 + assert hpo_counter.get_observed(eri1) == 3 + assert hpo_counter.get_excluded(eri1) == 1 + assert hpo_counter.get_observed(eri3) == 0 ## we did not add frequencies for this disease + assert hpo_counter.frequency_for_disease(eri1) == "3/4 (75%)" + assert hpo_counter.frequency_for_disease(eri2) == "2/5 (40%)" + assert hpo_counter.frequency_for_disease(eri3) == "n/a" # no information available for eri3 + assert hpo_counter.get_maximum_frequency() == 0.75 + +