monarch-initiative · pnrobinson · Sep 1, 2024 · Aug 13, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/docs/api/creation/discombobulator.md b/docs/api/creation/discombobulator.md
@@ -0,0 +1,4 @@
+# Discombobulator
+
+
+::: pyphetools.creation.Discombobulator
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -70,6 +70,7 @@ nav:
       - CohortEncoder: "api/creation/cohort_encoder.md"
       - ColumnMapper: "api/creation/column_mapper.md"
       - ConstantColumnMapper: "api/creation/constant_column_mapper.md"
+      - Discombobulator: "api/creation/discombobulator.md"
       - Disease: "api/creation/disease.md"
       - DiseaseIdColumnMapper: "api/creation/disease_id_column_mapper.md"
       - HgvsVariant: "api/creation/hgvs_variant.md"

diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py
@@ -5,7 +5,7 @@
 from . import validation
 
 
-__version__ = "0.9.98"
+__version__ = "0.9.105"
 
 
 __all__ = [

diff --git a/src/pyphetools/creation/__init__.py b/src/pyphetools/creation/__init__.py
@@ -20,9 +20,12 @@
 from .hp_term import HpTerm, HpTermBuilder
 from .import_template import TemplateImporter
 from .individual import Individual
+from .measurements import Measurements
 from .metadata import MetaData
 from .mode_of_inheritance import Moi
+from .ontology_terms import OntologyTerms
 from .option_column_mapper import OptionColumnMapper
+from .promoter_variant import PromoterVariant
 from .pyphetools_age import PyPheToolsAge, AgeSorter, HPO_ONSET_TERMS
 from .sex_column_mapper import SexColumnMapper
 from .simple_column_mapper import SimpleColumnMapper

diff --git a/src/pyphetools/creation/age_column_mapper.py b/src/pyphetools/creation/age_column_mapper.py
@@ -224,7 +224,7 @@ def map_cell(self, cell_contents) -> typing.Optional[TimeElement202]:
         """
         Extract an iso8601 string for age recorded as a year (either an int such as 4 or a float such as 4.25 for P4Y3M)
         :param age: an int representing years or a float such as 2.5 for two and a half years
-        :return: an ISO 8601 string such as P2Y6M
+        :returns: an ISO 8601 string such as P2Y6M
         """
         if isinstance(cell_contents, int):
             age_str = f"P{cell_contents}Y"

diff --git a/src/pyphetools/creation/age_isoformater.py b/src/pyphetools/creation/age_isoformater.py
@@ -48,7 +48,7 @@ def __init__(self, y=None, m=None, w=None, d=None):
         self._months = months
         self._days = days
 
-    def to_iso8601(self):
+    def to_iso8601(self) -> str:
         components = ["P"]
         if self._years > 0:
             components.append(f"{self._years}Y")
@@ -62,7 +62,7 @@ def to_iso8601(self):
             return "".join(components)
 
     @staticmethod
-    def to_string(y=None, m=None, w=None, d=None):
+    def to_string(y=None, m=None, w=None, d=None) -> str:
         """
         :param y: years
         :type y: Union(int,str), optional
@@ -80,7 +80,7 @@ def to_string(y=None, m=None, w=None, d=None):
 
 
     @staticmethod
-    def from_numerical_month(month):
+    def from_numerical_month(month) -> str:
         """
         decode entries such as 18 or 0.7 (number of months)
         """

diff --git a/src/pyphetools/creation/citation.py b/src/pyphetools/creation/citation.py
@@ -1,28 +1,39 @@
-
-
+from ..pp.v202 import ExternalReference as ExternalReference202
 
 class Citation:
     """encapsulate information about a citation that we add to the metadata for display
-
-    :param pmid: PubMed identifier for the publication in which this individual was described (e.g. PMID:321..).
-    :type pmid: str
-    :param title: Title of the publication in which this individual was described.
-    :type title: str
     """
 
     def __init__(self, pmid:str, title:str) -> None:
+        """
+        :param pmid: PubMed identifier for the publication in which this individual was described (e.g. PMID:321..).
+        :type pmid: str
+        :param title: Title of the publication in which this individual was described.
+        :type title: str
+        """
         if pmid is None or isinstance(pmid, float) or not pmid.startswith("PMID"):
             raise ValueError(f"Could not find PubMed identifier")
         if title is None or isinstance(title, float) or len(title) < 5:
             raise ValueError(f"Could not find valid title")
         self._pmid = pmid
         self._title = title
 
-
     @property
-    def pmid(self):
+    def pmid(self) -> str:
         return self._pmid
 
     @property
-    def title(self):
-        return self._title
+    def title(self) -> str:
+        return self._title
+
+    def to_external_reference(self) -> ExternalReference202:
+        """
+        :returns: an ExternalReference object representing this PubMed citation
+        :rtype: ExternalReference202
+        """
+        pm_number = self._pmid.replace("PMID:", "")
+        pm_url = f"https://pubmed.ncbi.nlm.nih.gov/{pm_number}" 
+        return ExternalReference202(id=self._pmid,
+                                    reference=pm_url,
+                                    description=self._title)
+
diff --git a/src/pyphetools/creation/create_template.py b/src/pyphetools/creation/create_template.py
@@ -78,7 +78,7 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
         :param disease_label: the corresponding name
         :param HGNC_id: HUGO Gene Nomenclattre Committee identifier, e.g., HGNC:3603
         :param gene_symbol: corresponding gene symbol, e.g., FBN1
-        :transcript: transcript to be used for the HVGC nomenclature. Must be refseq with version number
+        :param transcript: transcript to be used for the HGVS nomenclature. Must be a RefSeq identifier with version number
         """
         H1_Headers = REQUIRED_H1_FIELDS
         H2_Headers = REQUIRED_H2_FIELDS

diff --git a/src/pyphetools/creation/discombulator.py b/src/pyphetools/creation/discombulator.py
@@ -1,41 +1,87 @@
-import os
 import pandas as pd
+import typing
 from collections import defaultdict
 from .hpo_cr import HpoConceptRecognizer
+from .hpo_parser import HpoParser
 
 class AnnotationRow:
+    """
+    This class represents one row of the output file.
+    """
     def __init__(self, idx) -> None:
         self._index = idx
         self._annot_list = list()
         self._annot_list.append(str(idx))
 
-    def add_observed(self):
+    def add_observed(self) -> None:
         self._annot_list.append("observed")
 
-    def add_excluded(self):
+    def add_excluded(self) -> None:
         self._annot_list.append("excluded")
 
-    def add_na(self):
+    def add_na(self) -> None:
         self._annot_list.append("na")
 
-    def get_annot_lst(self):
+    def get_annot_lst(self) -> typing.List[str]:
         return self._annot_list
 
 class Discombobulator:
+    """
+    Discombobulate a column of the original data, using text mining to find HPO terms and make one column for each identified HPO term in the output.
+    In the following example, "Book2.xlsx" is an Excel file derived from an original publication. It has a column called "Cardiac defect", some of 
+    whose cells contain items such as Ventricular septal defect, Atrial septal defect, Patent foramen ovale. Some of the cells contain codes (here, "na",
+    and "UN") that indicate that no information is available (so we want to output "na"). The assumeExcluded argument means that if an observation
+    was made (e.g., echocardiography), then we assume all items are excluded except those that are named in the cell. The decode method returns
+    a pandas DataFrame that has columns that can be inspected and then added to the pyphetools Excel template once any necessary revisions have been made.
+    The DataFrame will have one column for the patient identifier and one column for each of the identified HPO terms. Finally, the last column will be
+    the original column that we can use to vet results.
 
-    def __init__(self, hpo_cr:HpoConceptRecognizer) -> None:
-        self._hpo_cr = hpo_cr
+        import pandas as pd
+        df = pd.read_excel("../../Book2.xlsx")
+        from pyphetools.creation import Discombobulator
+        dc = Discombobulator(df=df, individual_id="individual column name")
+        cardiac = dc.decode(column="Cardiac defect", trueNa={"na", "UN"}, assumeExcluded=True)
+        cardiac.to_excel("cardiac.xlsx")
 
-    def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=False):
-        if not column in df.columns:
-            raise ValueError(f"could not find column {column} in dataframe")
-        if not column in df.columns:
+    """
+    def __init__(self, 
+                df:pd.DataFrame,
+                individual_id:str,
+                hpo_cr:HpoConceptRecognizer = None) -> None:
+        if hpo_cr is not None:
+            self._hpo_cr = hpo_cr
+        else:
+            parser = HpoParser()
+            self._hpo_cr = parser.get_hpo_concept_recognizer()
+        self._individual_id = individual_id
+        self._df = df
+
+    def decode(self,  
+               column:str, 
+               delim:str=",", 
+               assumeExcluded=False, 
+               trueNa:typing.Union[str,typing.Set[str]]="na") -> pd.DataFrame:
+        """
+        Discombobulate a column of the original data, using text mining to find HPO terms and make one column for each identified HPO term in the output.
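+        :param column: The name of the column to discombobulate
+        :param delim: delimiter between items
+        :param assumeExcluded: Assume that if an item is not mentioned in a cell, then it was excluded. This can be justified if the column is about Echocardiography findings, for instance.
+        :param trueNa: a string or set of strings that mark a cell as having no information available (e.g., "na", "UN"); matching rows are output as "na"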
+        """
+        if not column in self._df.columns:
             raise ValueError(f"could not find column {column} in dataframe")
-        index_to_hpo_set = defaultdict(set)
+        index_to_hpo_d = defaultdict(set)
         label_to_id = dict()
         all_hpo_terms = set()
+        if isinstance(trueNa, str):
+            self._true_na_set = set()
+            self._true_na_set.add(trueNa)
+        elif isinstance(trueNa, set):
+            self._true_na_set = trueNa
+        else:
+            raise ValueError(f"trueNa argument must be string or set, but was {type(trueNa)}")
         ## First get list of all HPO terms used
-        for idx, row in df.iterrows():
+        for idx, row in self._df.iterrows():
             idx = str(idx)
             contents = row[column]
             contents = str(contents) ## coerce to string in case empty
@@ -44,7 +90,7 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
                 hpo_id = hterm.id
                 label = hterm.label
                 label_to_id[label] = hpo_id
-                index_to_hpo_set[idx].add(label)
+                index_to_hpo_d[idx].add(label)
                 all_hpo_terms.add(label)
         label_list = list()
         id_list = list()
@@ -58,14 +104,14 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
             id_list.append(hpo_id)
         row_list = list()
         row_list.append(id_list)
-        for hpo_list in index_to_hpo_set.values():
+        for hpo_list in index_to_hpo_d.values():
             for hpo in hpo_term_list:
                 all_hpo_terms.add(hpo)
         hpo_annot_row = list()
-        for idx, row in df.iterrows():
+        for idx, row in self._df.iterrows():
             idx = str(idx)
-            if idx in index_to_hpo_set:
-                observed_hpo_set = index_to_hpo_set.get(idx)
+            if idx in index_to_hpo_d:
+                observed_hpo_set = index_to_hpo_d.get(idx)
             else:
                 observed_hpo_set = set() ## now terms parsed for this index
 
@@ -80,6 +126,26 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
             row_list.append(arow.get_annot_lst())
         # Create DataFrame
         df_out = pd.DataFrame(row_list, columns=label_list)
+        original_column = self._df[column]
+        a = pd.Series(["Original"])
+        new_column = pd.concat([a, original_column], axis=0, ignore_index=True)
+        new_column_header = f"Original:{column}"
+        df_out[new_column_header] = new_column
+        df_out[new_column_header] = new_column
+
+
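+        # For every row whose original cell equals one of the trueNa symbols, overwrite the other columns with "na"
+        # (the column holding the original cell contents is excluded from the replacement)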
+        for na_symbol in self._true_na_set:
+            exclude_columns = ['individual_id', new_column_header]
+            columns_to_change = df_out.columns.difference(exclude_columns)
+            df_out.loc[df_out[new_column_header] == na_symbol, columns_to_change] = "na"
+        # Now add back the original individual labels
+        individual_column = self._df[self._individual_id]
+        a = pd.Series(["Individual"])
+        individual_column = pd.concat([a, individual_column], axis=0, ignore_index=True)
+        df_out["original individual id"] = individual_column
+
         return df_out
 
     def write(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=False):

diff --git a/src/pyphetools/creation/hgvs_variant.py b/src/pyphetools/creation/hgvs_variant.py
@@ -1,5 +1,11 @@
 import phenopackets
 from .variant import Variant
+from ..pp.v202 import GeneDescriptor as GeneDescriptor202
+from ..pp.v202 import VariantInterpretation as VariantInterpretation202
+from ..pp.v202 import VariationDescriptor as VariationDescriptor202
+from ..pp.v202 import Expression as Expression202
+from ..pp.v202 import MoleculeContext as MoleculeContext202
+from ..pp.v202 import VcfRecord as VcfRecord202
 import string
 from typing import Dict
 import random
@@ -141,3 +147,38 @@ def to_ga4gh(self, acmg=None):
         vdescriptor.vcf_record.CopyFrom(vcf_record)
         vinterpretation.variation_descriptor.CopyFrom(vdescriptor)
         return vinterpretation
+
+    def to_variant_interpretation_202(self, 
+                                      acmg:str=None) -> VariantInterpretation202:
+        """
+        Transform this Variant object into a "variantInterpretation" message of the GA4GH Phenopacket schema
+        """
+
+        vcf_record = VcfRecord202(genome_assembly=self._assembly,
+                                  chrom=self._chr,
+                                  pos=self._position,
+                                  ref=self._ref,
+                                  alt=self._alt)
+        vdescriptor = VariationDescriptor202(id=self._variant_id, vcf_record=vcf_record, molecule_context=MoleculeContext202.genomic)
+        if self._hgnc_id is not None and self._symbol is not None:
+            gene_descriptor = GeneDescriptor202(value_id=self._hgnc_id, symbol=self._symbol)
+            vdescriptor.gene_context = gene_descriptor
+        if self._hgvs is not None:
+            hgvs_expression = Expression202(syntax="hgvs.c", value=self._hgvs)
+            vdescriptor.expressions.append(hgvs_expression)
+        if self._g_hgvs is not None:
+            hgvs_expression = Expression202(syntax="hgvs.g", value=self._g_hgvs)
+            vdescriptor.expressions.append(hgvs_expression)
+        gt_term = Variant._get_genotype_term(self._genotype)
+        # it can occur that the genotype is not set when we call this function (it will be set by calling code)
+        # therefore it is not necessarily an error if the genotype is None, calling code needs to check this appropriately
+        if gt_term is not None:
+            vdescriptor.allelic_state = gt_term
+        vinterpretation = VariantInterpretation202(variation_descriptor=vdescriptor)
+        acmg_code = Variant._get_acmg_classification(acmg=acmg)
+        if acmg_code is not None:
+            vinterpretation.acmg_pathogenicity_classification = acmg_code
+        else:
+            print(f"Warning- did not recognize ACMG category {acmg}")
+
+        return vinterpretation
diff --git a/src/pyphetools/creation/individual.py b/src/pyphetools/creation/individual.py
@@ -163,24 +163,24 @@ def set_disease(self, disease: Disease) -> None:
         """
         self._disease = disease
 
-    def disease_count(self):
+    def disease_count(self) -> int:
         if self._disease is None:
             return 0
         else:
             return 1
 
-    def set_hpo_terms(self, cleansed_hpo_terms: List[HpTerm]):
+    def set_hpo_terms(self, cleansed_hpo_terms: List[HpTerm]) -> None:
         """
         :param cleansed_hpo_terms: a list of HpTerm objects that has been cleansed by OntologyQC
         :type cleansed_hpo_terms: List[pyphetools.creation.HpTerm]
         """
         self._hpo_terms = cleansed_hpo_terms
 
     @property
-    def pmid(self):
+    def pmid(self) -> str:
         return self._citation.pmid
 
-    def set_citation(self, citation: Citation):
+    def set_citation(self, citation: Citation) -> None:
         """
         :param citation: Object with the title and PubMed identifier for the publication in which this individual was described (e.g. PMID:321..)
         :type citation: Citation
@@ -330,12 +330,8 @@ def to_ga4gh_phenopacket(self, metadata, phenopacket_id=None) -> PPKt.Phenopacke
                 # `protobuf` devs must have removed `clear()` method
                 # This is a workaround to clear the list of external references.
                 _ = metadata.external_references.pop()
-            extref = PPKt.ExternalReference()
-            extref.id = self._citation.pmid
-            pm = self._citation.pmid.replace("PMID:", "")
-            extref.reference = f"https://pubmed.ncbi.nlm.nih.gov/{pm}"
-            extref.description = self._citation.title
-            metadata.external_references.append(extref)
+            extref202 = self._citation.to_external_reference()
+            metadata.external_references.append(extref202.to_message())
         php.meta_data.CopyFrom(metadata)
         return php