Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #131

Merged
merged 10 commits into from
Sep 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/api/creation/discombobulator.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Discombobulator


::: pyphetools.creation.Discombobulator
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ nav:
- CohortEncoder: "api/creation/cohort_encoder.md"
- ColumnMapper: "api/creation/column_mapper.md"
- ConstantColumnMapper: "api/creation/constant_column_mapper.md"
- Discombobulator: "api/creation/discombobulator.md"
- Disease: "api/creation/disease.md"
- DiseaseIdColumnMapper: "api/creation/disease_id_column_mapper.md"
- HgvsVariant: "api/creation/hgvs_variant.md"
Expand Down
2 changes: 1 addition & 1 deletion src/pyphetools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from . import validation


__version__ = "0.9.98"
__version__ = "0.9.105"


__all__ = [
Expand Down
3 changes: 3 additions & 0 deletions src/pyphetools/creation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@
from .hp_term import HpTerm, HpTermBuilder
from .import_template import TemplateImporter
from .individual import Individual
from .measurements import Measurements
from .metadata import MetaData
from .mode_of_inheritance import Moi
from .ontology_terms import OntologyTerms
from .option_column_mapper import OptionColumnMapper
from .promoter_variant import PromoterVariant
from .pyphetools_age import PyPheToolsAge, AgeSorter, HPO_ONSET_TERMS
from .sex_column_mapper import SexColumnMapper
from .simple_column_mapper import SimpleColumnMapper
Expand Down
2 changes: 1 addition & 1 deletion src/pyphetools/creation/age_column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def map_cell(self, cell_contents) -> typing.Optional[TimeElement202]:
"""
Extract an iso8601 string for age recorded as a year (either an int such as 4 or a float such as 4.25 for P4Y3M)
:param cell_contents: an int representing years or a float such as 2.5 for two and a half years
:return: an ISO 8601 string such as P2Y6M
:returns: an ISO 8601 string such as P2Y6M
"""
if isinstance(cell_contents, int):
age_str = f"P{cell_contents}Y"
Expand Down
6 changes: 3 additions & 3 deletions src/pyphetools/creation/age_isoformater.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self, y=None, m=None, w=None, d=None):
self._months = months
self._days = days

def to_iso8601(self):
def to_iso8601(self) -> str:
components = ["P"]
if self._years > 0:
components.append(f"{self._years}Y")
Expand All @@ -62,7 +62,7 @@ def to_iso8601(self):
return "".join(components)

@staticmethod
def to_string(y=None, m=None, w=None, d=None):
def to_string(y=None, m=None, w=None, d=None) -> str:
"""
:param y: years
:type y: Union(int,str), optional
Expand All @@ -80,7 +80,7 @@ def to_string(y=None, m=None, w=None, d=None):


@staticmethod
def from_numerical_month(month):
def from_numerical_month(month) -> str:
"""
decode entries such as 18 or 0.7 (number of months)
"""
Expand Down
33 changes: 22 additions & 11 deletions src/pyphetools/creation/citation.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,39 @@


from ..pp.v202 import ExternalReference as ExternalReference202

class Citation:
    """Encapsulate the PubMed identifier and title of a publication so that they can be added to the metadata for display.
    """

    def __init__(self, pmid:str, title:str) -> None:
        """
        :param pmid: PubMed identifier for the publication in which this individual was described (e.g. PMID:321..).
        :type pmid: str
        :param title: Title of the publication in which this individual was described.
        :type title: str
        :raises ValueError: if pmid is not a string starting with "PMID", or if title is not a string of at least five characters
        """
        # Reject every non-string value (None, float, int, ...) up front so that
        # callers always get a ValueError rather than an AttributeError/TypeError
        # from calling str methods on the wrong type.
        if not isinstance(pmid, str) or not pmid.startswith("PMID"):
            raise ValueError("Could not find PubMed identifier")
        if not isinstance(title, str) or len(title) < 5:
            raise ValueError("Could not find valid title")
        self._pmid = pmid
        self._title = title

    @property
    def pmid(self) -> str:
        """
        :returns: the PubMed identifier passed to the constructor
        :rtype: str
        """
        return self._pmid

    @property
    def title(self) -> str:
        """
        :returns: the publication title passed to the constructor
        :rtype: str
        """
        return self._title

    def to_external_reference(self) -> "ExternalReference202":
        """
        :returns: an ExternalReference object representing this PubMed citation
        :rtype: ExternalReference202
        """
        # The URL uses the bare number, i.e. the identifier without its "PMID:" prefix.
        pm_number = self._pmid.replace("PMID:", "")
        pm_url = f"https://pubmed.ncbi.nlm.nih.gov/{pm_number}"
        return ExternalReference202(id=self._pmid,
                                    reference=pm_url,
                                    description=self._title)


2 changes: 1 addition & 1 deletion src/pyphetools/creation/create_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
:param disease_label: the corresponding name
:param HGNC_id: HUGO Gene Nomenclature Committee identifier, e.g., HGNC:3603
:param gene_symbol: corresponding gene symbol, e.g., FBN1
:transcript: transcript to be used for the HVGC nomenclature. Must be refseq with version number
:param transcript: transcript to be used for the HGVS nomenclature. Must be refseq with version number
"""
H1_Headers = REQUIRED_H1_FIELDS
H2_Headers = REQUIRED_H2_FIELDS
Expand Down
102 changes: 84 additions & 18 deletions src/pyphetools/creation/discombulator.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,87 @@
import os
import pandas as pd
import typing
from collections import defaultdict
from .hpo_cr import HpoConceptRecognizer
from .hpo_parser import HpoParser

class AnnotationRow:
    """
    This class represents one row of the output file.

    The first cell is the stringified index; every ``add_*`` call appends one
    further cell ("observed", "excluded" or "na") to the row.
    """
    def __init__(self, idx) -> None:
        """
        :param idx: index of the row; stored as given and also used (as a string) for the first cell
        """
        self._index = idx
        self._annot_list = [str(idx)]

    def add_observed(self) -> None:
        """Append an "observed" cell to the row."""
        self._annot_list.append("observed")

    def add_excluded(self) -> None:
        """Append an "excluded" cell to the row."""
        self._annot_list.append("excluded")

    def add_na(self) -> None:
        """Append an "na" cell to the row."""
        self._annot_list.append("na")

    def get_annot_lst(self) -> typing.List[str]:
        """
        :returns: the cells collected so far (the internal list itself, not a copy)
        """
        return self._annot_list


class Discombobulator:
"""
Discombobulate a column of the original data, using text mining to find HPO terms and make one column for each identified HPO term in the output.
In the following example, "Book2.xlsx" is an Excel file derived from an original publication. It has a column called "Cardiac defect", some of
whose cells contain items such as Ventricular septal defect, Atrial septal defect, Patent foramen ovale. Some of the cells contain codes (here, "na",
and "UN") that indicate that no information is available (so we want to output "na"). The assumeExcluded argument means that if an observation
was made (e.g., echocardiography), then we assume all items are excluded except those that are named in the cell. The decode method returns
a pandas DataFrame that has columns that can be inspected and then added to the pyphetools Excel template once any necessary revisions have been made.
The DataFrame will have one column for the patient identifier and one column for each of the identified HPO terms. Finally, the last column will be
the original column that we can use to vet results.

def __init__(self, hpo_cr:HpoConceptRecognizer) -> None:
self._hpo_cr = hpo_cr
import pandas as pd
df = pd.read_excel("../../Book2.xlsx")
from pyphetools.creation import Discombobulator
dc = Discombobulator(df=df, individual_id="individual column name")
cardiac = dc.decode(column="Cardiac defect", trueNa={"na", "UN"}, assumeExcluded=True)
cardiac.to_excel("cardiac.xlsx")

def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=False):
if not column in df.columns:
raise ValueError(f"could not find column {column} in dataframe")
if not column in df.columns:
"""
def __init__(self,
             df:pd.DataFrame,
             individual_id:str,
             hpo_cr:typing.Optional[HpoConceptRecognizer] = None) -> None:
    """
    :param df: the DataFrame whose columns are to be decoded
    :param individual_id: name of the column of df that holds the individual identifiers
    :param hpo_cr: concept recognizer used for text mining; if None, one is obtained from a newly created HpoParser
    """
    if hpo_cr is None:
        # No recognizer supplied: fall back to the one provided by HpoParser.
        hpo_cr = HpoParser().get_hpo_concept_recognizer()
    self._hpo_cr = hpo_cr
    self._individual_id = individual_id
    self._df = df


def decode(self,
column:str,
delim:str=",",
assumeExcluded=False,
trueNa:typing.Union[str,typing.Set[str]]="na") -> pd.DataFrame:
"""
Discombobulate a column of the original data, using text mining to find HPO terms and make one column for each identified HPO term in the output.
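:param column: The name of the column to discombobulate
:param delim: delimiter between items
:param assumeExcluded: Assume that if an item is not mentioned in a cell, then it was excluded. This can be justified if the column is about Echocardiography findings, for instance.
:param trueNa: a string, or a set of strings, marking cells for which no information is available; rows whose original cell equals one of these symbols are reported as "na"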
"""
if not column in self._df.columns:
raise ValueError(f"could not find column {column} in dataframe")
index_to_hpo_set = defaultdict(set)
index_to_hpo_d = defaultdict(set)
label_to_id = dict()
all_hpo_terms = set()
if isinstance(trueNa, str):
self._true_na_set = set()
self._true_na_set.add(trueNa)
elif isinstance(trueNa, set):
self._true_na_set = trueNa
else:
raise ValueError(f"trueNa argument must be string or set, but was {type(trueNa)}")
## First get list of all HPO terms used
for idx, row in df.iterrows():
for idx, row in self._df.iterrows():
idx = str(idx)
contents = row[column]
contents = str(contents) ## coerce to string in case empty
Expand All @@ -44,7 +90,7 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
hpo_id = hterm.id
label = hterm.label
label_to_id[label] = hpo_id
index_to_hpo_set[idx].add(label)
index_to_hpo_d[idx].add(label)
all_hpo_terms.add(label)
label_list = list()
id_list = list()
Expand All @@ -58,14 +104,14 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
id_list.append(hpo_id)
row_list = list()
row_list.append(id_list)
for hpo_list in index_to_hpo_set.values():
for hpo_list in index_to_hpo_d.values():
for hpo in hpo_term_list:
all_hpo_terms.add(hpo)
hpo_annot_row = list()
for idx, row in df.iterrows():
for idx, row in self._df.iterrows():
idx = str(idx)
if idx in index_to_hpo_set:
observed_hpo_set = index_to_hpo_set.get(idx)
if idx in index_to_hpo_d:
observed_hpo_set = index_to_hpo_d.get(idx)
else:
observed_hpo_set = set() ## no terms parsed for this index

Expand All @@ -80,6 +126,26 @@ def decode(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=Fals
row_list.append(arow.get_annot_lst())
# Create DataFrame
df_out = pd.DataFrame(row_list, columns=label_list)
original_column = self._df[column]
a = pd.Series(["Original"])
new_column = pd.concat([a, original_column], axis=0, ignore_index=True)
new_column_header = f"Original:{column}"
df_out[new_column_header] = new_column
df_out[new_column_header] = new_column


# Now replace with na
# List of columns to exclude
for na_symbol in self._true_na_set:
exclude_columns = ['individual_id', new_column_header]
columns_to_change = df_out.columns.difference(exclude_columns)
df_out.loc[df_out[new_column_header] == na_symbol, columns_to_change] = "na"
# Now add back the original individual labels
individual_column = self._df[self._individual_id]
a = pd.Series(["Individual"])
individual_column = pd.concat([a, individual_column], axis=0, ignore_index=True)
df_out["original individual id"] = individual_column

return df_out

def write(self, df:pd.DataFrame, column:str, delim:str=",", assumeExcluded=False):
Expand Down
41 changes: 41 additions & 0 deletions src/pyphetools/creation/hgvs_variant.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import phenopackets
from .variant import Variant
from ..pp.v202 import GeneDescriptor as GeneDescriptor202
from ..pp.v202 import VariantInterpretation as VariantInterpretation202
from ..pp.v202 import VariationDescriptor as VariationDescriptor202
from ..pp.v202 import Expression as Expression202
from ..pp.v202 import MoleculeContext as MoleculeContext202
from ..pp.v202 import VcfRecord as VcfRecord202
import string
from typing import Dict
import random
Expand Down Expand Up @@ -141,3 +147,38 @@ def to_ga4gh(self, acmg=None):
vdescriptor.vcf_record.CopyFrom(vcf_record)
vinterpretation.variation_descriptor.CopyFrom(vdescriptor)
return vinterpretation

def to_variant_interpretation_202(self,
                                  acmg:str=None) -> VariantInterpretation202:
    """
    Transform this Variant object into a "variantInterpretation" message of the GA4GH Phenopacket schema

    :param acmg: ACMG pathogenicity category, resolved via Variant._get_acmg_classification
    :returns: a VariantInterpretation202 wrapping a genomic VariationDescriptor202 (VCF record plus, when available, gene context, HGVS expressions and allelic state)
    """
    record = VcfRecord202(genome_assembly=self._assembly,
                          chrom=self._chr,
                          pos=self._position,
                          ref=self._ref,
                          alt=self._alt)
    descriptor = VariationDescriptor202(id=self._variant_id,
                                        vcf_record=record,
                                        molecule_context=MoleculeContext202.genomic)
    # The gene context is only attached when both the HGNC id and the symbol are known.
    if self._hgnc_id is not None and self._symbol is not None:
        descriptor.gene_context = GeneDescriptor202(value_id=self._hgnc_id, symbol=self._symbol)
    # Coding (hgvs.c) expression first, then genomic (hgvs.g), each only if present.
    if self._hgvs is not None:
        descriptor.expressions.append(Expression202(syntax="hgvs.c", value=self._hgvs))
    if self._g_hgvs is not None:
        descriptor.expressions.append(Expression202(syntax="hgvs.g", value=self._g_hgvs))
    # The genotype may not have been set yet when this method is called (calling code
    # sets it later), so a missing genotype term is not an error here; callers must
    # check for it as appropriate.
    genotype_term = Variant._get_genotype_term(self._genotype)
    if genotype_term is not None:
        descriptor.allelic_state = genotype_term
    interpretation = VariantInterpretation202(variation_descriptor=descriptor)
    classification = Variant._get_acmg_classification(acmg=acmg)
    if classification is None:
        print(f"Warning- did not recognize ACMG category {acmg}")
    else:
        interpretation.acmg_pathogenicity_classification = classification

    return interpretation
16 changes: 6 additions & 10 deletions src/pyphetools/creation/individual.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,24 +163,24 @@ def set_disease(self, disease: Disease) -> None:
"""
self._disease = disease

def disease_count(self) -> int:
    """
    :returns: 0 if no disease has been set for this individual, otherwise 1
    :rtype: int
    """
    return 0 if self._disease is None else 1

def set_hpo_terms(self, cleansed_hpo_terms: List[HpTerm]) -> None:
    """
    Replace the HPO terms stored for this individual with the given list.

    :param cleansed_hpo_terms: a list of HpTerm objects that has been cleansed by OntologyQC
    :type cleansed_hpo_terms: List[pyphetools.creation.HpTerm]
    """
    self._hpo_terms = cleansed_hpo_terms

@property
def pmid(self) -> str:
    """
    :returns: the PubMed identifier of the citation stored for this individual
    :rtype: str
    """
    return self._citation.pmid

def set_citation(self, citation: Citation):
def set_citation(self, citation: Citation) -> None:
"""
:param citation: Object with the title and PubMed identifier for the publication in which this individual was described (e.g. PMID:321..)
:type citation: Citation
Expand Down Expand Up @@ -330,12 +330,8 @@ def to_ga4gh_phenopacket(self, metadata, phenopacket_id=None) -> PPKt.Phenopacke
# `protobuf` devs must have removed `clear()` method
# This is a workaround to clear the list of external references.
_ = metadata.external_references.pop()
extref = PPKt.ExternalReference()
extref.id = self._citation.pmid
pm = self._citation.pmid.replace("PMID:", "")
extref.reference = f"https://pubmed.ncbi.nlm.nih.gov/{pm}"
extref.description = self._citation.title
metadata.external_references.append(extref)
extref202 = self._citation.to_external_reference()
metadata.external_references.append(extref202.to_message())
php.meta_data.CopyFrom(metadata)
return php

Expand Down
Loading