Skip to content

Commit

Permalink
Merge pull request #134 from monarch-initiative/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
pnrobinson authored Oct 3, 2024
2 parents cc08bd2 + 064f8f8 commit dae5dd0
Show file tree
Hide file tree
Showing 18 changed files with 432 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ from tabular data such as databases or supplemental files found in the medical l
This documentation contains information about

- How to use the [Excel template](user-guide/excel.md) to code clinical data
- How to use [pyphetools classes](user-guide/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets
- How to use [pyphetools classes](tabular/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets
- Information for [developers](developers/developers.md)
- A description of the pyphetools [API](api/overview.md)

Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/excel.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

We have designed a format for Excel templates that can be used to quickly and efficiently generate collections of Phenopackets. This is currently the preferred way for clinicians and translational researchers to contribute to this project. The [pyphetools](https://github.com/monarch-initiative/pyphetools){:target="_blank"} library provides other means for bioinformaticians (please ask us).

The template can be downloaded [here](../_static/template.xlsx){:target="_blank"}.
The template file is generated for each disease as described in [template](template.md).

# A format for cohort descriptions in excel

Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/variant_notation.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Variant Notation

We recommend that users choose one transcript for all HGVS variant descriptions in a project. In general, the most clinicallz relevant transcript should be chosen.
We recommend that users choose one transcript for all HGVS variant descriptions in a project. In general, the most clinically relevant transcript should be chosen.


### Choosing the reference transcript for a project
Expand Down
2 changes: 1 addition & 1 deletion src/pyphetools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from . import validation


__version__ = "0.9.105"
__version__ = "0.9.108"


__all__ = [
Expand Down
18 changes: 18 additions & 0 deletions src/pyphetools/creation/create_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
:param gene_symbol: corresponding gene symbol, e.g., FBN1
:param transcript: transcript to be used for the HGVS nomenclature. Must be refseq with version number
"""
self._qc_disease_information(disease_id=disease_id, disease_label=disease_label)
H1_Headers = REQUIRED_H1_FIELDS
H2_Headers = REQUIRED_H2_FIELDS
if len(H1_Headers) != len(H2_Headers):
Expand Down Expand Up @@ -123,6 +124,23 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
df.to_excel(fname, index=False)
print(f"Wrote Excel pyphetools template file to {fname}")

def _qc_disease_information(self,
disease_id:str,
disease_label:str) -> None:
"""
Check some common errors in data entry and raise an exception if the disease ID and label are not correct
"""
if ":" not in disease_id:
raise ValueError(f"Malformed disease id-not CURIE: \"{disease_id}\"")
fields = disease_id.split(":")
if len(fields) != 2:
raise ValueError(f"Malformed disease id-only one colon allowed: \"{disease_id}\"")
disease_db = fields[0]
if disease_db != "OMIM" and disease_db != "MONDO":
raise ValueError(f"Malformed disease id-did not recognize disease database: \"{disease_id}\"")
if "\t" in disease_label:
raise ValueError(f"Malformed disease label: \”{disease_label}\"")

def create_from_phenopacket(self, ppkt):
"""
create pyphetools templates from an individual phenopacket.
Expand Down
3 changes: 3 additions & 0 deletions src/pyphetools/creation/disease.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def __init__(self, disease_id:str, disease_label):
raise ValueError(f"Malformed disease identifier with white space: \"{disease_id}\"")
if disease_label.startswith(" ") or disease_label.endswith(" "):
raise ValueError(f"Malformed disease label (starts/ends with whitespace): \"{disease_label}\"")
# occasionally, copy-paste error leads to this kind of malformed label: "Developmental and epileptic encephalopathy 50\t616457\tAR\t3\t"
if "\t" in disease_label:
raise ValueError(f"Malformed disease label (contains tabs): \"{disease_label}\"")
self._id = disease_id
self._label = disease_label

Expand Down
30 changes: 27 additions & 3 deletions src/pyphetools/creation/measurements.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,19 @@
pg_per_l = OntologyClass202(id="UCUM:pg/L", label="picogram per liter")
pg_per_ml = OntologyClass202(id="UCUM:pg/mL", label="picogram per milliliter")
nmol_per_l = OntologyClass202(id="UCUM:nmol/L", label="nanomole per liter")
mmol_per_l= OntologyClass202(id="UCUM:mmol/L", label="millimole per liter")
percent = OntologyClass202(id="UCUM:%", label="percent")


class Measurements:


"""
Convenience class with static methods to create Measurement objects for common units.
"""


@staticmethod
def _with_reference_range(assay: OntologyClass202,
unit: OntologyClass202,
unit: OntologyClass202,
value: float,
low: float,
high: float) -> Measurement202:
Expand Down Expand Up @@ -93,6 +96,27 @@ def nanomole_per_liter(code: str,
high: float = None) -> Measurement202:
assay = OntologyClass202(id=code, label=label)
return Measurements._from_assay_and_values(assay=assay, unit=nmol_per_l, value=concentration, low=low, high=high)

@staticmethod
def millimole_per_liter(code: str,
                        label: str,
                        concentration: float,
                        low: float = None,
                        high: float = None) -> Measurement202:
    """
    Create a measurement whose unit is millimole per liter (UCUM:mmol/L).

    :param code: identifier of the assay; used as the id of the assay OntologyClass202
    :param label: label of the assay; used as the label of the assay OntologyClass202
    :param concentration: measured value
    :param low: optional lower bound, passed through unchanged (presumably the reference range - confirm in _from_assay_and_values)
    :param high: optional upper bound, passed through unchanged (presumably the reference range - confirm in _from_assay_and_values)
    :returns: the object produced by Measurements._from_assay_and_values
    """
    return Measurements._from_assay_and_values(
        assay=OntologyClass202(id=code, label=label),
        unit=mmol_per_l,
        value=concentration,
        low=low,
        high=high,
    )

@staticmethod
def percent(code: str,
            label: str,
            concentration: float,
            low: float = None,
            high: float = None) -> Measurement202:
    """
    Create a measurement whose unit is percent (UCUM:%).

    :param code: identifier of the assay; used as the id of the assay OntologyClass202
    :param label: label of the assay; used as the label of the assay OntologyClass202
    :param concentration: measured value, expressed in percent
    :param low: optional lower bound, passed through unchanged (presumably the reference range - confirm in _from_assay_and_values)
    :param high: optional upper bound, passed through unchanged (presumably the reference range - confirm in _from_assay_and_values)
    :returns: the object produced by Measurements._from_assay_and_values
    """
    # Inside this body the bare name ``percent`` is looked up in the module namespace,
    # so it refers to the module-level UCUM unit constant, not to this static method.
    percent_unit = percent
    return Measurements._from_assay_and_values(
        assay=OntologyClass202(id=code, label=label),
        unit=percent_unit,
        value=concentration,
        low=low,
        high=high,
    )






Expand Down
1 change: 1 addition & 0 deletions src/pyphetools/creation/promoter_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def to_variant_interpretation(self, acmg=None) -> VariantInterpretation202:
gene_descriptor = GeneDescriptor202(value_id=self._hgnc_id, symbol=self._gene_symbol)
vdescriptor = VariationDescriptor202(id=self._variant_id,
molecule_context=MoleculeContext202.genomic,
description=self._description,
gene_context=gene_descriptor,
label=self._description,
structural_type=self._sequence_ontology_term)
Expand Down
37 changes: 26 additions & 11 deletions src/pyphetools/creation/variant_manager.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pickle
import pandas as pd
from typing import List, Dict
from typing import List
from collections import defaultdict
from .individual import Individual
from .variant_validator import VariantValidator
Expand Down Expand Up @@ -48,6 +48,21 @@ class VariantManager:
shows the other variants. These can be used to create chromosomal deletions, duplications, and inversions. Finally,
the class can be used to add variants to a list of Individual objects.
If the Excel template is used, this class will be called internally and users do not need to use the code. If the
data is ingested manually, the class can be used as follows.
gnas_symbol = "GNAS"
gnas_id = "HGNC:4392"
gnas_MANE_transcript = "NM_000516.7"
vmanager = VariantManager(df=df,
individual_column_name="individual",
transcript=gnas_MANE_transcript,
gene_id=gnas_id,
gene_symbol=gnas_symbol,
allele_1_column_name="allele_1")
See [variant_manager](https://monarch-initiative.github.io/pyphetools/api/creation/variant_manager/) for more information.
:param df: DataFrame representing the input data
:type df: pd.DataFrame
:param individual_column_name: Name of the individual (patient) column
Expand Down Expand Up @@ -99,13 +114,13 @@ def __init__(self,
self._create_variant_d(overwrite)


def _format_pmid_id(self, identifier, pmid):
def _format_pmid_id(self, identifier, pmid) -> str:
if pmid is not None:
return f"{pmid}_{identifier}"
else:
return identifier

def _get_identifier_with_pmid(self, row:pd.Series):
def _get_identifier_with_pmid(self, row:pd.Series) -> str:
"""Get an identifier such as PMID_33087723_A2 for a daa row with PMID:33087723 and identifier within that publication A2
Identifiers such as P1 are commonly used and there is a risk of a clash with collections of phenopackets from various papers.
Expand All @@ -118,7 +133,7 @@ def _get_identifier_with_pmid(self, row:pd.Series):
else:
return individual_id

def _create_variant_d(self, overwrite):
def _create_variant_d(self, overwrite) -> None:
"""
Creates a dictionary with all HGVS variants, and as a side effect creates a set with variants that
are not HGVS and need to be mapped manually. This method has the following effects
Expand Down Expand Up @@ -182,9 +197,9 @@ def _create_variant_d(self, overwrite):
self._unmapped_alleles.add(v) # This allows us to use the chromosomal mappers.
write_variant_pickle(name=self._gene_symbol, my_object=self._var_d)

def code_as_chromosomal_deletion(self, allele_set):
def code_as_chromosomal_deletion(self, allele_set) -> None:
"""
Code as Structural variants - chromosomal deletion (to be added to self._var_d)
Code variants with the identifiers in "allele_set" as Structural variants (chromosomal deletion)
:param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal deletion)
"""
# first check that all of the alleles are in self._unmapped_alleles
Expand All @@ -200,9 +215,9 @@ def code_as_chromosomal_deletion(self, allele_set):
self._unmapped_alleles.remove(allele)
self._var_d[allele] = var

def code_as_chromosomal_duplication(self, allele_set):
def code_as_chromosomal_duplication(self, allele_set) -> None:
"""
Code as Structural variants - chromosomal duplication (to be added to self._var_d)
Code variants with the identifiers in "allele_set" as Structural variants (chromosomal duplication)
:param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal duplication)
"""
# first check that all of the alleles are in self._unmapped_alleles
Expand All @@ -217,7 +232,7 @@ def code_as_chromosomal_duplication(self, allele_set):

def code_as_chromosomal_inversion(self, allele_set) -> None:
"""
Code as Structural variants - chromosomal inversion (to be added to self._var_d)
Code variants with the identifiers in "allele_set" as Structural variants (chromosomal inversion)
:param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal inversion)
"""
# first check that all of the alleles are in self._unmapped_alleles
Expand All @@ -232,8 +247,8 @@ def code_as_chromosomal_inversion(self, allele_set) -> None:

def code_as_chromosomal_translocation(self, allele_set) -> None:
"""
Code as Structural variants - chromosomal translocation (to be added to self._var_d)
:param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal inversion)
Code variants with the identifiers in "allele_set" as Structural variants (chromosomal translocation)
:param allele_set: Set of alleles (strings) for coding as Structural variants (chromosomal translocation)
"""
# first check that all of the alleles are in self._unmapped_alleles
if not allele_set.issubset(self._unmapped_alleles):
Expand Down
6 changes: 6 additions & 0 deletions src/pyphetools/pp/v202/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,16 @@ def iso_to_days(iso_age:str) -> int:
y = age.find("Y")
if y != -1:
days = days + int(365.25*int(age[:y]))
if age.endswith("Y"):
return days
age = age[y+1:]
m = age.find("M")
if m != -1:
days = days + int(30.436875*int(age[:m]))
# if the string ends with M, e.g., P3Y2M, then do not look for days
if age.endswith("M"):
return days
# if not, advance the string pointer so we can parse the days
age = age[m+1:]
d = age.find("D")
if d != -1:
Expand Down
1 change: 1 addition & 0 deletions src/pyphetools/visualization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .detailed_suppl_table import DetailedSupplTable
from .disease_specific_hpo_counter import DiseaseSpecificHpoCounter, HpoCohortCount
from .focus_count_table import FocusCountTable
from .hpoa_table_creator import HpoaTableCreator, HpoaTableBuilder
from .individual_table import IndividualTable
Expand Down
14 changes: 6 additions & 8 deletions src/pyphetools/visualization/counted_hpo_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,26 @@

class CountedHpoTerm:
"""
This class intends to keep track of the frequzency of an HPO term in a cohort
This class intends to keep track of the frequency of an HPO term in a cohort
The denominator refers to the total number of observed counts and the numerator is equal to observed+excluded, i.e., the total count for which some information was available.
"""
def __init__(self, hpo_term, numerator, denominator):
if not isinstance(numerator, int):
raise ValueError(f"Malformed numerator (must be integer but was {numerator})")
if not isinstance(denominator, int):
raise ValueError(f"Malformed denominator (must be integer but was {denominator})")
self._onset_term_id = hpo_term.id
self._onset_term_label = hpo_term.label
self._hpo_term_id = hpo_term.id
self._hpo_term_label = hpo_term.label
self._num = numerator
self._denom = denominator

@property
def id(self):
return self._onset_term_id
return self._hpo_term_id

@property
def label(self):
return self._onset_term_label
return self._hpo_term_label

def has_frequency(self):
return self._num is not None and self._denom is not None
Expand All @@ -34,9 +35,6 @@ def numerator(self):
def denominator(self):
return self._denom

def increment_numerator(self):
self._num += 1


class CohortTermCounter:

Expand Down
48 changes: 45 additions & 3 deletions src/pyphetools/visualization/detailed_suppl_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,44 @@
from .hpo_category import HpoCategorySet
from hpotk.model import TermId
from collections import defaultdict
import hpotk
from enum import Enum


ALL_ROOT = TermId.from_curie("HP:0000001")
PHENOTYPIC_ABNORMALITY_ROOT = TermId.from_curie("HP:0000118")

class HpoStatus(Enum):
    """
    Status of an HPO term for one individual.

    The value of each member is a two-element tuple: (ordinal number, text label).
    """
    OBSERVED = (1, "observed")
    EXCLUDED = (2, "excluded")
    NOT_AVAILABLE = (3, "na")


class HpoTableCell:
    """
    Represents one cell of the detailed table

    A cell stores an HPO term identifier together with the status of that term.
    """
    def __init__(self,
                 hpo_term_id:hpotk.TermId,
                 status: HpoStatus
                 ) -> None:
        """
        :param hpo_term_id: identifier of the HPO term represented by this cell
        :param status: status of the term (observed, excluded, or not available)
        """
        self._hpo_id = hpo_term_id
        self._status = status

    def to_cell(self) -> str:
        """
        :returns: text label of the status ("observed", "excluded", or "na")
        """
        # HpoStatus members are declared as ``NAME = <number>, <label>``, so ``value`` is a
        # (number, label) tuple. Return only the label rather than the raw tuple.
        return self._status.value[1]

    def to_cell_pm(self) -> str:
        """
        Alternative plus-minus (pm) notation

        :returns: "+" for an observed term, "-" for an excluded term, and "na" otherwise
        """
        if self._status == HpoStatus.OBSERVED:
            return "+"
        elif self._status == HpoStatus.EXCLUDED:
            return "-"
        else:
            return "na"



class DetailedSupplTable:
Expand All @@ -19,7 +53,9 @@ class DetailedSupplTable:
are shown as columns.
"""

def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None) -> None:
def __init__(self,
patient_list: typing.List[PPKt.Phenopacket],
hp_json:str=None) -> None:
"""
:param patient_d: dictionary of patients to display
:type patient_d: map with key string and value SimplePatient
Expand Down Expand Up @@ -47,6 +83,8 @@ def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None
# key is a string such as HP:0001234, value is an HpTerm object
# we need to convert it to an object from hpo-toolkit because get_ancestors returns HpTerm objects
hp_termid = TermId.from_curie(hp_id)
if not self._hp_ontology.graph.is_descendant_of(hp_termid, PHENOTYPIC_ABNORMALITY_ROOT):
continue # do not count terms that are not phenotypes
ancs = self._hp_ontology.graph.get_ancestors(hp_termid)
anc_set.add(hp_termid)
anc_set.update(ancs)
Expand All @@ -59,10 +97,14 @@ def __init__(self, patient_list: typing.List[PPKt.Phenopacket], hp_json:str=None
variants = pat.get_variant_list()
for var in variants:
var_d[var] += 1

# TODO figure out what to do with biallelic
self._hpo_category_set = HpoCategorySet(ontology=hp_ontology)

def _calculate_table(self, patient_list: typing.List[PPKt.Phenopacket], ) -> typing.List[typing.List[str]]:
# NOTE(review): this method looks unfinished. It is annotated as returning a list of
# string rows, but the visible body has no return statement, so a call yields None.
# NOTE(review): the ``patient_list`` argument is never read; the loop iterates over
# self._simple_patient_list instead - confirm which one is intended.
for pat in self._simple_patient_list:
# presumably the observed HPO terms of this patient (judging by the method name);
# the result is assigned but not used in the visible code - TODO confirm
hpo_terms = pat.get_observed_hpo_d()
anc_set = set() # graph with ancestors induced by all terms of the patient



def _get_table(self, counts_d):
"""
Expand Down
Loading

0 comments on commit dae5dd0

Please sign in to comment.