diff --git a/.github/workflows/generate_phenopackets.yml b/.github/workflows/generate_phenopackets.yml
deleted file mode 100644
index 81e2baf6..00000000
--- a/.github/workflows/generate_phenopackets.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-name: Generate phenopackets
-
-env:
- # We generate the phenopackets from this state
- # of the `phenopacket-store` repository.
- PHENOPACKET_STORE_TAG: pyphe-ci-wip
-# PHENOPACKET_STORE_TAG: c06a52e0 # State from Mar 5th, 2024
-
-on:
- push:
- branches: [ develop ]
- pull_request:
- branches: [ main, develop ]
-
-jobs:
- build:
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v2
-
- - name: Initialize Python 3.8
- uses: actions/setup-python@v4.3.1
- with:
- python-version: "3.8"
-
- - name: Install pyphetools & dependencies
- run: |
- python3 -m pip install --editable .
-
- - name: Install Jupyter and register current Python as kernel
- run: |
- python3 -m pip install ipykernel notebook
- python3 -m ipykernel install --user --name pp_env --display-name "Pyphetools (Python 3)"
-
- - name: Check out phenopacket-store
- run: |
- cd ..
- git clone https://github.com/monarch-initiative/phenopacket-store.git
- cd phenopacket-store
- git checkout $PHENOPACKET_STORE_TAG
-
- - name: Generate phenopackets
- run: |
- # phenopacket-store folder
- pps_dir=$(dirname $(pwd))/phenopacket-store
-
- # Prepare the notebooks to run
- declare -a fbn1_notebooks=(
- "FBN1/Comeglio_isolated_ectopia_lentis.ipynb"
- "FBN1/FBN1_lipodystrophy_Lin_2020.ipynb"
- "FBN1/Katzke_2002_Marfan.ipynb"
- "FBN1/LeGoff_FBN1_acromicric_and_geleophysic_dysplasia.ipynb"
- "FBN1/Loeys2010_stiff_skin_syndrome.ipynb"
- "FBN1/Palz_Marfan_2000.ipynb"
- "FBN1/Tiecke_2001_Marfan.ipynb"
- )
-
- # Run the notebooks
- for nb in ${fbn1_notebooks[@]}
- do
- nb_path=${pps_dir}/notebooks/${nb}
- printf "Running %s\n" ${nb_path}
- jupyter execute --kernel_name=pp_env ${nb_path}
- done
diff --git a/docs/developers/developers.md b/docs/developers/developers.md
index ba96e3fb..4e34e893 100644
--- a/docs/developers/developers.md
+++ b/docs/developers/developers.md
@@ -1,5 +1,35 @@
# For developers
+## Local Installation
+
+We recommend creating a local environment:
+
+```bash
+python3 -m venv venv
+source venv/bin/activate
+```
+
+and updating Python's `pip` tool:
+
+```bash
+python3 -m pip install --upgrade pip
+```
+
+You can then do a local/editable install:
+
+
+```bash
+python3 -m pip install --editable ".[test]"
+```
+
+After installation you should be able to run the test suite:
+
+```bash
+pytest
+```
+
+
+## Creating Phenopackets
pyphetools provides two main ways of creating phenopackets.
diff --git a/docs/img/deletion_error.png b/docs/img/deletion_error.png
new file mode 100644
index 00000000..3c476246
Binary files /dev/null and b/docs/img/deletion_error.png differ
diff --git a/docs/user-guide/python_notebook.md b/docs/user-guide/python_notebook.md
index 08b2696c..ca41d61b 100644
--- a/docs/user-guide/python_notebook.md
+++ b/docs/user-guide/python_notebook.md
@@ -15,27 +15,53 @@ import pyphetools
print(f"Using pyphetools version {pyphetools.__version__}")
```
-Import the [Human Phenotype Ontology (HPO)](https://hpo.jax.org/app/) hp.json file. Note that here we show code that assumes that the file is available in the enclosing directory. Update the ORCID identifier to your own [ORCID](https://orcid.org/){:target="_blank"} id. Indicate
-the location of the template file.
+### Set paths and identifiers
+Update the ORCID identifier to your own [ORCID](https://orcid.org/){:target="_blank"} id.
+Update the path to the template file.
```python
template = "input/BRD4_individuals.xlsx"
-hp_json = "../hp.json"
created_by = "0000-0002-0736-9199"
```
-import the template file. The code returns the pyphetools Individual objects, each of which contains all of the information needed to create a phenopacket and which here can be used if desired for debugging or further analysis. The cvalidator object is used to display quality assessment information.
+### Import the template file.
+The code returns the pyphetools Individual objects, each of which contains all of the information needed to create a phenopacket and which here can be used if desired for debugging or further analysis. The cvalidator object is used to display quality assessment information.
+Optionally, you can provide the location of the hp.json file using the ``hp_json`` argument. If no argument is provided, the hpo-toolkit library will download the latest version of
+hp.json to your user directory (.hpotk folder).
-```
-timporter = TemplateImporter(template=template, hp_json=hp_json, created_by=created_by)
+```python
+timporter = TemplateImporter(template=template, created_by=created_by)
individual_list, cvalidator = timporter.import_phenopackets_from_template()
```
-Display quality assessment data.
+
+### Structural variants
+pyphetools will automatically retrieve information about small variants coded as HGVS strings using the
+[VariantValidator](https://variantvalidator.org/) API. Until very recently, it was challenging to determine the exact positions of larger structural variants, and for this reason, publications often described them
+using phrases such as "whole gene deletion" or "EX9-12DEL". If such a string is found in the template file,
+pyphetools will emit an error such as the following.
+
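+![Error emitted by pyphetools for an unrecognized structural variant string](../img/deletion_error.png)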
+
+This can be fixed by passing an argument with a set of all strings that represent deletions (as in the following example), duplications, or inversions.
+
+```python title="Specifying structural variants"
+del_set = {"EX9-12DEL"}
+timporter = TemplateImporter(template=template, created_by=created_by)
+individual_list, cvalidator = timporter.import_phenopackets_from_template(deletions=del_set)
+```
+
+### Display quality assessment data.
```
qc = QcVisualizer(cohort_validator=cvalidator)
display(HTML(qc.to_summary_html()))
```
-Display summaries of each phenopacket. The command ``cvalidator.get_error_free_individual_list()``returns versions of the Individual objects
+### Display summaries of each phenopacket.
+
+The command ``cvalidator.get_error_free_individual_list()`` returns versions of the Individual objects
in which errors such as redundancies have been removed; this is the data that gets transformed into phenopackets.
diff --git a/docs/user-guide/template.md b/docs/user-guide/template.md
index b6a9fdcb..0404d6b7 100644
--- a/docs/user-guide/template.md
+++ b/docs/user-guide/template.md
@@ -1,7 +1,7 @@
# Data-Entry Template
pyphetools offers two main ways to encode clinical data as phenopackets. The library provides various functions to encode data found in
-typical supplementary materials of publications about cohorts. This option, which is covered in more detail in TODO is intended for those
+typical supplementary materials of publications about cohorts. This option, which is covered in more detail [here](../developers/developers.md), is intended for those
with skills in scripting with Python. Additionally, pyphetools can ingest data encoded in an Excel template that can be used without additional scripting.
The template can be ingested using a standardized notebook. Alternatively, users are invited to work with the HPO team to enter the data into the HPO database.
@@ -57,4 +57,15 @@ tcreator.create_template(disease_id=disease_id,
transcript=ofd1_transcript)
```
+The following snippet can be used as a "starter" by pasting it into the notebook.
+
+```python
+tcreator.create_template(disease_id="",
+                         disease_label="",
+                         gene_symbol="",
+                         HGNC_id="",
+                         transcript="")
+```
+
+
The script creates a file that can be opened in Excel and used for curation. Add additional HPO terms as necessary and remove terms that are not needed.
\ No newline at end of file
diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py
index a583542d..23fe80e2 100644
--- a/src/pyphetools/__init__.py
+++ b/src/pyphetools/__init__.py
@@ -4,7 +4,7 @@
from . import visualization
from . import validation
-__version__ = "0.9.77"
+__version__ = "0.9.85"
__all__ = [
"creation",
diff --git a/src/pyphetools/creation/case_template_encoder.py b/src/pyphetools/creation/case_template_encoder.py
index 2e7d9661..91634cad 100644
--- a/src/pyphetools/creation/case_template_encoder.py
+++ b/src/pyphetools/creation/case_template_encoder.py
@@ -388,7 +388,7 @@ def _parse_individual(self, row:pd.Series):
elif sex == "U":
sex = Constants.UNKNOWN_SEX_SYMBOL
else:
- raise ValueError(f"Unrecognized sex symbol: {sex}")
+ raise ValueError(f"Unrecognized sex symbol: {sex} for individual \"{individual_id}\"")
onset_age = data_items.get(AGE_OF_ONSET_FIELDNAME)
if onset_age is not None and isinstance(onset_age, str):
onset_age = PyPheToolsAge.get_age(onset_age)
diff --git a/src/pyphetools/creation/create_template.py b/src/pyphetools/creation/create_template.py
index 48500992..98d8c465 100644
--- a/src/pyphetools/creation/create_template.py
+++ b/src/pyphetools/creation/create_template.py
@@ -1,20 +1,31 @@
import os
+import typing
+
import pandas as pd
from collections import defaultdict
from .hpo_parser import HpoParser
+from .hp_term import HpTerm
from typing import List
import hpotk
from .case_template_encoder import REQUIRED_H1_FIELDS, REQUIRED_H2_FIELDS
class TemplateCreator:
- def __init__(self, hp_json:str, hp_cr_index:str=None) -> None:
- if not os.path.isfile(hp_json):
+ def __init__(
+ self,
+ hp_json: typing.Optional[str] = None,
+ hp_cr_index: typing.Optional[str] = None,
+ ) -> None:
+ if hp_json is None:
+ parser = HpoParser()
+ elif not os.path.isfile(hp_json):
raise FileNotFoundError(f"Could not find hp.json file at {hp_json}")
- if hp_cr_index:
+ else:
+ parser = HpoParser(hpo_json_file=hp_json)
+ if hp_cr_index is not None:
if not os.path.isfile(hp_cr_index):
raise FileNotFoundError(f"Could not find the FastHPOCR index file at {hp_cr_index}")
- parser = HpoParser(hpo_json_file=hp_json)
+
self._hpo_cr = parser.get_hpo_concept_recognizer(hp_cr_index=hp_cr_index)
self._hpo_ontology = parser.get_ontology()
self._all_added_hp_term_set = set()
@@ -60,7 +71,7 @@ def arrange_terms(self) -> List[hpotk.model.TermId]:
return hp_term_list
- def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_symbol:str, transcript:str, append=False):
+ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_symbol:str, transcript:str):
"""Create an Excel file that can be used to enter data as a pyphetools template
:param disease_id: an OMIM, MONDO, or other similar CURIE identifier
@@ -107,17 +118,56 @@ def create_template(self, disease_id:str, disease_label:str, HGNC_id:str, gene_s
df.loc[len(df)] = new_row
## Output as excel
fname = disease_id.replace(":", "_") + "_individuals.xlsx"
-
if os.path.isfile(fname):
- if not append:
- raise FileExistsError(f"Excel file '{fname}' already exists. Use 'append=True' \
- to append HPO terms to the existing file.")
- else:
- print(f"[WARNING] Appending to existing file '{fname}'. This might lead to duplicate HPO terms. \
- It's recommended to create a new file instead.")
-
+ raise FileExistsError(f"Excel file '{fname}' already exists.")
df.to_excel(fname, index=False)
- print(f"Write excel pyphetools template file to {fname}")
-
-
+ print(f"Wrote Excel pyphetools template file to {fname}")
+    def create_from_phenopacket(self, ppkt):
+        """Create a pyphetools Excel template from a single phenopacket.
+
+        This function is intended to accelerate the process of converting the LIRICAL phenopackets
+        to our current format and generally should not be used for new cases.
+        The template is written to ``<alphanumeric phenopacket id>_phenopacket_template.xlsx``.
+
+        :param ppkt: phenopacket whose ``id`` and ``phenotypic_features`` are read
+        :raises ValueError: if the required H1 and H2 header lists differ in length
+        """
+        observed_labels = set()
+        excluded_labels = set()
+        for pf in ppkt.phenotypic_features:
+            # register the term on the creator (presumably what arrange_terms() orders below)
+            self._all_added_hp_term_set.add(HpTerm(hpo_id=pf.type.id, label=pf.type.label))
+            if pf.excluded:
+                excluded_labels.add(pf.type.label)
+            else:
+                observed_labels.add(pf.type.label)
+        if len(REQUIRED_H1_FIELDS) != len(REQUIRED_H2_FIELDS):
+            raise ValueError("Header lists must have same length")
+        # Copy the required fields: appending to the imported lists directly would
+        # modify the module-level constants and leak HPO columns into later templates.
+        h1_headers = list(REQUIRED_H1_FIELDS)
+        h2_headers = list(REQUIRED_H2_FIELDS)
+        for hpt in self.arrange_terms():
+            h1_headers.append(hpt.label)
+            h2_headers.append(hpt.id)
+        df = pd.DataFrame(columns=h1_headers)
+        # the first row maps every H1 header to its H2 counterpart
+        df.loc[len(df)] = dict(zip(h1_headers, h2_headers))
+        # add one row with some of the data from the phenopacket
+        new_row = dict()
+        for header_field in h1_headers:
+            if header_field == "HPO":
+                new_row[header_field] = "na"
+            elif header_field in observed_labels:
+                new_row[header_field] = "observed"
+            elif header_field in excluded_labels:
+                new_row[header_field] = "excluded"
+            else:
+                new_row[header_field] = "?"
+        df.loc[len(df)] = new_row
+        # keep only the alphanumeric characters of the identifier for the file name
+        ppkt_id = "".join(e for e in ppkt.id if e.isalnum())
+        fname = ppkt_id + "_phenopacket_template.xlsx"
+        df.to_excel(fname, index=False)
+        print(f"Wrote excel pyphetools template file to {fname}")
diff --git a/src/pyphetools/creation/import_template.py b/src/pyphetools/creation/import_template.py
index 5aba0168..afcbd309 100644
--- a/src/pyphetools/creation/import_template.py
+++ b/src/pyphetools/creation/import_template.py
@@ -14,8 +14,8 @@ class TemplateImporter:
ORCID_regex = r"^\d{4}-\d{4}-\d{4}-\d{4}$"
def __init__(self,template:str,
- hp_json:str,
- created_by:str) -> None:
+ created_by:str,
+ hp_json:str=None) -> None:
"""Constructor
:param template: path to Excel template file
@@ -36,10 +36,13 @@ def __init__(self,template:str,
self._created_by = f"ORCID:{created_by}"
if not os.path.isfile(template):
raise FileNotFoundError(f"Could not find Excel template at {template}")
- if not os.path.isfile(hp_json):
+ if hp_json is not None and not os.path.isfile(hp_json):
raise FileNotFoundError(f"Could not find hp.json file at {hp_json}")
self._template = template
self._hp_json = hp_json
+        # We keep a list of modes of inheritance.
+        # Most genetic diseases have one, but diseases with both autosomal dominant and recessive inheritance, e.g., OMIM:620647, are not very uncommon
+ self._moi_list = list()
@staticmethod
def _get_data_from_template(df:pd.DataFrame) -> typing.Tuple[str,str,str,str,str]:
@@ -118,7 +121,8 @@ def import_phenopackets_from_template(self,
deletions:typing.Set[str]=set(),
duplications:typing.Set[str]=set(),
inversions:typing.Set[str]=set(),
- hemizygous:bool=False):
+ hemizygous:bool=False,
+ leniant_MOI:bool=False):
"""Import the data from an Excel template and create a collection of Phenopackets
Note that things will be completely automatic if the template just has HGNC encoding variants
@@ -132,6 +136,8 @@ def import_phenopackets_from_template(self,
:type inversions: (typing.Set[str], optional
:param hemizygous: Set this to true for X-chromosomal recessive conditions in which the genotype of affected males is hemizygous
:type hemizygous: bool
+ :param leniant_MOI: Do not check allelic requirements. Use this if the disease being curated has more than one MOI. This may require manually adding the "second" MOI in PhenoteFX
+ :type leniant_MOI: bool
:returns: tuple with individual list and CohortValidator that optionally can be used to display in a notebook
:rtype: typing.Tuple[typing.List[pyphetools.creation.Individual], pyphetools.validation.CohortValidator]
"""
@@ -172,8 +178,14 @@ def import_phenopackets_from_template(self,
print("Fix this error and then try again!")
sys.exit(1)
vman.add_variants_to_individuals(individuals, hemizygous=hemizygous)
- all_req = TemplateImporter._get_allelic_requirement(df)
- cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, allelic_requirement=all_req)
+ if leniant_MOI:
+            # We need this in case a disease has more than one mode of inheritance/allelic requirement. In this case, we cannot automatically distinguish
+            # between a mistake and a different MOI, and we need to check carefully by hand. For instance, OMIM:620647 is both AD and AR
+            # and we have data with biallelic and monoallelic variants.
+ cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1)
+ else:
+ all_req = TemplateImporter._get_allelic_requirement(df)
+ cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, allelic_requirement=all_req)
if cvalidator.n_removed_individuals() > 0:
print(f"Removed {cvalidator.n_removed_individuals()} individuals with unfixable errors")
ef_individuals = cvalidator.get_error_free_individual_list()
@@ -228,7 +240,8 @@ def filter_diseases(disease_id, ppkt_list):
def create_hpoa_from_phenopackets(self,
pmid:str,
- moi:str, ppkt_dir:str="phenopackets",
+ moi:str,
+ ppkt_dir:str="phenopackets",
target:str=None) -> pd.DataFrame:
"""Create an HPO annotation (HPOA) file from the current cohort
@@ -252,7 +265,7 @@ def create_hpoa_from_phenopackets(self,
if target is not None:
ppkt_list = TemplateImporter.filter_diseases(target, ppkt_list)
TemplateImporter.check_disease_entries(ppkt_list)
- builder = HpoaTableBuilder(phenopacket_list=ppkt_list)
+ builder = HpoaTableBuilder(phenopacket_list=ppkt_list, created_by=self._created_by)
if moi == "Autosomal dominant":
builder.autosomal_dominant(pmid)
elif moi == "Autosomal recessive":
diff --git a/src/pyphetools/creation/individual.py b/src/pyphetools/creation/individual.py
index 147c098e..28071a56 100644
--- a/src/pyphetools/creation/individual.py
+++ b/src/pyphetools/creation/individual.py
@@ -176,6 +176,12 @@ def set_disease(self, disease:Disease) -> None:
"""
self._disease = disease
+    def disease_count(self):
+        """Return 1 if a disease has been set for this individual, otherwise 0."""
+        if self._disease is not None:
+            return 1
+        return 0
+
def set_hpo_terms(self, cleansed_hpo_terms:List[HpTerm]):
"""
:param cleansed_hpo_terms: a list of HpTerm objects that has been cleansed by OntologyQC
@@ -213,7 +219,11 @@ def get_phenopacket_id(self, phenopacket_id=None) -> str:
ppkt_id = indi_id
else:
ppkt_id = phenopacket_id
- ppkt_id = ppkt_id.replace(" ", "_")
+ # strip non alphanumeric characters
+ ppkt_id = ''.join(e if e.isalnum() else "_" for e in ppkt_id)
+ ppkt_id = ppkt_id.replace("__", "_")
+ if ppkt_id.endswith("_"):
+ ppkt_id = ppkt_id[:-1]
return ppkt_id
def get_citation(self) -> Citation:
diff --git a/src/pyphetools/validation/content_validator.py b/src/pyphetools/validation/content_validator.py
index 748cd612..c3ae65bd 100644
--- a/src/pyphetools/validation/content_validator.py
+++ b/src/pyphetools/validation/content_validator.py
@@ -35,10 +35,11 @@ class ContentValidator(PhenopacketValidator):
:param allelic_requirement: used to check number of alleles and variants
:type allelic_requirement: AllelicRequirement
"""
- def __init__(self, min_hpo: int, allelic_requirement: AllelicRequirement = None) -> None:
+ def __init__(self, min_hpo: int, allelic_requirement: AllelicRequirement = None, minimum_disease_count:int=1) -> None:
super().__init__()
self._min_hpo = min_hpo
self._allelic_requirement = allelic_requirement
+ self._minimum_disease_count = minimum_disease_count
def validate_individual(self, individual:Individual) -> List[ValidationResult]:
@@ -63,7 +64,8 @@ def validate_individual(self, individual:Individual) -> List[ValidationResult]:
n_alleles += 2
elif gtype.label == "hemizygous": # "GENO:0000134"
n_alleles += 1
- return self._validate(pp_id=pp_id, n_hpo=n_pf, n_var=n_var, n_alleles=n_alleles)
+ disease_count = individual.disease_count()
+ return self._validate(pp_id=pp_id, n_hpo=n_pf, disease_count=disease_count, n_var=n_var, n_alleles=n_alleles)
@@ -110,11 +112,12 @@ def validate_phenopacket(self, phenopacket) -> List[ValidationResult]:
n_alleles += 2
elif gtype.label == "hemizygous": # "GENO:0000134"
n_alleles += 1
- return self._validate(pp_id=pp_id, n_hpo=n_pf, n_var=n_var, n_alleles=n_alleles)
+ disease_count = len(phenopacket.diseases)
+ return self._validate(pp_id=pp_id, n_hpo=n_pf, disease_count=disease_count, n_var=n_var, n_alleles=n_alleles)
- def _validate(self, pp_id:str, n_hpo:int, n_var:int=None, n_alleles:int=None):
+ def _validate(self, pp_id:str, n_hpo:int, disease_count:int, n_var:int=None, n_alleles:int=None):
"""
private method called by validate_individual or validate_phenopacket.
:param pp_id: phenopacket identifier
@@ -129,6 +132,9 @@ def _validate(self, pp_id:str, n_hpo:int, n_var:int=None, n_alleles:int=None):
validation_results = []
if n_hpo < self._min_hpo:
validation_results.append(ValidationResultBuilder(phenopacket_id=pp_id).insufficient_hpos(min_hpo=self._min_hpo, n_hpo=n_hpo).build())
+ if disease_count < self._minimum_disease_count:
+ val_result = ValidationResultBuilder(phenopacket_id=pp_id).insufficient_disease_count(disease_count, self._minimum_disease_count).build()
+ validation_results.append(val_result)
if self._allelic_requirement is None:
return validation_results
if self._allelic_requirement == AllelicRequirement.MONO_ALLELIC:
diff --git a/src/pyphetools/validation/validated_individual.py b/src/pyphetools/validation/validated_individual.py
index 90bddff6..225c9381 100644
--- a/src/pyphetools/validation/validated_individual.py
+++ b/src/pyphetools/validation/validated_individual.py
@@ -7,6 +7,11 @@
import hpotk
class ValidatedIndividual:
+ """
+ Class to coordinate quality assessment. In addition to ontology-based tests performed by the OntologyQC class,
+    we here test for a minimum number of HPO annotations, the presence of at least one disease, and the correct
+ number of alleles.
+ """
def __init__(self, individual:Individual) -> None:
self._individual = individual
@@ -14,7 +19,7 @@ def __init__(self, individual:Individual) -> None:
self._validation_errors = []
- def validate(self, ontology:hpotk.MinimalOntology, min_hpo:int, allelic_requirement:AllelicRequirement=None) -> None:
+ def validate(self, ontology:hpotk.MinimalOntology, min_hpo:int, allelic_requirement:AllelicRequirement=None, minimum_disease_count:int=1) -> None:
"""validate an Individual object for errors in the Ontology or the minimum number of HPO terms/alleles/variants
:param ontology: HPO object
@@ -29,7 +34,7 @@ def validate(self, ontology:hpotk.MinimalOntology, min_hpo:int, allelic_requirem
self._validation_errors.extend(qc_validation_results)
self._clean_terms = qc.get_clean_terms()
self._individual.set_hpo_terms(self._clean_terms)
- cvalidator = ContentValidator(min_hpo=min_hpo, allelic_requirement=allelic_requirement)
+ cvalidator = ContentValidator(min_hpo=min_hpo, allelic_requirement=allelic_requirement, minimum_disease_count=minimum_disease_count)
validation_results = cvalidator.validate_individual(individual=self._individual)
self._validation_errors.extend(validation_results)
# The following checks for remaining errors that would force us to remove the patient from the cohort
diff --git a/src/pyphetools/validation/validation_result.py b/src/pyphetools/validation/validation_result.py
index cec13a7f..0bd33cf9 100644
--- a/src/pyphetools/validation/validation_result.py
+++ b/src/pyphetools/validation/validation_result.py
@@ -32,7 +32,8 @@ class Category(IntEnum):
NOT_MEASURED = 8
OBSERVED_AND_EXCLUDED = 9
DUPLICATE = 10
- UNKNOWN = 11
+ INSUFFICIENT_DISEASE_COUNT = 11
+ UNKNOWN = 12
class ValidationResult:
@@ -247,6 +248,12 @@ def malformed_hpo_id(self, malformed_term:HpTerm):
self._category = Category.MALFORMED_ID
self._message = f"Malformed term {malformed_term.label} with invalid HPO id {malformed_term.id}"
return self
+
+    def insufficient_disease_count(self, observed_count:int, minimum_count:int):
+        """Record an ERROR for too few disease annotations and return self for chaining."""
+        self._error_level, self._category = ErrorLevel.ERROR, Category.INSUFFICIENT_DISEASE_COUNT
+        self._message = f"Individual had {observed_count} disease annotation(s) but the mininum required count is {minimum_count}"
+        return self
def malformed_hpo_label(self, malformed_label, valid_term:HpTerm):
self._error_level = ErrorLevel.ERROR
diff --git a/src/pyphetools/visualization/hpoa_table_creator.py b/src/pyphetools/visualization/hpoa_table_creator.py
index 8ce93b8c..24115e11 100644
--- a/src/pyphetools/visualization/hpoa_table_creator.py
+++ b/src/pyphetools/visualization/hpoa_table_creator.py
@@ -1,8 +1,9 @@
import os
import json
-import re
-import datetime
+import typing
from typing import List, Dict
+from datetime import datetime
+
from google.protobuf.json_format import Parse
@@ -111,9 +112,13 @@ class HpoaTableCreator:
14. biocuration
These should be tab separated fields.
"""
- DATE_REGEX = r"(\d{4}-\d{2}-\d{2})"
-
- def __init__(self, phenopacket_list, onset_term_d, moi_d, created_by:str=None) -> None:
+ def __init__(
+ self,
+ phenopacket_list,
+ onset_term_d,
+ moi_d,
+ created_by: str,
+ ) -> None:
"""Constructor
:param phenopacket_list: List of GA4GH phenopackets
@@ -122,6 +127,9 @@ def __init__(self, phenopacket_list, onset_term_d, moi_d, created_by:str=None)
:type: Dict[str, OnsetTerm]
:param moi_d: Dictionary with key PMID and value Mode of inheritance
"""
+ todays_date = datetime.now().strftime("%Y-%m-%d")
+ self._created_by = created_by
+ self._todays_date = f"[{todays_date}]"
self._phenopackets = phenopacket_list
self._all_hpo_d = self._get_all_hpos()
self._disease = self._get_disease() # only allow one disease, therefore this is a scalar value (string)
@@ -129,7 +137,6 @@ def __init__(self, phenopacket_list, onset_term_d, moi_d, created_by:str=None)
self._biocurator_d = self._get_biocurator_d()
self._onset_rows = self._add_age_of_onset_terms(onset_term_d)
self._moi_rows = self._add_moi_rows(moi_d)
- self._created_by = created_by
def _get_all_hpos(self) -> Dict[str,HpTerm]:
"""Get a dictionary of HpTerms, with key being HPO id and the value the corresponding HpTerm
@@ -190,19 +197,18 @@ def _get_biocurator_d(self):
:returns: dictionary with key=PMID, value=biocurator
:rtype: Dict[str,str]
"""
- biocurator_d = defaultdict()
+ biocurator_d = {}
for ppkt in self._phenopackets:
pmid = HpoaTableCreator.get_pmid(ppkt=ppkt)
mdata = ppkt.meta_data
created_by = mdata.created_by
if mdata.HasField("created"):
- created = mdata.created # created is a TimeStamp object
+ created = mdata.created # created is a TimeStamp object
created_dt = created.ToDatetime()
ymd = created_dt.strftime('%Y-%m-%d')
created_by = f"{created_by}[{ymd}]"
else:
- today = datetime.today().strftime('%Y-%m-%d')
- created_by = f"{created_by}[{today}]"
+ created_by = f"{created_by}{self._todays_date}"
biocurator_d[pmid] = created_by
return biocurator_d
@@ -249,7 +255,7 @@ def _add_moi_rows(self, moi_d) -> List[HpoaTableRow]:
# If we add an MOI outside of the template, then it will not have a PMID
# the template builder requires a created_by field which is designed for this.
if biocurator is None:
- biocurator = self._created_by
+ biocurator = f'{self._created_by}{self._todays_date}'
for hpterm in hpterm_list:
row = HpoaTableRow(disease=self._disease, hpo_term=hpterm, publication=pmid, biocurator=biocurator)
moi_rows.append(row)
@@ -292,7 +298,13 @@ def write_data_frame(self):
class HpoaTableBuilder:
- def __init__(self, indir=None, phenopacket_list=None, created_by:str=None, target=None) -> None:
+ def __init__(
+ self,
+ indir=None,
+ phenopacket_list=None,
+ created_by: typing.Optional[str] = None,
+ target: typing.Optional[str] = None,
+ ) -> None:
if indir is not None:
if not os.path.isdir(indir):
raise ValueError(f"indir argument {indir} must be directory!")
@@ -307,10 +319,11 @@ def __init__(self, indir=None, phenopacket_list=None, created_by:str=None, targe
ppack = Parse(json.dumps(jsondata), PPKt.Phenopacket())
self._phenopackets.append(ppack)
elif phenopacket_list is not None:
- if target is not None:
- self._phenopackets = HpoaTableBuilder.filter_diseases(target, phenopacket_list)
- else:
+ if target is None:
self._phenopackets = phenopacket_list
+ else:
+ self._phenopackets = HpoaTableBuilder.filter_diseases(target, phenopacket_list)
+
else:
raise ValueError("A valid value must be supplied for either \"indir\" or \"phenopacket_list\"")
self._onset_term_d = defaultdict(list)
diff --git a/test/test_cohort_validator.py b/test/test_cohort_validator.py
index 88f3bde6..a40d6fcf 100644
--- a/test/test_cohort_validator.py
+++ b/test/test_cohort_validator.py
@@ -1,7 +1,7 @@
import hpotk
import pytest
-from pyphetools.creation import Individual, HpTerm
+from pyphetools.creation import Disease,Individual, HpTerm
from pyphetools.validation import CohortValidator
@@ -17,6 +17,7 @@ def ind_a(self) -> Individual:
i.add_hpo_term(HpTerm(hpo_id="HP:0000490", label="Deeply set eye"))
i.add_hpo_term(HpTerm(hpo_id="HP:0011525", label="Iris nevus"))
i.add_hpo_term(HpTerm(hpo_id="HP:0000490", label="Deeply set eye"))
+ i.set_disease(disease=Disease(disease_id="OMIM:123456", disease_label="label"))
return i
@pytest.fixture
@@ -24,6 +25,7 @@ def ind_b(self) -> Individual:
i = Individual(individual_id="B")
i.add_hpo_term(HpTerm(hpo_id="HP:0000490", label="Deeply set eye"))
i.add_hpo_term(HpTerm(hpo_id="HP:0011525", label="Iris nevus"))
+ i.set_disease(disease=Disease(disease_id="OMIM:123456", disease_label="label"))
return i
def test_redundant_term(
@@ -74,7 +76,7 @@ def test_redundant_in_hierarchy(self, hpo: hpotk.Ontology):
individual_C.add_hpo_term(HpTerm(hpo_id="HP:0011525", label="Iris nevus"))
individual_C.add_hpo_term(HpTerm(hpo_id="HP:0430025", label="Bilateral facial palsy"))
individual_C.add_hpo_term(HpTerm(hpo_id="HP:0010628", label="Facial palsy"))
-
+ individual_C.set_disease(disease=Disease(disease_id="OMIM:123456", disease_label="label"))
cohort = [individual_C]
cvalidator = CohortValidator(cohort=cohort, ontology=hpo, min_hpo=1)
validated_individuals = cvalidator.get_validated_individual_list()
@@ -88,3 +90,16 @@ def test_redundant_in_hierarchy(self, hpo: hpotk.Ontology):
error = errors[0]
assert error.message == "Facial palsy is redundant because of Bilateral facial palsy"
+
+    def test_error_if_no_disease(self, hpo: hpotk.Ontology):
+        """An individual without a disease yields exactly one validation error."""
+        no_disease = Individual(individual_id="D")
+        no_disease.add_hpo_term(HpTerm(hpo_id="HP:0000490", label="Deeply set eye"))
+        cvalidator = CohortValidator(cohort=[no_disease], ontology=hpo, min_hpo=1)
+        validated = cvalidator.get_validated_individual_list()
+        assert len(validated) == 1
+        # the missing disease must be the only problem reported for this individual
+        errors = validated[0].get_validation_errors()
+        assert len(errors) == 1
+        expected_message = "Individual had 0 disease annotation(s) but the mininum required count is 1"
+        assert errors[0].message == expected_message
diff --git a/test/test_individual.py b/test/test_individual.py
new file mode 100644
index 00000000..3bbbc9b7
--- /dev/null
+++ b/test/test_individual.py
@@ -0,0 +1,58 @@
+import hpotk
+import pytest
+
+from pyphetools.creation import Citation, Disease,Individual, HpTerm
+
+
+
+# The API requires us to pass a column name but the column name will not be used in the tests
+TEST_COLUMN = "test"
+
+
+class TestIndividual:
+    """Tests for ``Individual.get_phenopacket_id``.
+
+    The fixtures use individual ids as they appear in publications (spaces, commas,
+    parentheses). In the expected identifiers these characters show up as single
+    underscores, the PMID comes first, and there is no trailing underscore.
+    """
+
+    # The three fixtures differ only in individual id and PMID, so they share one factory.
+    @staticmethod
+    def _individual(individual_id: str, pmid: str) -> Individual:
+        """Build an Individual with one HPO term, one disease and a citation for ``pmid``."""
+        citation = Citation(pmid=pmid, title="some title")
+        individual = Individual(individual_id=individual_id, citation=citation)
+        individual.add_hpo_term(HpTerm(hpo_id="HP:0000490", label="Deeply set eye"))
+        individual.set_disease(disease=Disease(disease_id="OMIM:123456", disease_label="label"))
+        return individual
+
+    @pytest.fixture
+    def ind_a(self) -> Individual:
+        """Individual whose id contains spaces and parentheses."""
+        return self._individual("Individual A (from previous publication)", pmid="PMID:1234")
+
+    @pytest.fixture
+    def ind_b(self) -> Individual:
+        """Individual whose id contains a comma followed by a space."""
+        return self._individual("Alves, 2019", pmid="PMID:36446582")
+
+    @pytest.fixture
+    def ind_c(self) -> Individual:
+        """Individual whose id mixes a comma, an underscore, spaces and parentheses."""
+        return self._individual("Low, 2016_P17 (10)", pmid="PMID:36446582")
+
+    def test_phenopacket_identifier(self, ind_a: Individual):
+        """The closing parenthesis does not leave a trailing underscore."""
+        expected_id = "PMID_1234_Individual_A_from_previous_publication"
+        assert ind_a.get_phenopacket_id() == expected_id
+
+    def test_phenopacket_id_B(self, ind_b: Individual):
+        """A comma followed by a space becomes one underscore."""
+        expected_id = "PMID_36446582_Alves_2019"
+        assert ind_b.get_phenopacket_id() == expected_id
+
+    def test_phenopacket_id_C(self, ind_c: Individual):
+        """The existing underscore is kept; the other separators become single underscores."""
+        expected_id = "PMID_36446582_Low_2016_P17_10"
+        assert ind_c.get_phenopacket_id() == expected_id
\ No newline at end of file
diff --git a/test/test_validation_result.py b/test/test_validation_result.py
index 86447642..e1e363b3 100644
--- a/test/test_validation_result.py
+++ b/test/test_validation_result.py
@@ -60,4 +60,10 @@ def test_conflict_is_not_unfixable(self):
self.assertEqual("CONFLICT", vresult.category)
self.assertFalse(vresult.is_unfixable_error())
+    def test_insufficient_disease(self):
+        # Too few disease annotations is reported as an ERROR that is not "unfixable".
+        result = ValidationResultBuilder("id3").insufficient_disease_count(observed_count=7, minimum_count=42).build()
+        self.assertEqual(result.error_level, "ERROR")
+        self.assertEqual(result.category, "INSUFFICIENT_DISEASE_COUNT")
+        self.assertFalse(result.is_unfixable_error())